add wxMBConvStrictUTF8 class implementing just UTF-8 conversion, without support for PUA/octal mappings and use it for wxConvUTF8 as it's simpler and more efficient (~20% faster)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2007-07-24 15:01:10 +00:00
parent 6989272940
commit 0286d08d14
2 changed files with 289 additions and 6 deletions
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -257,11 +257,31 @@ public:
 // wxMBConvUTF8 (for conversion using UTF8 encoding)
 // ----------------------------------------------------------------------------
-class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
+// this is the real UTF-8 conversion class, it has to be called "strict UTF-8"
 // for compatibility reasons: the wxMBConvUTF8 class below also supports lossy
 // conversions if it is created with non default options
 // Converter implementing plain UTF-8 only: it has no options and none of the
 // PUA/octal mappings for invalid input offered by wxMBConvUTF8.
 class WXDLLIMPEXP_BASE wxMBConvStrictUTF8 : public wxMBConv
 {
 public:
    // compiler-generated default ctor and other methods are ok
    // Convert UTF-8 bytes in src to wide characters in dst. Returns the number
    // of wide characters written (or needed, if dstLen is 0), or wxCONV_FAILED
    // on invalid input or if dst is too small. If srcLen is wxNO_LEN, src is
    // NUL-terminated and the terminating NUL is converted and counted too.
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    // Convert wide characters in src to UTF-8 bytes in dst. Returns the number
    // of bytes written (or needed, if dstLen is 0), or wxCONV_FAILED if a
    // character can't be encoded or if dst is too small.
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    // Return a new heap-allocated converter of the same type (this class has
    // no state to copy).
    virtual wxMBConv *Clone() const { return new wxMBConvStrictUTF8(); }
 #if wxUSE_UNICODE_UTF8
    // NB: this class has no mapping options and always performs plain UTF-8
    //     conversion, so it can unconditionally report itself as UTF-8
    //     (converters using the PUA/octal mappings are not, strictly
    //     speaking, UTF-8)
    virtual bool IsUTF8() const { return true; }
 #endif
 };
 class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConvStrictUTF8
 {
 public:
    // FIXME-UTF8: split this class into multiple classes, one strict and
    //             other lossy (PUA, OCTAL mappings)
    enum
    {
        MAP_INVALID_UTF8_NOT = 0,
@@ -470,7 +490,7 @@ WX_DECLARE_GLOBAL_CONV(wxMBConv, wxConvLibc)
 WX_DECLARE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1)
 #define wxConvISO8859_1 wxGet_wxConvISO8859_1()
-WX_DECLARE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8)
+WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
 #define wxConvUTF8 wxGet_wxConvUTF8()
 WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -714,8 +714,268 @@ static wxUint32 utf8_max[]=
 const wxUint32 wxUnicodePUA = 0x100000;
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 // this table gives the length of the UTF-8 encoding from its first character:
 // the entry indexed by a lead byte is the total number of bytes (1..4) in the
 // sequence it starts, or 0 if the byte can't start a valid sequence
 unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
    // these are invalid as lead bytes: 80..BF are continuation bytes (10xxxxxx)
    // and C0,C1 could only start overlong 2-byte encodings of U+0000..U+007F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1
    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4
    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 };
 size_t
 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
 {
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;
    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;
                    *out = L'\0';
                }
                written++;
            }
            return written;
        }
        unsigned char c = *p;
        unsigned len = tableUtf8Lengths[c];
        if ( !len )
            break;
        if ( srcLen < len ) // the test works for wxNO_LEN too
            break;
        if ( srcLen != wxNO_LEN )
            srcLen -= len;
        if ( out && !dstLen-- )
            break;
        //   Char. number range   |        UTF-8 octet sequence
        //      (hexadecimal)     |              (binary)
        //  ----------------------+---------------------------------------------
        //  0000 0000 - 0000 007F | 0xxxxxxx
        //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
        //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
        //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        //
        //  Code point value is stored in bits marked with 'x', lowest-order bit
        //  of the value on the right side in the diagram above.
        //                                                       (from RFC 3629)
        // mask to extract lead byte's value ('x' bits above), by sequence length:
        static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
        // mask and value of lead byte's most significant bits, by length:
        static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
        static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
        len--; // it's more convenient to work with 0-based length here
        // extract the lead byte's value bits:
        if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
            break;
        wxUint32 code = c & leadValueMask[len];
        // all remaining bytes, if any, are handled in the same way regardless of
        // sequence's length:
        for ( ; len; --len )
        {
            c = *++p;
            if ( (c & 0xC0) != 0x80 )
                return wxCONV_FAILED;
            code <<= 6;
            code |= c & 0x3F;
        }
 #ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            if ( out )
                out++;
            written++;
        }
 #else // !WC_UTF16
        if ( out )
            *out = code;
 #endif // WC_UTF16/!WC_UTF16
        if ( out )
            out++;
        written++;
    }
    return wxCONV_FAILED;
 }
 // Convert the wide character text in src to UTF-8.
 //
 // dst, dstLen: output buffer and its size in bytes; if dstLen is 0 nothing is
 //     written and only the required length is computed
 // src, srcLen: input and its length in wchar_t units; if srcLen is wxNO_LEN
 //     the input is NUL-terminated and a terminating NUL is written to the
 //     output (and counted) as well
 //
 // Returns the number of bytes written (or needed), or wxCONV_FAILED if a
 // character can't be encoded or if the output buffer is too small.
 size_t
 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
 {
    char *out = dstLen ? dst : NULL;
    size_t written = 0;
    for ( const wchar_t *wp = src; ; wp++ )
    {
        if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;
                    *out = '\0';
                }
                written++;
            }
            return written;
        }
        // NB: the count must only be decremented after the test above:
        //     decrementing it when it is already 0 would wrap it around to
        //     wxNO_LEN and a trailing NUL would be wrongly appended to the
        //     result of an explicit length conversion
        if ( srcLen != wxNO_LEN )
            srcLen--;
        wxUint32 code;
 #ifdef WC_UTF16
        // cast is ok for WC_UTF16
        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
        {
            // a surrogate pair uses up a second input unit: account for it
            // and fail if it lies beyond the end of explicit length input
            if ( srcLen != wxNO_LEN )
            {
                if ( !srcLen )
                    break;
                srcLen--;
            }
            // skip the next char too as we decoded a surrogate
            wp++;
        }
 #else // wchar_t is UTF-32
        code = *wp & 0x7fffffff;
 #endif
        unsigned len;
        if ( code <= 0x7F )
        {
            len = 1;
            if ( out )
            {
                if ( dstLen < len )
                    break;
                out[0] = (char)code;
            }
        }
        else if ( code <= 0x07FF )
        {
            len = 2;
            if ( out )
            {
                if ( dstLen < len )
                    break;
                // NB: this line takes 6 least significant bits, encodes them as
                // 10xxxxxx and discards them so that the next byte can be encoded:
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xC0 | code;
            }
        }
        else if ( code <= 0xFFFF )
        {
            // NB: the upper bound is inclusive, U+FFFF itself still fits in 3
            //     bytes and encoding it using 4 would be an (invalid) overlong
            //     sequence
            len = 3;
            if ( out )
            {
                if ( dstLen < len )
                    break;
                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xE0 | code;
            }
        }
        else if ( code <= 0x10FFFF )
        {
            len = 4;
            if ( out )
            {
                if ( dstLen < len )
                    break;
                out[3] = 0x80 | (code & 0x3F);  code >>= 6;
                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xF0 | code;
            }
        }
        else
        {
            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
            break;
        }
        if ( out )
        {
            out += len;
            dstLen -= len;
        }
        written += len;
    }
    // we only get here if an error occurs during encoding
    return wxCONV_FAILED;
 }
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
    size_t len = 0;
    while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1045,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                else
                {
 #ifdef WC_UTF16
-                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
+                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
@@ -865,6 +1125,9 @@ static inline bool isoctal(wchar_t wch)
 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 {
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
    size_t len = 0;
    while (*psz && ((!buf) || (len < n)))
@@ -2903,7 +3166,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
 #endif
-WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
+WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));