Add wxMBConvStrictUTF8 class implementing just UTF-8 conversion, without support for PUA/octal mappings, and use it for wxConvUTF8 as it's simpler and more efficient (~20% faster)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin
2007-07-24 15:01:10 +00:00
parent 6989272940
commit 0286d08d14
2 changed files with 289 additions and 6 deletions

View File

@@ -714,8 +714,268 @@ static wxUint32 utf8_max[]=
// Start of a range of 256 code points beginning at U+100000.
// NOTE(review): going by the names, this is presumably the private use area
// range used by the PUA mapping mode of wxMBConvUTF8 -- confirm against the
// code using these constants.
const wxUint32 wxUnicodePUA = 0x100000;
// Start of the range plus 256; whether users treat this as an inclusive or
// exclusive bound is not visible here -- TODO confirm.
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
// Lookup table indexed by the first byte of a UTF-8 sequence and yielding the
// total length of that sequence in bytes; 0 marks a byte which can't start a
// valid sequence. Each row below covers 16 consecutive byte values.
unsigned char tableUtf8Lengths[256] = {
    // 0x00..0x7F: ASCII, always encoded as a single byte
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x00
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x10
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x20
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x30
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x40
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x50
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x60
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,    // 0x70

    // 0x80..0xBF: not valid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    // 0x80
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    // 0x90
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,    // 0xB0

    // 0xC0 and 0xC1 are invalid too, 0xC2..0xDF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,    // 0xC0
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,    // 0xD0

    // 0xE0..0xEF start three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,    // 0xE0

    // 0xF0..0xF4 start four-byte sequences; 0xF5..0xFF are invalid as they
    // would start 5- or 6-byte sequences or encode code points above
    // U+10FFFF, which RFC 3629 doesn't allow
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0     // 0xF0
};
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
wchar_t *out = dstLen ? dst : NULL;
size_t written = 0;
if ( srcLen == wxNO_LEN )
srcLen = strlen(src) + 1;
for ( const char *p = src; ; p++ )
{
if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
{
// all done successfully, just add the trailing NULL if we are not
// using explicit length
if ( srcLen == wxNO_LEN )
{
if ( out )
{
if ( !dstLen )
break;
*out = L'\0';
}
written++;
}
return written;
}
unsigned char c = *p;
unsigned len = tableUtf8Lengths[c];
if ( !len )
break;
if ( srcLen < len ) // the test works for wxNO_LEN too
break;
if ( srcLen != wxNO_LEN )
srcLen -= len;
if ( out && !dstLen-- )
break;
// Char. number range | UTF-8 octet sequence
// (hexadecimal) | (binary)
// ----------------------+---------------------------------------------
// 0000 0000 - 0000 007F | 0xxxxxxx
// 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
// 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
// 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// Code point value is stored in bits marked with 'x', lowest-order bit
// of the value on the right side in the diagram above.
// (from RFC 3629)
// mask to extract lead byte's value ('x' bits above), by sequence length:
static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
// mask and value of lead byte's most significant bits, by length:
static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
len--; // it's more convenient to work with 0-based length here
// extract the lead byte's value bits:
if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
break;
wxUint32 code = c & leadValueMask[len];
// all remaining bytes, if any, are handled in the same way regardless of
// sequence's length:
for ( ; len; --len )
{
c = *++p;
if ( (c & 0xC0) != 0x80 )
return wxCONV_FAILED;
code <<= 6;
code |= c & 0x3F;
}
#ifdef WC_UTF16
// cast is ok because wchar_t == wxUint16 if WC_UTF16
if ( encode_utf16(code, (wxUint16 *)out) == 2 )
{
if ( out )
out++;
written++;
}
#else // !WC_UTF16
if ( out )
*out = code;
#endif // WC_UTF16/!WC_UTF16
if ( out )
out++;
written++;
}
return wxCONV_FAILED;
}
// Convert wide characters to UTF-8.
//
// Parameters:
//  dst     output buffer, not used if dstLen is 0
//  dstLen  size of dst in bytes; if it is 0, nothing is written and only the
//          required output length is computed
//  src     the wide character input
//  srcLen  length of src in wchar_t elements or wxNO_LEN if src is
//          NUL-terminated, in which case a terminating NUL is also appended
//          to (and counted in) the output
//
// Returns the number of bytes written (or needed if dstLen is 0) or
// wxCONV_FAILED if the input contains a value above U+10FFFF or the output
// doesn't fit into dst.
size_t
wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
                              const wchar_t *src, size_t srcLen) const
{
    char *out = dstLen ? dst : NULL;
    size_t written = 0;

    for ( const wchar_t *wp = src; ; wp++ )
    {
        if ( srcLen == wxNO_LEN ? !*wp : !srcLen )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = '\0';
                }

                written++;
            }

            return written;
        }

        // the explicit length must only be decremented while it is still
        // positive: decrementing it inside the test above would wrap it
        // around to (size_t)-1 once the input is exhausted, making it
        // indistinguishable from wxNO_LEN and adding a NUL nobody asked for
        if ( srcLen != wxNO_LEN )
            srcLen--;

        wxUint32 code;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
        {
            // a surrogate pair occupies two source elements: account for the
            // second one in the explicit length too, failing if the pair
            // extends beyond the end of the input
            if ( srcLen != wxNO_LEN )
            {
                if ( !srcLen )
                    break;

                srcLen--;
            }

            // skip the next char too as we decoded a surrogate
            wp++;
        }
#else // wchar_t is UTF-32
        code = *wp & 0x7fffffff;
#endif

        unsigned len;
        if ( code <= 0x7F )
        {
            len = 1;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[0] = (char)code;
            }
        }
        else if ( code <= 0x07FF )
        {
            len = 2;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                // NB: this line takes 6 least significant bits, encodes them
                // as 10xxxxxx and discards them so that the next byte can be
                // encoded:
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xC0 | code;
            }
        }
        else if ( code <= 0xFFFF )
        {
            // NB: the upper bound is inclusive, U+FFFF itself still fits
            // into 3 bytes and using 4 for it would be an overlong encoding
            len = 3;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xE0 | code;
            }
        }
        else if ( code <= 0x10FFFF )
        {
            len = 4;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[3] = 0x80 | (code & 0x3F);  code >>= 6;
                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xF0 | code;
            }
        }
        else
        {
            wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
            break;
        }

        if ( out )
        {
            out += len;
            dstLen -= len;
        }

        written += len;
    }

    // we only get here if an error occurs during encoding
    return wxCONV_FAILED;
}
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
if ( m_options == MAP_INVALID_UTF8_NOT )
return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
@@ -785,7 +1045,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
else
{
#ifdef WC_UTF16
// cast is ok because wchar_t == wxUuint16 if WC_UTF16
// cast is ok because wchar_t == wxUint16 if WC_UTF16
size_t pa = encode_utf16(res, (wxUint16 *)buf);
if (pa == wxCONV_FAILED)
{
@@ -865,6 +1125,9 @@ static inline bool isoctal(wchar_t wch)
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
if ( m_options == MAP_INVALID_UTF8_NOT )
return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
size_t len = 0;
while (*psz && ((!buf) || (len < n)))
@@ -2903,7 +3166,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));