Add wxMBConvStrictUTF8 class implementing just UTF-8 conversion, without support for PUA/octal mappings, and use it for wxConvUTF8 as it's simpler and more efficient (~20% faster).
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -714,8 +714,268 @@ static wxUint32 utf8_max[]=
|
||||
const wxUint32 wxUnicodePUA = 0x100000;
|
||||
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
|
||||
|
||||
// Length, in bytes, of the UTF-8 sequence introduced by each possible lead
// byte; 0 marks a byte which can never start a sequence.
//
// The table is only ever read, so it is declared const with internal linkage.
// NOTE(review): confirm that no other translation unit refers to this symbol
// before relying on the internal linkage.
static const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 70..7F

    // 80..BF: continuation bytes, invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // B0..BF

    // C0 and C1 are invalid; C2..DF start two-byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   // D0..DF

    // E0..EF start three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   // E0..EF

    // F0..F4 start four-byte sequences; F5..FF are invalid again (5- or
    // 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // F0..FF
};
|
||||
|
||||
size_t
|
||||
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
const char *src, size_t srcLen) const
|
||||
{
|
||||
wchar_t *out = dstLen ? dst : NULL;
|
||||
size_t written = 0;
|
||||
|
||||
if ( srcLen == wxNO_LEN )
|
||||
srcLen = strlen(src) + 1;
|
||||
|
||||
for ( const char *p = src; ; p++ )
|
||||
{
|
||||
if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
|
||||
{
|
||||
// all done successfully, just add the trailing NULL if we are not
|
||||
// using explicit length
|
||||
if ( srcLen == wxNO_LEN )
|
||||
{
|
||||
if ( out )
|
||||
{
|
||||
if ( !dstLen )
|
||||
break;
|
||||
|
||||
*out = L'\0';
|
||||
}
|
||||
|
||||
written++;
|
||||
}
|
||||
|
||||
return written;
|
||||
}
|
||||
|
||||
unsigned char c = *p;
|
||||
unsigned len = tableUtf8Lengths[c];
|
||||
if ( !len )
|
||||
break;
|
||||
|
||||
if ( srcLen < len ) // the test works for wxNO_LEN too
|
||||
break;
|
||||
|
||||
if ( srcLen != wxNO_LEN )
|
||||
srcLen -= len;
|
||||
|
||||
if ( out && !dstLen-- )
|
||||
break;
|
||||
|
||||
|
||||
// Char. number range | UTF-8 octet sequence
|
||||
// (hexadecimal) | (binary)
|
||||
// ----------------------+---------------------------------------------
|
||||
// 0000 0000 - 0000 007F | 0xxxxxxx
|
||||
// 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
|
||||
// 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
||||
// 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
//
|
||||
// Code point value is stored in bits marked with 'x', lowest-order bit
|
||||
// of the value on the right side in the diagram above.
|
||||
// (from RFC 3629)
|
||||
|
||||
// mask to extract lead byte's value ('x' bits above), by sequence length:
|
||||
static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
|
||||
|
||||
// mask and value of lead byte's most significant bits, by length:
|
||||
static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
|
||||
static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
|
||||
|
||||
len--; // it's more convenient to work with 0-based length here
|
||||
|
||||
// extract the lead byte's value bits:
|
||||
if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
|
||||
break;
|
||||
|
||||
wxUint32 code = c & leadValueMask[len];
|
||||
|
||||
// all remaining bytes, if any, are handled in the same way regardless of
|
||||
// sequence's length:
|
||||
for ( ; len; --len )
|
||||
{
|
||||
c = *++p;
|
||||
if ( (c & 0xC0) != 0x80 )
|
||||
return wxCONV_FAILED;
|
||||
|
||||
code <<= 6;
|
||||
code |= c & 0x3F;
|
||||
}
|
||||
|
||||
#ifdef WC_UTF16
|
||||
// cast is ok because wchar_t == wxUint16 if WC_UTF16
|
||||
if ( encode_utf16(code, (wxUint16 *)out) == 2 )
|
||||
{
|
||||
if ( out )
|
||||
out++;
|
||||
written++;
|
||||
}
|
||||
#else // !WC_UTF16
|
||||
if ( out )
|
||||
*out = code;
|
||||
#endif // WC_UTF16/!WC_UTF16
|
||||
|
||||
if ( out )
|
||||
out++;
|
||||
|
||||
written++;
|
||||
}
|
||||
|
||||
return wxCONV_FAILED;
|
||||
}
|
||||
|
||||
size_t
|
||||
wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
|
||||
const wchar_t *src, size_t srcLen) const
|
||||
{
|
||||
char *out = dstLen ? dst : NULL;
|
||||
size_t written = 0;
|
||||
|
||||
for ( const wchar_t *wp = src; ; wp++ )
|
||||
{
|
||||
if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
|
||||
{
|
||||
// all done successfully, just add the trailing NULL if we are not
|
||||
// using explicit length
|
||||
if ( srcLen == wxNO_LEN )
|
||||
{
|
||||
if ( out )
|
||||
{
|
||||
if ( !dstLen )
|
||||
break;
|
||||
|
||||
*out = '\0';
|
||||
}
|
||||
|
||||
written++;
|
||||
}
|
||||
|
||||
return written;
|
||||
}
|
||||
|
||||
|
||||
wxUint32 code;
|
||||
#ifdef WC_UTF16
|
||||
// cast is ok for WC_UTF16
|
||||
if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
|
||||
{
|
||||
// skip the next char too as we decoded a surrogate
|
||||
wp++;
|
||||
}
|
||||
#else // wchar_t is UTF-32
|
||||
code = *wp & 0x7fffffff;
|
||||
#endif
|
||||
|
||||
unsigned len;
|
||||
if ( code <= 0x7F )
|
||||
{
|
||||
len = 1;
|
||||
if ( out )
|
||||
{
|
||||
if ( dstLen < len )
|
||||
break;
|
||||
|
||||
out[0] = (char)code;
|
||||
}
|
||||
}
|
||||
else if ( code <= 0x07FF )
|
||||
{
|
||||
len = 2;
|
||||
if ( out )
|
||||
{
|
||||
if ( dstLen < len )
|
||||
break;
|
||||
|
||||
// NB: this line takes 6 least significant bits, encodes them as
|
||||
// 10xxxxxx and discards them so that the next byte can be encoded:
|
||||
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[0] = 0xC0 | code;
|
||||
}
|
||||
}
|
||||
else if ( code < 0xFFFF )
|
||||
{
|
||||
len = 3;
|
||||
if ( out )
|
||||
{
|
||||
if ( dstLen < len )
|
||||
break;
|
||||
|
||||
out[2] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[0] = 0xE0 | code;
|
||||
}
|
||||
}
|
||||
else if ( code <= 0x10FFFF )
|
||||
{
|
||||
len = 4;
|
||||
if ( out )
|
||||
{
|
||||
if ( dstLen < len )
|
||||
break;
|
||||
|
||||
out[3] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[2] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||
out[0] = 0xF0 | code;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
|
||||
break;
|
||||
}
|
||||
|
||||
if ( out )
|
||||
{
|
||||
out += len;
|
||||
dstLen -= len;
|
||||
}
|
||||
|
||||
written += len;
|
||||
}
|
||||
|
||||
// we only get here if an error occurs during decoding
|
||||
return wxCONV_FAILED;
|
||||
}
|
||||
|
||||
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
||||
{
|
||||
if ( m_options == MAP_INVALID_UTF8_NOT )
|
||||
return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
|
||||
|
||||
size_t len = 0;
|
||||
|
||||
while (*psz && ((!buf) || (len < n)))
|
||||
@@ -785,7 +1045,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
||||
else
|
||||
{
|
||||
#ifdef WC_UTF16
|
||||
// cast is ok because wchar_t == wxUuint16 if WC_UTF16
|
||||
// cast is ok because wchar_t == wxUint16 if WC_UTF16
|
||||
size_t pa = encode_utf16(res, (wxUint16 *)buf);
|
||||
if (pa == wxCONV_FAILED)
|
||||
{
|
||||
@@ -865,6 +1125,9 @@ static inline bool isoctal(wchar_t wch)
|
||||
|
||||
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
||||
{
|
||||
if ( m_options == MAP_INVALID_UTF8_NOT )
|
||||
return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
|
||||
|
||||
size_t len = 0;
|
||||
|
||||
while (*psz && ((!buf) || (len < n)))
|
||||
@@ -2903,7 +3166,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
|
||||
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
|
||||
#endif
|
||||
|
||||
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
|
||||
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
|
||||
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
|
||||
|
||||
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
|
||||
|
Reference in New Issue
Block a user