Add wxMBConvStrictUTF8 class implementing just UTF-8 conversion, without support for PUA/octal mappings, and use it for wxConvUTF8 as it's simpler and more efficient (~20% faster)
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@47703 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -257,11 +257,31 @@ public:
|
|||||||
// wxMBConvUTF8 (for conversion using UTF8 encoding)
|
// wxMBConvUTF8 (for conversion using UTF8 encoding)
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
|
// this is the real UTF-8 conversion class, it has to be called "strict UTF-8"
|
||||||
|
// for compatibility reasons: the wxMBConvUTF8 class below also supports lossy
|
||||||
|
// conversions if it is created with non default options
|
||||||
|
class WXDLLIMPEXP_BASE wxMBConvStrictUTF8 : public wxMBConv
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// compiler-generated default ctor and other methods are ok
|
||||||
|
|
||||||
|
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
||||||
|
const char *src, size_t srcLen = wxNO_LEN) const;
|
||||||
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const;
|
||||||
|
|
||||||
|
virtual wxMBConv *Clone() const { return new wxMBConvStrictUTF8(); }
|
||||||
|
|
||||||
|
#if wxUSE_UNICODE_UTF8
|
||||||
|
// NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
|
||||||
|
// take the shortcut in that case
|
||||||
|
virtual bool IsUTF8() const { return true; }
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConvStrictUTF8
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// FIXME-UTF8: split this class into multiple classes, one strict and
|
|
||||||
// other lossy (PUA, OCTAL mappings)
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
MAP_INVALID_UTF8_NOT = 0,
|
MAP_INVALID_UTF8_NOT = 0,
|
||||||
@@ -470,7 +490,7 @@ WX_DECLARE_GLOBAL_CONV(wxMBConv, wxConvLibc)
|
|||||||
WX_DECLARE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1)
|
WX_DECLARE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1)
|
||||||
#define wxConvISO8859_1 wxGet_wxConvISO8859_1()
|
#define wxConvISO8859_1 wxGet_wxConvISO8859_1()
|
||||||
|
|
||||||
WX_DECLARE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8)
|
WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
|
||||||
#define wxConvUTF8 wxGet_wxConvUTF8()
|
#define wxConvUTF8 wxGet_wxConvUTF8()
|
||||||
|
|
||||||
WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
|
WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
|
||||||
|
@@ -714,8 +714,268 @@ static wxUint32 utf8_max[]=
|
|||||||
const wxUint32 wxUnicodePUA = 0x100000;
|
const wxUint32 wxUnicodePUA = 0x100000;
|
||||||
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
|
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
|
||||||
|
|
||||||
|
// Length in bytes of a UTF-8 sequence, indexed by the value of its first
// byte. An entry of 0 marks a byte that cannot start a sequence.
//
// The table is read-only and private to this file, hence "static const": it
// must never be modified at run time and its generic name should not leak
// into the global namespace of the library.
static const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid as a first byte: 80..BF are continuation bytes, C0
    // and C1 could only start over-long encodings of ASCII characters
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte sequences and sequences for code
    // points above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
|
||||||
|
|
||||||
|
size_t
|
||||||
|
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
|
||||||
|
const char *src, size_t srcLen) const
|
||||||
|
{
|
||||||
|
wchar_t *out = dstLen ? dst : NULL;
|
||||||
|
size_t written = 0;
|
||||||
|
|
||||||
|
if ( srcLen == wxNO_LEN )
|
||||||
|
srcLen = strlen(src) + 1;
|
||||||
|
|
||||||
|
for ( const char *p = src; ; p++ )
|
||||||
|
{
|
||||||
|
if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
|
||||||
|
{
|
||||||
|
// all done successfully, just add the trailing NULL if we are not
|
||||||
|
// using explicit length
|
||||||
|
if ( srcLen == wxNO_LEN )
|
||||||
|
{
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( !dstLen )
|
||||||
|
break;
|
||||||
|
|
||||||
|
*out = L'\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
written++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return written;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned char c = *p;
|
||||||
|
unsigned len = tableUtf8Lengths[c];
|
||||||
|
if ( !len )
|
||||||
|
break;
|
||||||
|
|
||||||
|
if ( srcLen < len ) // the test works for wxNO_LEN too
|
||||||
|
break;
|
||||||
|
|
||||||
|
if ( srcLen != wxNO_LEN )
|
||||||
|
srcLen -= len;
|
||||||
|
|
||||||
|
if ( out && !dstLen-- )
|
||||||
|
break;
|
||||||
|
|
||||||
|
|
||||||
|
// Char. number range | UTF-8 octet sequence
|
||||||
|
// (hexadecimal) | (binary)
|
||||||
|
// ----------------------+---------------------------------------------
|
||||||
|
// 0000 0000 - 0000 007F | 0xxxxxxx
|
||||||
|
// 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
|
||||||
|
// 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
||||||
|
// 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
//
|
||||||
|
// Code point value is stored in bits marked with 'x', lowest-order bit
|
||||||
|
// of the value on the right side in the diagram above.
|
||||||
|
// (from RFC 3629)
|
||||||
|
|
||||||
|
// mask to extract lead byte's value ('x' bits above), by sequence length:
|
||||||
|
static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
|
||||||
|
|
||||||
|
// mask and value of lead byte's most significant bits, by length:
|
||||||
|
static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
|
||||||
|
static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
|
||||||
|
|
||||||
|
len--; // it's more convenient to work with 0-based length here
|
||||||
|
|
||||||
|
// extract the lead byte's value bits:
|
||||||
|
if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
|
||||||
|
break;
|
||||||
|
|
||||||
|
wxUint32 code = c & leadValueMask[len];
|
||||||
|
|
||||||
|
// all remaining bytes, if any, are handled in the same way regardless of
|
||||||
|
// sequence's length:
|
||||||
|
for ( ; len; --len )
|
||||||
|
{
|
||||||
|
c = *++p;
|
||||||
|
if ( (c & 0xC0) != 0x80 )
|
||||||
|
return wxCONV_FAILED;
|
||||||
|
|
||||||
|
code <<= 6;
|
||||||
|
code |= c & 0x3F;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef WC_UTF16
|
||||||
|
// cast is ok because wchar_t == wxUint16 if WC_UTF16
|
||||||
|
if ( encode_utf16(code, (wxUint16 *)out) == 2 )
|
||||||
|
{
|
||||||
|
if ( out )
|
||||||
|
out++;
|
||||||
|
written++;
|
||||||
|
}
|
||||||
|
#else // !WC_UTF16
|
||||||
|
if ( out )
|
||||||
|
*out = code;
|
||||||
|
#endif // WC_UTF16/!WC_UTF16
|
||||||
|
|
||||||
|
if ( out )
|
||||||
|
out++;
|
||||||
|
|
||||||
|
written++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return wxCONV_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t
|
||||||
|
wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
|
||||||
|
const wchar_t *src, size_t srcLen) const
|
||||||
|
{
|
||||||
|
char *out = dstLen ? dst : NULL;
|
||||||
|
size_t written = 0;
|
||||||
|
|
||||||
|
for ( const wchar_t *wp = src; ; wp++ )
|
||||||
|
{
|
||||||
|
if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
|
||||||
|
{
|
||||||
|
// all done successfully, just add the trailing NULL if we are not
|
||||||
|
// using explicit length
|
||||||
|
if ( srcLen == wxNO_LEN )
|
||||||
|
{
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( !dstLen )
|
||||||
|
break;
|
||||||
|
|
||||||
|
*out = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
written++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return written;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
wxUint32 code;
|
||||||
|
#ifdef WC_UTF16
|
||||||
|
// cast is ok for WC_UTF16
|
||||||
|
if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
|
||||||
|
{
|
||||||
|
// skip the next char too as we decoded a surrogate
|
||||||
|
wp++;
|
||||||
|
}
|
||||||
|
#else // wchar_t is UTF-32
|
||||||
|
code = *wp & 0x7fffffff;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
unsigned len;
|
||||||
|
if ( code <= 0x7F )
|
||||||
|
{
|
||||||
|
len = 1;
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( dstLen < len )
|
||||||
|
break;
|
||||||
|
|
||||||
|
out[0] = (char)code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( code <= 0x07FF )
|
||||||
|
{
|
||||||
|
len = 2;
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( dstLen < len )
|
||||||
|
break;
|
||||||
|
|
||||||
|
// NB: this line takes 6 least significant bits, encodes them as
|
||||||
|
// 10xxxxxx and discards them so that the next byte can be encoded:
|
||||||
|
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[0] = 0xC0 | code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( code < 0xFFFF )
|
||||||
|
{
|
||||||
|
len = 3;
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( dstLen < len )
|
||||||
|
break;
|
||||||
|
|
||||||
|
out[2] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[0] = 0xE0 | code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( code <= 0x10FFFF )
|
||||||
|
{
|
||||||
|
len = 4;
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
if ( dstLen < len )
|
||||||
|
break;
|
||||||
|
|
||||||
|
out[3] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[2] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[1] = 0x80 | (code & 0x3F); code >>= 6;
|
||||||
|
out[0] = 0xF0 | code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( out )
|
||||||
|
{
|
||||||
|
out += len;
|
||||||
|
dstLen -= len;
|
||||||
|
}
|
||||||
|
|
||||||
|
written += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we only get here if an error occurs during decoding
|
||||||
|
return wxCONV_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
||||||
{
|
{
|
||||||
|
if ( m_options == MAP_INVALID_UTF8_NOT )
|
||||||
|
return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
|
||||||
|
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
|
||||||
while (*psz && ((!buf) || (len < n)))
|
while (*psz && ((!buf) || (len < n)))
|
||||||
@@ -785,7 +1045,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
#ifdef WC_UTF16
|
#ifdef WC_UTF16
|
||||||
// cast is ok because wchar_t == wxUuint16 if WC_UTF16
|
// cast is ok because wchar_t == wxUint16 if WC_UTF16
|
||||||
size_t pa = encode_utf16(res, (wxUint16 *)buf);
|
size_t pa = encode_utf16(res, (wxUint16 *)buf);
|
||||||
if (pa == wxCONV_FAILED)
|
if (pa == wxCONV_FAILED)
|
||||||
{
|
{
|
||||||
@@ -865,6 +1125,9 @@ static inline bool isoctal(wchar_t wch)
|
|||||||
|
|
||||||
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
||||||
{
|
{
|
||||||
|
if ( m_options == MAP_INVALID_UTF8_NOT )
|
||||||
|
return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
|
||||||
|
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
|
||||||
while (*psz && ((!buf) || (len < n)))
|
while (*psz && ((!buf) || (len < n)))
|
||||||
@@ -2903,7 +3166,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
|
|||||||
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
|
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
|
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
|
||||||
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
|
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
|
||||||
|
|
||||||
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
|
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
|
||||||
|
Reference in New Issue
Block a user