Replace wxStringOperationsUtf8::ms_utf8IterTable with GetUTF8IterOffset()

The 256-entry ms_utf8IterTable is removed. The number of bytes to step over
is now read from tableUtf8Lengths (presumably provided by the newly included
wx/private/unicode.h), with a zero entry mapped to 1 so that invalid
characters are still skipped one byte at a time, as the old table did.

The out-of-class definition of GetUTF8IterOffset() does not repeat "static":
that specifier is only permitted on the declaration inside the struct.

diff --git a/include/wx/stringops.h b/include/wx/stringops.h
index 150554f341..dd46e6616c 100644
--- a/include/wx/stringops.h
+++ b/include/wx/stringops.h
@@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
         return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
     }
 
-    // table of offsets to skip forward when iterating over UTF-8 sequence
-    static const unsigned char ms_utf8IterTable[256];
+    // returns offset to skip forward when iterating over UTF-8 sequence
+    static unsigned char GetUTF8IterOffset(unsigned char c);
 
 
     template<typename Iterator>
     static void IncIter(Iterator& i)
     {
         wxASSERT( IsValidUtf8LeadByte(*i) );
-        i += ms_utf8IterTable[(unsigned char)*i];
+        i += GetUTF8IterOffset(*i);
     }
 
     template<typename Iterator>
@@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
     static size_t GetUtf8CharLength(char c)
     {
         wxASSERT( IsValidUtf8LeadByte(c) );
-        return ms_utf8IterTable[(unsigned char)c];
+        return GetUTF8IterOffset(c);
     }
 
     // decodes single UTF-8 character from UTF-8 string
diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp
index 85629406a3..7cedab7cc0 100644
--- a/src/common/stringops.cpp
+++ b/src/common/stringops.cpp
@@ -23,6 +23,8 @@
     #include "wx/stringops.h"
 #endif
 
+#include "wx/private/unicode.h"
+
 // ===========================================================================
 // implementation
 // ===========================================================================
@@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar&
 // UTF-8 sequences lengths
 // ---------------------------------------------------------------------------
 
-const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
-    // single-byte sequences (ASCII):
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
-
-    // these are invalid, we use step 1 to skip
-    // over them (should never happen):
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF
-    1, 1, // C0,C1
-
-    // two-byte sequences:
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
-
-    // three-byte sequences:
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
-
-    // four-byte sequences:
-    4, 4, 4, 4, 4, // F0..F4
-
-    // these are invalid again (5- or 6-byte
-    // sequences and sequences for code points
-    // above U+10FFFF, as restricted by RFC 3629):
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF
-};
+unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c)
+{
+    unsigned char l = tableUtf8Lengths[c];
+    if ( !l ) // skip over invalid characters
+        l = 1;
+    return l;
+}
 // ---------------------------------------------------------------------------
 // UTF-8 operations
 // ---------------------------------------------------------------------------
@@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
         {
             // if the string is not NULL-terminated, verify we have enough
             // bytes in it left for current character's encoding:
-            if ( c + ms_utf8IterTable[*c] > end )
+            if ( c + GetUTF8IterOffset(*c) > end )
                 return false;
         }
 
@@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 {
     Utf8CharBuffer once(EncodeChar(ch));
     // the IncIter() table can be used to determine the length of ch's encoding:
-    size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
+    size_t len = GetUTF8IterOffset(once.data[0]);
 
     wxCharBuffer buf(n * len);
     char *ptr = buf.data();