Use tableUtf8Lengths[] in stringops.cpp too

This commit is contained in:
Pavel Tyunin
2020-10-07 15:41:15 +03:00
parent 240fcee90e
commit b536457e07
2 changed files with 15 additions and 40 deletions

View File

@@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4); return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
} }
// table of offsets to skip forward when iterating over UTF-8 sequence // returns offset to skip forward when iterating over UTF-8 sequence
static const unsigned char ms_utf8IterTable[256]; static unsigned char GetUTF8IterOffset(unsigned char c);
template<typename Iterator> template<typename Iterator>
static void IncIter(Iterator& i) static void IncIter(Iterator& i)
{ {
wxASSERT( IsValidUtf8LeadByte(*i) ); wxASSERT( IsValidUtf8LeadByte(*i) );
i += ms_utf8IterTable[(unsigned char)*i]; i += GetUTF8IterOffset(*i);
} }
template<typename Iterator> template<typename Iterator>
@@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
static size_t GetUtf8CharLength(char c) static size_t GetUtf8CharLength(char c)
{ {
wxASSERT( IsValidUtf8LeadByte(c) ); wxASSERT( IsValidUtf8LeadByte(c) );
return ms_utf8IterTable[(unsigned char)c]; return GetUTF8IterOffset(c);
} }
// decodes single UTF-8 character from UTF-8 string // decodes single UTF-8 character from UTF-8 string

View File

@@ -23,6 +23,8 @@
#include "wx/stringops.h" #include "wx/stringops.h"
#endif #endif
#include "wx/private/unicode.h"
// =========================================================================== // ===========================================================================
// implementation // implementation
// =========================================================================== // ===========================================================================
@@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar&
// UTF-8 sequences lengths // UTF-8 sequences lengths
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { static unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c)
// single-byte sequences (ASCII): {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F unsigned char l = tableUtf8Lengths[c];
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F if ( !l ) //skip over invalid characters
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F l = 1;
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F return l;
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F }
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
// these are invalid, we use step 1 to skip
// over them (should never happen):
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF
1, 1, // C0,C1
// two-byte sequences:
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
// three-byte sequences:
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
// four-byte sequences:
4, 4, 4, 4, 4, // F0..F4
// these are invalid again (5- or 6-byte
// sequences and sequences for code points
// above U+10FFFF, as restricted by RFC 3629):
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF
};
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// UTF-8 operations // UTF-8 operations
@@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
{ {
// if the string is not NULL-terminated, verify we have enough // if the string is not NULL-terminated, verify we have enough
// bytes in it left for current character's encoding: // bytes in it left for current character's encoding:
if ( c + ms_utf8IterTable[*c] > end ) if ( c + GetUTF8IterOffset(*c) > end )
return false; return false;
} }
@@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
{ {
Utf8CharBuffer once(EncodeChar(ch)); Utf8CharBuffer once(EncodeChar(ch));
// the IncIter() table can be used to determine the length of ch's encoding: // the IncIter() table can be used to determine the length of ch's encoding:
size_t len = ms_utf8IterTable[(unsigned char)once.data[0]]; size_t len = GetUTF8IterOffset(once.data[0]);
wxCharBuffer buf(n * len); wxCharBuffer buf(n * len);
char *ptr = buf.data(); char *ptr = buf.data();