Use tableUtf8Lengths[] in stringops.cpp too

2020-10-07 15:41:15 +03:00
parent 240fcee90e
commit b536457e07
2 changed files with 15 additions and 40 deletions
--- a/include/wx/stringops.h
+++ b/include/wx/stringops.h
@@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
        return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
    }

-    // table of offsets to skip forward when iterating over UTF-8 sequence
-    static const unsigned char ms_utf8IterTable[256];
+    // returns offset to skip forward when iterating over UTF-8 sequence
+    static unsigned char GetUTF8IterOffset(unsigned char c);


    template<typename Iterator>
    static void IncIter(Iterator& i)
    {
        wxASSERT( IsValidUtf8LeadByte(*i) );
-        i += ms_utf8IterTable[(unsigned char)*i];
+        i += GetUTF8IterOffset(*i);
    }

    template<typename Iterator>
@@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
    static size_t GetUtf8CharLength(char c)
    {
        wxASSERT( IsValidUtf8LeadByte(c) );
-        return ms_utf8IterTable[(unsigned char)c];
+        return GetUTF8IterOffset(c);
    }

    // decodes single UTF-8 character from UTF-8 string
--- a/src/common/stringops.cpp
+++ b/src/common/stringops.cpp
@@ -23,6 +23,8 @@
    #include "wx/stringops.h"
 #endif

+#include "wx/private/unicode.h"
+
 // ===========================================================================
 // implementation
 // ===========================================================================
@@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar&
 // UTF-8 sequences lengths
 // ---------------------------------------------------------------------------

-const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
-    // single-byte sequences (ASCII):
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
-
-    // these are invalid, we use step 1 to skip
-    // over them (should never happen):
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80..8F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 90..9F
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A0..AF
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B0..BF
-    1, 1,                                            // C0,C1
-
-    // two-byte sequences:
-          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
-
-    // three-byte sequences:
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
-
-    // four-byte sequences:
-    4, 4, 4, 4, 4,                                   // F0..F4
-
-    // these are invalid again (5- or 6-byte
-    // sequences and sequences for code points
-    // above U+10FFFF, as restricted by RFC 3629):
-                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   // F5..FF
-};
+unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c)
+{
+    unsigned char l = tableUtf8Lengths[c];
+    if ( !l ) // 0 entry: invalid character, skip over it one byte at a time
+        l = 1;
+    return l;
+}

 // ---------------------------------------------------------------------------
 // UTF-8 operations
@@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
        {
            // if the string is not NULL-terminated, verify we have enough
            // bytes in it left for current character's encoding:
-            if ( c + ms_utf8IterTable[*c] > end )
+            if ( c + GetUTF8IterOffset(*c) > end )
                return false;
        }

@@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
 {
    Utf8CharBuffer once(EncodeChar(ch));
    // the IncIter() table can be used to determine the length of ch's encoding:
-    size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
+    size_t len = GetUTF8IterOffset(once.data[0]);

    wxCharBuffer buf(n * len);
    char *ptr = buf.data();