Switch to fallback earlier if the input is not a valid UTF-8 prefix

2020-09-29 15:35:53 +03:00
parent bc838b4773
commit b3eff48e28
4 changed files with 25 additions and 4 deletions
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -387,6 +387,8 @@ private:
    int m_options;
 };
 // Returns true if the len bytes starting at src can be the beginning of a
 // valid UTF-8 string; input that ends in the middle of a multi-byte sequence
 // still counts as a prefix.
 bool wxIsUTF8Prefix(const char *src, size_t len);
 // ----------------------------------------------------------------------------
 // wxMBConvUTF16Base: for both LE and BE variants
 // ----------------------------------------------------------------------------
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
        // to the fall-back conversion in this case as it would prevent us from
        // decoding UTF-8 input when fed it byte by byte, as done by
        // wxTextInputStream, for example
-        if ( srcLen < m_conv->GetMaxCharLen() )
+        if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
            return wxCONV_FAILED;
        // if the conversion failed but we didn't really detect anything and
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -1446,6 +1446,26 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
    return len;
 }
 // Tells whether the len bytes at src could be the start of a valid UTF-8
 // string.
 //
 // Every lead byte is looked up in tableUtf8Lengths to get the total length of
 // its sequence (a zero entry marks an invalid lead byte) and each following
 // byte of the sequence must have the 10xxxxxx continuation pattern. Input that
 // stops in the middle of a sequence is accepted, because more bytes may still
 // arrive. Empty input is accepted too.
 bool wxIsUTF8Prefix(const char *src, size_t len)
 {
    size_t pos = 0;
    while ( pos < len )
    {
        // total number of bytes in the sequence introduced by this lead byte
        const unsigned char seqLen = tableUtf8Lengths[(unsigned char)src[pos]];
        if ( seqLen == 0 )
            return false; // invalid leading byte

        // step past the lead byte, then verify its continuation bytes
        ++pos;
        for ( unsigned char k = 1; k < seqLen; ++k, ++pos )
        {
            if ( pos == len )
                return true; // truncated sequence

            if ( (src[pos] & 0xC0) != 0x80 )
                return false; // invalid continuation byte
        }
    }

    return true;
 }
 // ============================================================================
 // UTF-16
 // ============================================================================
--- a/tests/mbconv/convautotest.cpp
+++ b/tests/mbconv/convautotest.cpp
@@ -288,9 +288,8 @@ void ConvAutoTestCase::StreamUTF32BE()
 void ConvAutoTestCase::StreamFallback()
 {
-    // this only works if there are at least 3 bytes after the first non-ASCII character
+    TestTextStream("\x61\xbf\x0A\xe0",
-    TestTextStream("\x61\xbf\x0A\xe0\x7a",
+                   4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"),
                   5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"),
                   wxFONTENCODING_ISO8859_5);
 }