From b3eff48e28f4d1f7a7ea56f10522473cb150aff9 Mon Sep 17 00:00:00 2001
From: Pavel Tyunin <pavel51tunin@gmail.com>
Date: Tue, 29 Sep 2020 15:35:53 +0300
Subject: [PATCH] Switch to fallback earlier if the input is not a valid
 UTF-8 prefix

---
 include/wx/strconv.h          |  2 ++
 src/common/convauto.cpp       |  2 +-
 src/common/strconv.cpp        | 20 ++++++++++++++++++++
 tests/mbconv/convautotest.cpp |  5 ++---
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/wx/strconv.h b/include/wx/strconv.h
index c1b070d36a..21c5f136b1 100644
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -387,6 +387,8 @@ private:
     int m_options;
 };
 
+bool wxIsUTF8Prefix(const char *src, size_t len); // true if the len bytes at src can be the beginning of a valid UTF-8 string
+
 // ----------------------------------------------------------------------------
 // wxMBConvUTF16Base: for both LE and BE variants
 // ----------------------------------------------------------------------------
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp
index 50c5a956c7..7b92d396f6 100644
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
         // to the fall-back conversion in this case as it would prevent us from
         // decoding UTF-8 input when fed it byte by byte, as done by
         // wxTextInputStream, for example
-        if ( srcLen < m_conv->GetMaxCharLen() )
+        if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
             return wxCONV_FAILED;
 
         // if the conversion failed but we didn't really detect anything and
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index ba25dae157..04f6e451ec 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -1446,6 +1446,26 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
     return len;
 }
 
+// Returns true if the len bytes at src can be the beginning of a valid UTF-8 string; a sequence cut short by the end of the buffer is accepted.
+bool wxIsUTF8Prefix(const char *src, size_t len)
+{
+    unsigned char l; // bytes in the current sequence, counted down as continuation bytes are consumed
+    for ( size_t i = 0; i < len; ++i )
+    {
+        l = tableUtf8Lengths[(unsigned char)src[i]]; // length looked up by lead byte (table defined outside this hunk)
+        if ( !l )
+            return false; // invalid leading byte
+        while ( --l ) // one iteration per continuation byte still expected
+        {
+            if ( ++i == len ) // input ended in the middle of a sequence
+                return true; // truncated sequence: still counts as a possible prefix
+            if ( (src[i] & 0xC0) != 0x80 ) // continuation bytes must look like 10xxxxxx
+                return false; // invalid continuation byte
+        }
+    }
+    return true; // all bytes consumed without meeting an invalid one (also the case for len == 0)
+}
+
 // ============================================================================
 // UTF-16
 // ============================================================================
diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp
index 91b940a35b..789e7582aa 100644
--- a/tests/mbconv/convautotest.cpp
+++ b/tests/mbconv/convautotest.cpp
@@ -288,9 +288,8 @@ void ConvAutoTestCase::StreamUTF32BE()
 
 void ConvAutoTestCase::StreamFallback()
 {
-    // this only works if there are at least 3 bytes after the first non-ASCII character
-    TestTextStream("\x61\xbf\x0A\xe0\x7a",
-                   5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"),
+    TestTextStream("\x61\xbf\x0A\xe0", // input is not valid UTF-8 and ends on a non-ASCII byte, with nothing after it
+                   4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"),
                    wxFONTENCODING_ISO8859_5);
 }