From b3eff48e28f4d1f7a7ea56f10522473cb150aff9 Mon Sep 17 00:00:00 2001
From: Pavel Tyunin
Date: Tue, 29 Sep 2020 15:35:53 +0300
Subject: [PATCH] Switch to fallback earlier if the input is not valid UTF-8 prefix

---
// Reviewer notes. These lines sit between the "---" separator and the first
// "diff --git" header, i.e. outside the commit message and outside every
// hunk; none of the hunks below is altered by them.
//
// Purpose: wxConvAuto::ToWChar() has an early "return wxCONV_FAILED" for
// inputs shorter than m_conv->GetMaxCharLen(), so that (per the surrounding
// source comment) it does not switch to the fall-back conversion while UTF-8
// input is still being fed byte by byte. This patch keeps that early return
// only while the bytes seen so far can still be the beginning of valid UTF-8.
//
// include/wx/strconv.h
//   Declares the new free function
//   bool wxIsUTF8Prefix(const char *src, size_t len).
//
// src/common/convauto.cpp
//   The early-return condition changes from
//       srcLen < m_conv->GetMaxCharLen()
//   to
//       srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen)
//   so a short input that fails the prefix check now falls through to the
//   code following the early return.
//   NOTE(review): that code and the enclosing condition are outside this
//   hunk; presumably this is the fall-back handling -- confirm in the tree.
//
// src/common/strconv.cpp
//   Defines wxIsUTF8Prefix(). It walks the len bytes of src one sequence at
//   a time:
//     * the first byte of a sequence indexes tableUtf8Lengths (defined
//       outside this patch); a 0 entry makes the function return false;
//     * each of the remaining (length - 1) bytes must satisfy
//       (byte & 0xC0) == 0x80, otherwise the function returns false;
//     * if the input ends in the middle of a sequence it returns true;
//     * if every sequence is complete, or len is 0, it returns true.
//   The function itself performs no checks beyond these.
//
// tests/mbconv/convautotest.cpp
//   StreamFallback drops the trailing "\x7a" input byte (length 5 -> 4) and
//   the matching "z" from the expected second string, together with the
//   comment saying at least 3 bytes were needed after the first non-ASCII
//   character.

 include/wx/strconv.h          |  2 ++
 src/common/convauto.cpp       |  2 +-
 src/common/strconv.cpp        | 20 ++++++++++++++++++++
 tests/mbconv/convautotest.cpp |  5 ++---
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/wx/strconv.h b/include/wx/strconv.h
index c1b070d36a..21c5f136b1 100644
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -387,6 +387,8 @@ private:
     int m_options;
 };
 
+bool wxIsUTF8Prefix(const char *src, size_t len);
+
 // ----------------------------------------------------------------------------
 // wxMBConvUTF16Base: for both LE and BE variants
 // ----------------------------------------------------------------------------
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp
index 50c5a956c7..7b92d396f6 100644
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
         // to the fall-back conversion in this case as it would prevent us from
         // decoding UTF-8 input when fed it byte by byte, as done by
         // wxTextInputStream, for example
-        if ( srcLen < m_conv->GetMaxCharLen() )
+        if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
             return wxCONV_FAILED;
 
         // if the conversion failed but we didn't really detect anything and
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index ba25dae157..04f6e451ec 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -1446,6 +1446,26 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
     return len;
 }
 
+// checks if the input can be the beginning of a valid UTF-8 string
+bool wxIsUTF8Prefix(const char *src, size_t len)
+{
+    unsigned char l;
+    for ( size_t i = 0; i < len; ++i )
+    {
+        l = tableUtf8Lengths[(unsigned char)src[i]];
+        if ( !l )
+            return false; // invalid leading byte
+        while ( --l )
+        {
+            if ( ++i == len )
+                return true; // truncated sequence
+            if ( (src[i] & 0xC0) != 0x80 )
+                return false; // invalid continuation byte
+        }
+    }
+    return true;
+}
+
 // ============================================================================
 // UTF-16
 // ============================================================================
diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp
index 91b940a35b..789e7582aa 100644
--- a/tests/mbconv/convautotest.cpp
+++ b/tests/mbconv/convautotest.cpp
@@ -288,9 +288,8 @@ void ConvAutoTestCase::StreamUTF32BE()
 
 void ConvAutoTestCase::StreamFallback()
 {
-    // this only works if there are at least 3 bytes after the first non-ASCII character
-    TestTextStream("\x61\xbf\x0A\xe0\x7a",
-                   5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"),
+    TestTextStream("\x61\xbf\x0A\xe0",
+                   4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"),
                    wxFONTENCODING_ISO8859_5);
 }
 