From c9dd9e96a1bc834fad23435242bc83b793d987ed Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 7 Oct 2020 17:31:42 +0300 Subject: [PATCH] Allow decoding even shorter strings in fallback encoding Complete UTF-8 characters (except leading nulls) never appear in failed decoding attempts when the input is fed byte by byte. --- src/common/convauto.cpp | 30 ++++++++++++++---------------- tests/mbconv/convautotest.cpp | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 8778295207..708096de5d 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -267,24 +267,21 @@ bool wxConvAuto::InitFromInput(const char *src, size_t len) return true; } -// checks if the input can be the beginning of a valid UTF-8 string -static bool wxIsUTF8Prefix(const char *src, size_t len) +// checks if the input can be the beginning of a valid UTF-8 sequence +static bool wxCanBeUTF8SequencePrefix(const char *src, size_t len) { - unsigned char l; - for ( size_t i = 0; i < len; ++i ) + size_t i = 0; + unsigned char l = tableUtf8Lengths[(unsigned char)src[i]]; + if ( !l ) + return false; // invalid leading byte + while ( --l ) { - l = tableUtf8Lengths[(unsigned char)src[i]]; - if ( !l ) - return false; // invalid leading byte - while ( --l ) - { - if ( ++i == len ) - return true; // truncated sequence - if ( (src[i] & 0xC0) != 0x80 ) - return false; // invalid continuation byte - } + if ( ++i == len ) + return true; // truncated sequence + if ( (src[i] & 0xC0) != 0x80 ) + return false; // invalid continuation byte } - return true; + return false; // complete sequence } size_t @@ -339,7 +336,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, size_t nNull = 0; if ( srcLen != wxNO_LEN && srcLen >= 2 && !src[0] ) nNull = ( src[1]? 1 : 2 ); - if ( srcLen < nNull + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) + if ( srcLen < nNull + m_conv->GetMaxCharLen() && + wxCanBeUTF8SequencePrefix(src + nNull, srcLen - nNull) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 27839f93a5..12e19c21ed 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -226,7 +226,7 @@ void ConvAutoTestCase::FallbackMultibyte() void ConvAutoTestCase::FallbackShort() { - TestFirstChar("\x61\x61\x61\xc4", 'a', 4, + TestFirstChar("\x61\xc4", 'a', 2, ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), wxFONTENCODING_ISO8859_5); }