Switch to fallback earlier if the input is not a valid UTF-8 prefix

This commit is contained in:
Pavel Tyunin
2020-09-29 15:35:53 +03:00
parent bc838b4773
commit b3eff48e28
4 changed files with 25 additions and 4 deletions

View File

@@ -387,6 +387,8 @@ private:
int m_options; int m_options;
}; };
// Checks whether the first len bytes of src can be the beginning of a valid
// UTF-8 string: returns false on an invalid leading or continuation byte and
// true otherwise, including when the last multi-byte sequence is truncated.
bool wxIsUTF8Prefix(const char *src, size_t len);
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// wxMBConvUTF16Base: for both LE and BE variants // wxMBConvUTF16Base: for both LE and BE variants
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------

View File

@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
// to the fall-back conversion in this case as it would prevent us from // to the fall-back conversion in this case as it would prevent us from
// decoding UTF-8 input when fed it byte by byte, as done by // decoding UTF-8 input when fed it byte by byte, as done by
// wxTextInputStream, for example // wxTextInputStream, for example
if ( srcLen < m_conv->GetMaxCharLen() ) if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
return wxCONV_FAILED; return wxCONV_FAILED;
// if the conversion failed but we didn't really detect anything and // if the conversion failed but we didn't really detect anything and

View File

@@ -1446,6 +1446,26 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
return len; return len;
} }
// Tells whether the len bytes starting at src could be the start of a valid
// UTF-8 string.
//
// Returns false as soon as a byte which can't start a sequence or a byte
// which is not a valid continuation byte is found. Returns true if all the
// sequences are well-formed, including the case when the input ends in the
// middle of a multi-byte sequence (more bytes could still complete it).
bool wxIsUTF8Prefix(const char *src, size_t len)
{
    size_t pos = 0;
    while ( pos < len )
    {
        // the table yields the total length of the sequence started by this
        // byte, 0 meaning that the byte can't start any sequence
        const unsigned char seqLen =
            tableUtf8Lengths[static_cast<unsigned char>(src[pos])];
        if ( seqLen == 0 )
            return false; // invalid leading byte

        // skip the leading byte, then verify the seqLen - 1 bytes following it
        ++pos;
        for ( unsigned char k = 1; k < seqLen; ++k, ++pos )
        {
            if ( pos == len )
                return true; // truncated sequence

            // continuation bytes must be of the form 10xxxxxx
            if ( (static_cast<unsigned char>(src[pos]) & 0xC0) != 0x80 )
                return false; // invalid continuation byte
        }
    }

    return true;
}
// ============================================================================ // ============================================================================
// UTF-16 // UTF-16
// ============================================================================ // ============================================================================

View File

@@ -288,9 +288,8 @@ void ConvAutoTestCase::StreamUTF32BE()
void ConvAutoTestCase::StreamFallback() void ConvAutoTestCase::StreamFallback()
{ {
// this only works if there are at least 3 bytes after the first non-ASCII character TestTextStream("\x61\xbf\x0A\xe0",
TestTextStream("\x61\xbf\x0A\xe0\x7a", 4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"),
5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"),
wxFONTENCODING_ISO8859_5); wxFONTENCODING_ISO8859_5);
} }