From 45adce85618f05cdffc08ef2df76547701cc17b2 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Sat, 3 Oct 2020 18:21:18 +0300 Subject: [PATCH] Fix wxTextInputStream for some inputs starting with nulls --- src/common/convauto.cpp | 2 +- src/common/txtstrm.cpp | 9 +++---- tests/streams/textstreamtest.cpp | 40 ++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 7b92d396f6..d5d6079b32 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example - if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) + if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index fc5e352b58..1332bb3cf0 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar() m_validEnd = 0; } - // We may need to decode up to 4 characters if we have input starting with - // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will - // only succeed when 4 bytes are read -- and will yield 4 wide characters. - wxChar wbuf[4]; + // We may need to decode up to 6 characters if we have input starting with + // 2 null bytes (like in UTF-32BE BOM), and then 3 bytes that look like + // the start of UTF-8 sequence, as decoding it will only succeed when + // 6 bytes are read -- and will yield 6 wide characters. + wxChar wbuf[6]; for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) { if ( inlen >= m_validEnd ) diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index edb6eaa8a2..c6497b23f2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") REQUIRE( tis.GetChar() == 0x00 ); CHECK( tis.GetInputStream().Eof() ); } + + // Two null bytes that look like the start of UTF-32BE BOM, + // followed by 4 byte UTF-8 sequence. + // Needs wxConvAuto to not switch to fallback on <6 bytes. + SECTION("UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString::FromUTF8((char*)buf, sizeof(buf)) + .tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // Two null bytes that look like the start of UTF-32BE BOM, + // then 3 bytes that look like the start of UTF-8 sequence. + // Needs 6 character output buffer in GetChar(). + SECTION("almost-UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1), + sizeof(buf)).tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } } #endif // wxUSE_UNICODE