Fix wxTextInputStream for some inputs starting with nulls

2020-10-03 18:21:18 +03:00
parent b3eff48e28
commit 45adce8561
3 changed files with 46 additions and 5 deletions
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
        // to the fall-back conversion in this case as it would prevent us from
        // decoding UTF-8 input when fed it byte by byte, as done by
        // wxTextInputStream, for example
-        if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
+        if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
            return wxCONV_FAILED;

        // if the conversion failed but we didn't really detect anything and
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar()
        m_validEnd = 0;
    }

-    // We may need to decode up to 4 characters if we have input starting with
-    // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
-    // only succeed when 4 bytes are read -- and will yield 4 wide characters.
-    wxChar wbuf[4];
+    // We may need to decode up to 6 characters if we have input starting with
+    // 2 null bytes (as in the UTF-32BE BOM), and then 3 bytes that look like
+    // the start of a UTF-8 sequence, as decoding it will only succeed when
+    // 6 bytes are read -- and will yield 6 wide characters.
+    wxChar wbuf[6];
    for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
    {
        if ( inlen >= m_validEnd )
--- a/tests/streams/textstreamtest.cpp
+++ b/tests/streams/textstreamtest.cpp
@@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]")
        REQUIRE( tis.GetChar() == 0x00 );
        CHECK( tis.GetInputStream().Eof() );
    }
+
+    // Two null bytes that look like the start of a UTF-32BE BOM,
+    // followed by a 4-byte UTF-8 sequence.
+    // Needs wxConvAuto to not switch to the fallback on <6 bytes.
+    SECTION("UTF8-with-nulls")
+    {
+        const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        wxCharTypeBuffer<wxChar> e = wxString::FromUTF8((char*)buf, sizeof(buf))
+                                     .tchar_str<wxChar>();
+        for ( size_t i = 0; i < e.length(); ++i )
+        {
+            INFO("i = " << i);
+            REQUIRE( tis.GetChar() == e[i] );
+        }
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
+
+    // Two null bytes that look like the start of a UTF-32BE BOM,
+    // then 3 bytes that look like the start of a UTF-8 sequence.
+    // Needs a 6-character output buffer in GetChar().
+    SECTION("almost-UTF8-with-nulls")
+    {
+        const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        wxCharTypeBuffer<wxChar> e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1),
+                                              sizeof(buf)).tchar_str<wxChar>();
+        for ( size_t i = 0; i < e.length(); ++i )
+        {
+            INFO("i = " << i);
+            REQUIRE( tis.GetChar() == e[i] );
+        }
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
 }

 #endif // wxUSE_UNICODE