From 45adce85618f05cdffc08ef2df76547701cc17b2 Mon Sep 17 00:00:00 2001
From: Pavel Tyunin <pavel51tunin@gmail.com>
Date: Sat, 3 Oct 2020 18:21:18 +0300
Subject: [PATCH] Fix wxTextInputStream for some inputs starting with nulls

---
 src/common/convauto.cpp          |  2 +-
 src/common/txtstrm.cpp           |  9 +++----
 tests/streams/textstreamtest.cpp | 40 ++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 5 deletions(-)
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp
index 7b92d396f6..d5d6079b32 100644
--- a/src/common/convauto.cpp
+++ b/src/common/convauto.cpp
@@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
         // to the fall-back conversion in this case as it would prevent us from
         // decoding UTF-8 input when fed it byte by byte, as done by
         // wxTextInputStream, for example
-        if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
+        if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) )
             return wxCONV_FAILED;
 
         // if the conversion failed but we didn't really detect anything and
diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp
index fc5e352b58..1332bb3cf0 100644
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar()
         m_validEnd = 0;
     }
 
-    // We may need to decode up to 4 characters if we have input starting with
-    // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
-    // only succeed when 4 bytes are read -- and will yield 4 wide characters.
-    wxChar wbuf[4];
+    // We may need to decode up to 6 characters if we have input starting with
+    // 2 null bytes (as in the UTF-32BE BOM), and then 3 bytes that look like
+    // the start of a UTF-8 sequence, as decoding it will only succeed when
+    // 6 bytes are read -- and will yield 6 wide characters.
+    wxChar wbuf[6];
     for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
     {
         if ( inlen >= m_validEnd )
diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp
index edb6eaa8a2..c6497b23f2 100644
--- a/tests/streams/textstreamtest.cpp
+++ b/tests/streams/textstreamtest.cpp
@@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]")
         REQUIRE( tis.GetChar() == 0x00 );
         CHECK( tis.GetInputStream().Eof() );
     }
+
+    // Two null bytes that look like the start of a UTF-32BE BOM,
+    // followed by a 4-byte UTF-8 sequence.
+    // Needs wxConvAuto not to switch to the fallback on fewer than 6 bytes.
+    SECTION("UTF8-with-nulls")
+    {
+        const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        wxCharTypeBuffer<wxChar> e = wxString::FromUTF8((char*)buf, sizeof(buf))
+                                     .tchar_str<wxChar>();
+        for ( size_t i = 0; i < e.length(); ++i )
+        {
+            INFO("i = " << i);
+            REQUIRE( tis.GetChar() == e[i] );
+        }
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
+
+    // Two null bytes that look like the start of a UTF-32BE BOM,
+    // then 3 bytes that look like the start of a UTF-8 sequence.
+    // Needs a 6-character output buffer in GetChar().
+    SECTION("almost-UTF8-with-nulls")
+    {
+        const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 };
+        wxMemoryInputStream mis(buf, sizeof(buf));
+        wxTextInputStream tis(mis);
+
+        wxCharTypeBuffer<wxChar> e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1),
+                                              sizeof(buf)).tchar_str<wxChar>();
+        for ( size_t i = 0; i < e.length(); ++i )
+        {
+            INFO("i = " << i);
+            REQUIRE( tis.GetChar() == e[i] );
+        }
+        REQUIRE( tis.GetChar() == 0x00 );
+        CHECK( tis.GetInputStream().Eof() );
+    }
 }
 
 #endif // wxUSE_UNICODE