diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 5218f44520..0bcf10e2cd 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -84,7 +84,22 @@ public: protected: wxInputStream &m_input; wxString m_separators; - char m_lastBytes[10]; // stores the bytes that were read for the last character + + // Data possibly (see m_validXXX) read from the stream but not decoded yet. + // This is necessary because GetChar() may only return a single character + // but we may get more than one character when decoding raw input bytes. + char m_lastBytes[10]; + + // The bytes [0, m_validEnd) of m_lastBytes contain the bytes read by the + // last GetChar() call (this interval may be empty if GetChar() hasn't been + // called yet). The bytes [0, m_validBegin) have been already decoded and + // returned to caller or stored in m_lastWChar in the particularly + // egregious case of decoding a non-BMP character when using UTF-16 for + // wchar_t. Finally, the bytes [m_validBegin, m_validEnd) remain to be + // decoded and returned during the next call (again, this interval can, and + // usually will, be empty too if m_validBegin == m_validEnd). + size_t m_validBegin, + m_validEnd; #if wxUSE_UNICODE wxMBConv *m_conv; diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 86c19583b0..d174e4e13f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxMBConv& conv) : m_input(s), m_separators(sep), m_conv(conv.Clone()) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; #if SIZEOF_WCHAR_T == 2 m_lastWChar = 0; @@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) : m_input(s), m_separators(sep) { - memset((void*)m_lastBytes, 0, 10); + m_validBegin = + m_validEnd = 0; + + m_lastBytes[0] = 0; } #endif @@ -58,11 +62,13 @@ wxTextInputStream::~wxTextInputStream() void wxTextInputStream::UngetLast() { - size_t byteCount = 0; - while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!) - byteCount++; - m_input.Ungetch(m_lastBytes, byteCount); - memset((void*)m_lastBytes, 0, 10); + if ( m_validEnd ) + { + m_input.Ungetch(m_lastBytes, m_validEnd); + + m_validBegin = + m_validEnd = 0; + } } wxChar wxTextInputStream::GetChar() @@ -77,17 +83,37 @@ wxChar wxTextInputStream::GetChar() m_lastWChar = 0; return wc; } -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T - wxChar wbuf[2]; - memset((void*)m_lastBytes, 0, 10); - for(size_t inlen = 0; inlen < 9; inlen++) + // If we have any non-decoded bytes left from the last call, shift them to + // be at the beginning of the buffer. + if ( m_validBegin < m_validEnd ) { - // actually read the next character - m_lastBytes[inlen] = m_input.GetC(); + m_validEnd -= m_validBegin; + memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd); + } + else // All bytes were already decoded and consumed. + { + m_validEnd = 0; + } - if(m_input.LastRead() <= 0) - return 0; + // We may need to decode up to 4 characters if we have input starting with + // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will + // only succeed when 4 bytes are read -- and will yield 4 wide characters. + wxChar wbuf[4]; + for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) + { + if ( inlen >= m_validEnd ) + { + // actually read the next character + m_lastBytes[inlen] = m_input.GetC(); + + if(m_input.LastRead() <= 0) + return 0; + + m_validEnd++; + } + //else: Retry decoding what we already have in the buffer. switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) ) { @@ -103,12 +129,17 @@ wxChar wxTextInputStream::GetChar() break; default: - // if we couldn't decode a single character during the last - // loop iteration we shouldn't be able to decode 2 or more of - // them with an extra single byte, something fishy is going on - // (except if we use UTF-16, see below) - wxFAIL_MSG("unexpected decoding result"); - return 0; + // If we couldn't decode a single character during the last + // loop iteration, but decoded more than one of them with just + // one extra byte, the only explanation is that we were using a + // wxConvAuto conversion recognizing the initial BOM and that + // it couldn't detect the presence or absence of BOM so far, + // but now finally has enough data to see that there is none. + // As we must have fallen back to Latin-1 in this case, return + // just the first byte and keep the other ones for the next + // time. + m_validBegin = 1; + return wbuf[0]; #if SIZEOF_WCHAR_T == 2 case 2: @@ -118,25 +149,37 @@ wxChar wxTextInputStream::GetChar() // remember the second one for the next call, as there is no // way to fit both of them into a single wxChar in this case. m_lastWChar = wbuf[1]; -#endif // !SWIG_ONLY_SCRIPT_API +#endif // SIZEOF_WCHAR_T == 2 wxFALLTHROUGH; case 1: + m_validBegin = inlen + 1; // we finally decoded a character return wbuf[0]; } } - // there should be no encoding which requires more than nine bytes for one - // character so something must be wrong with our conversion but we have no - // way to signal it from here + // There should be no encoding which requires more than 10 bytes to decode + // at least one character (the most actually seems to be 7: 3 for the + // initial BOM, which is ignored, and 4 for the longest possible encoding + // of a Unicode character in UTF-8), so something must be wrong with our + // conversion but we have no way to signal it from here and just return 0 + // as if we reached the end of the stream. + m_validBegin = 0; + m_validEnd = sizeof(m_lastBytes); + return 0; #else m_lastBytes[0] = m_input.GetC(); if(m_input.LastRead() <= 0) + { + m_validEnd = 0; return 0; + } + + m_validEnd = 1; return m_lastBytes[0]; #endif diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index 0975e7526e..edb6eaa8a2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -290,4 +290,40 @@ void TextStreamTestCase::TestInput(const wxMBConv& conv, CPPUNIT_ASSERT_EQUAL( 0, memcmp(txtWchar, temp.wc_str(), sizeof(txtWchar)) ); } +TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") +{ + // This is the simplest possible test that used to trigger assertion in + // wxTextInputStream::GetChar(). + SECTION("starts-with-nul") + { + const wxUint8 buf[] = { 0x00, 0x01, }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // This exercises a problematic path in GetChar() as the first 3 bytes of + // this stream look like the start of UTF-32BE BOM, but this is not + // actually a BOM because the 4th byte is 0xFE and not 0xFF, so the stream + // should decode the buffer as Latin-1 once it gets there. + SECTION("almost-UTF-32-BOM") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xFE, 0xFE, 0x01 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0x00 ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0xFE ); + REQUIRE( tis.GetChar() == 0x01 ); + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } +} + #endif // wxUSE_UNICODE