Fix wxTextInputStream for input starting with BOM-like bytes

Contrary to what a comment in wxTextInputStream::GetChar() said, it is
actually possible to get more than one wide character from a call to
wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed
to decode anything at all. This happens with wxConvAuto because it keeps
returning an error while it doesn't have enough data to determine if the
input contains a BOM or not, but then returns all the characters
examined so far at once if it turns out that there was no BOM, after
all.

The simplest case in which this created problems was just input starting
with a NUL byte as it as this could be a start of UTF-32BE BOM.

The fix consists in keeping all the bytes read but not yet decoded in
the m_lastBytes buffer and retrying to decode them during the next
GetChar() call. This implies keeping track of how much valid data is
there in m_lastBytes exactly, as we can't discard the already decoded
data immediately, but need to keep it in the buffer too, in order to
allow implementing UngetLast(). Incidentally, UngetLast() was totally
broken for UTF-16/32 input (containing NUL bytes in the middle of the
characters) before and this change fixes this as a side effect.

Also add test cases for previously failing inputs.
This commit is contained in:
Vadim Zeitlin
2017-11-09 02:02:54 +01:00
parent 46ea3cb8c0
commit 4502e7563b
3 changed files with 120 additions and 26 deletions

View File

@@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
const wxMBConv& conv)
: m_input(s), m_separators(sep), m_conv(conv.Clone())
{
memset((void*)m_lastBytes, 0, 10);
m_validBegin =
m_validEnd = 0;
#if SIZEOF_WCHAR_T == 2
m_lastWChar = 0;
@@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
: m_input(s), m_separators(sep)
{
memset((void*)m_lastBytes, 0, 10);
m_validBegin =
m_validEnd = 0;
m_lastBytes[0] = 0;
}
#endif
@@ -58,11 +62,13 @@ wxTextInputStream::~wxTextInputStream()
void wxTextInputStream::UngetLast()
{
size_t byteCount = 0;
while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!)
byteCount++;
m_input.Ungetch(m_lastBytes, byteCount);
memset((void*)m_lastBytes, 0, 10);
if ( m_validEnd )
{
m_input.Ungetch(m_lastBytes, m_validEnd);
m_validBegin =
m_validEnd = 0;
}
}
wxChar wxTextInputStream::GetChar()
@@ -77,17 +83,37 @@ wxChar wxTextInputStream::GetChar()
m_lastWChar = 0;
return wc;
}
#endif // !SWIG_ONLY_SCRIPT_API
#endif // SIZEOF_WCHAR_T
wxChar wbuf[2];
memset((void*)m_lastBytes, 0, 10);
for(size_t inlen = 0; inlen < 9; inlen++)
// If we have any non-decoded bytes left from the last call, shift them to
// be at the beginning of the buffer.
if ( m_validBegin < m_validEnd )
{
// actually read the next character
m_lastBytes[inlen] = m_input.GetC();
m_validEnd -= m_validBegin;
memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd);
}
else // All bytes were already decoded and consumed.
{
m_validEnd = 0;
}
if(m_input.LastRead() <= 0)
return 0;
// We may need to decode up to 4 characters if we have input starting with
// 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
// only succeed when 4 bytes are read -- and will yield 4 wide characters.
wxChar wbuf[4];
for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
{
if ( inlen >= m_validEnd )
{
// actually read the next character
m_lastBytes[inlen] = m_input.GetC();
if(m_input.LastRead() <= 0)
return 0;
m_validEnd++;
}
//else: Retry decoding what we already have in the buffer.
switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) )
{
@@ -103,12 +129,17 @@ wxChar wxTextInputStream::GetChar()
break;
default:
// if we couldn't decode a single character during the last
// loop iteration we shouldn't be able to decode 2 or more of
// them with an extra single byte, something fishy is going on
// (except if we use UTF-16, see below)
wxFAIL_MSG("unexpected decoding result");
return 0;
// If we couldn't decode a single character during the last
// loop iteration, but decoded more than one of them with just
// one extra byte, the only explanation is that we were using a
// wxConvAuto conversion recognizing the initial BOM and that
// it couldn't detect the presence or absence of BOM so far,
// but now finally has enough data to see that there is none.
// As we must have fallen back to Latin-1 in this case, return
// just the first byte and keep the other ones for the next
// time.
m_validBegin = 1;
return wbuf[0];
#if SIZEOF_WCHAR_T == 2
case 2:
@@ -118,25 +149,37 @@ wxChar wxTextInputStream::GetChar()
// remember the second one for the next call, as there is no
// way to fit both of them into a single wxChar in this case.
m_lastWChar = wbuf[1];
#endif // !SWIG_ONLY_SCRIPT_API
#endif // SIZEOF_WCHAR_T == 2
wxFALLTHROUGH;
case 1:
m_validBegin = inlen + 1;
// we finally decoded a character
return wbuf[0];
}
}
// there should be no encoding which requires more than nine bytes for one
// character so something must be wrong with our conversion but we have no
// way to signal it from here
// There should be no encoding which requires more than 10 bytes to decode
// at least one character (the most actually seems to be 7: 3 for the
// initial BOM, which is ignored, and 4 for the longest possible encoding
// of a Unicode character in UTF-8), so something must be wrong with our
// conversion but we have no way to signal it from here and just return 0
// as if we reached the end of the stream.
m_validBegin = 0;
m_validEnd = sizeof(m_lastBytes);
return 0;
#else
m_lastBytes[0] = m_input.GetC();
if(m_input.LastRead() <= 0)
{
m_validEnd = 0;
return 0;
}
m_validEnd = 1;
return m_lastBytes[0];
#endif