Fix wxTextInputStream for input starting with BOM-like bytes
Contrary to what a comment in wxTextInputStream::GetChar() said, it is actually possible to get more than one wide character from a call to wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed to decode anything at all. This happens with wxConvAuto because it keeps returning an error while it doesn't have enough data to determine if the input contains a BOM or not, but then returns all the characters examined so far at once if it turns out that there was no BOM, after all. The simplest case in which this created problems was just input starting with a NUL byte as it as this could be a start of UTF-32BE BOM. The fix consists in keeping all the bytes read but not yet decoded in the m_lastBytes buffer and retrying to decode them during the next GetChar() call. This implies keeping track of how much valid data is there in m_lastBytes exactly, as we can't discard the already decoded data immediately, but need to keep it in the buffer too, in order to allow implementing UngetLast(). Incidentally, UngetLast() was totally broken for UTF-16/32 input (containing NUL bytes in the middle of the characters) before and this change fixes this as a side effect. Also add test cases for previously failing inputs.
This commit is contained in:
@@ -35,7 +35,8 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
|
||||
const wxMBConv& conv)
|
||||
: m_input(s), m_separators(sep), m_conv(conv.Clone())
|
||||
{
|
||||
memset((void*)m_lastBytes, 0, 10);
|
||||
m_validBegin =
|
||||
m_validEnd = 0;
|
||||
|
||||
#if SIZEOF_WCHAR_T == 2
|
||||
m_lastWChar = 0;
|
||||
@@ -45,7 +46,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
|
||||
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
|
||||
: m_input(s), m_separators(sep)
|
||||
{
|
||||
memset((void*)m_lastBytes, 0, 10);
|
||||
m_validBegin =
|
||||
m_validEnd = 0;
|
||||
|
||||
m_lastBytes[0] = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -58,11 +62,13 @@ wxTextInputStream::~wxTextInputStream()
|
||||
|
||||
void wxTextInputStream::UngetLast()
|
||||
{
|
||||
size_t byteCount = 0;
|
||||
while(m_lastBytes[byteCount]) // pseudo ANSI strlen (even for Unicode!)
|
||||
byteCount++;
|
||||
m_input.Ungetch(m_lastBytes, byteCount);
|
||||
memset((void*)m_lastBytes, 0, 10);
|
||||
if ( m_validEnd )
|
||||
{
|
||||
m_input.Ungetch(m_lastBytes, m_validEnd);
|
||||
|
||||
m_validBegin =
|
||||
m_validEnd = 0;
|
||||
}
|
||||
}
|
||||
|
||||
wxChar wxTextInputStream::GetChar()
|
||||
@@ -77,17 +83,37 @@ wxChar wxTextInputStream::GetChar()
|
||||
m_lastWChar = 0;
|
||||
return wc;
|
||||
}
|
||||
#endif // !SWIG_ONLY_SCRIPT_API
|
||||
#endif // SIZEOF_WCHAR_T
|
||||
|
||||
wxChar wbuf[2];
|
||||
memset((void*)m_lastBytes, 0, 10);
|
||||
for(size_t inlen = 0; inlen < 9; inlen++)
|
||||
// If we have any non-decoded bytes left from the last call, shift them to
|
||||
// be at the beginning of the buffer.
|
||||
if ( m_validBegin < m_validEnd )
|
||||
{
|
||||
// actually read the next character
|
||||
m_lastBytes[inlen] = m_input.GetC();
|
||||
m_validEnd -= m_validBegin;
|
||||
memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd);
|
||||
}
|
||||
else // All bytes were already decoded and consumed.
|
||||
{
|
||||
m_validEnd = 0;
|
||||
}
|
||||
|
||||
if(m_input.LastRead() <= 0)
|
||||
return 0;
|
||||
// We may need to decode up to 4 characters if we have input starting with
|
||||
// 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
|
||||
// only succeed when 4 bytes are read -- and will yield 4 wide characters.
|
||||
wxChar wbuf[4];
|
||||
for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
|
||||
{
|
||||
if ( inlen >= m_validEnd )
|
||||
{
|
||||
// actually read the next character
|
||||
m_lastBytes[inlen] = m_input.GetC();
|
||||
|
||||
if(m_input.LastRead() <= 0)
|
||||
return 0;
|
||||
|
||||
m_validEnd++;
|
||||
}
|
||||
//else: Retry decoding what we already have in the buffer.
|
||||
|
||||
switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) )
|
||||
{
|
||||
@@ -103,12 +129,17 @@ wxChar wxTextInputStream::GetChar()
|
||||
break;
|
||||
|
||||
default:
|
||||
// if we couldn't decode a single character during the last
|
||||
// loop iteration we shouldn't be able to decode 2 or more of
|
||||
// them with an extra single byte, something fishy is going on
|
||||
// (except if we use UTF-16, see below)
|
||||
wxFAIL_MSG("unexpected decoding result");
|
||||
return 0;
|
||||
// If we couldn't decode a single character during the last
|
||||
// loop iteration, but decoded more than one of them with just
|
||||
// one extra byte, the only explanation is that we were using a
|
||||
// wxConvAuto conversion recognizing the initial BOM and that
|
||||
// it couldn't detect the presence or absence of BOM so far,
|
||||
// but now finally has enough data to see that there is none.
|
||||
// As we must have fallen back to Latin-1 in this case, return
|
||||
// just the first byte and keep the other ones for the next
|
||||
// time.
|
||||
m_validBegin = 1;
|
||||
return wbuf[0];
|
||||
|
||||
#if SIZEOF_WCHAR_T == 2
|
||||
case 2:
|
||||
@@ -118,25 +149,37 @@ wxChar wxTextInputStream::GetChar()
|
||||
// remember the second one for the next call, as there is no
|
||||
// way to fit both of them into a single wxChar in this case.
|
||||
m_lastWChar = wbuf[1];
|
||||
#endif // !SWIG_ONLY_SCRIPT_API
|
||||
#endif // SIZEOF_WCHAR_T == 2
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case 1:
|
||||
m_validBegin = inlen + 1;
|
||||
|
||||
// we finally decoded a character
|
||||
return wbuf[0];
|
||||
}
|
||||
}
|
||||
|
||||
// there should be no encoding which requires more than nine bytes for one
|
||||
// character so something must be wrong with our conversion but we have no
|
||||
// way to signal it from here
|
||||
// There should be no encoding which requires more than 10 bytes to decode
|
||||
// at least one character (the most actually seems to be 7: 3 for the
|
||||
// initial BOM, which is ignored, and 4 for the longest possible encoding
|
||||
// of a Unicode character in UTF-8), so something must be wrong with our
|
||||
// conversion but we have no way to signal it from here and just return 0
|
||||
// as if we reached the end of the stream.
|
||||
m_validBegin = 0;
|
||||
m_validEnd = sizeof(m_lastBytes);
|
||||
|
||||
return 0;
|
||||
#else
|
||||
m_lastBytes[0] = m_input.GetC();
|
||||
|
||||
if(m_input.LastRead() <= 0)
|
||||
{
|
||||
m_validEnd = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
m_validEnd = 1;
|
||||
|
||||
return m_lastBytes[0];
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user