Files
wxWidgets/src/common/txtstrm.cpp
Vadim Zeitlin 4502e7563b Fix wxTextInputStream for input starting with BOM-like bytes
Contrary to what a comment in wxTextInputStream::GetChar() said, it is
actually possible to get more than one wide character from a call to
wxMBConv::ToWChar(len+1) even if a previous call to ToWChar(len) failed
to decode anything at all. This happens with wxConvAuto because it keeps
returning an error while it doesn't have enough data to determine if the
input contains a BOM or not, but then returns all the characters
examined so far at once if it turns out that there was no BOM, after
all.

The simplest case in which this created problems was input starting
with a NUL byte, as this could be the start of a UTF-32BE BOM.

The fix consists in keeping all the bytes read but not yet decoded in
the m_lastBytes buffer and retrying to decode them during the next
GetChar() call. This implies keeping track of how much valid data is
there in m_lastBytes exactly, as we can't discard the already decoded
data immediately, but need to keep it in the buffer too, in order to
allow implementing UngetLast(). Incidentally, UngetLast() was totally
broken for UTF-16/32 input (containing NUL bytes in the middle of the
characters) before and this change fixes this as a side effect.

Also add test cases for previously failing inputs.
2017-11-09 23:49:59 +01:00

723 lines
16 KiB
C++

///////////////////////////////////////////////////////////////////////////////
// Name: src/common/txtstrm.cpp
// Purpose: Text stream classes
// Author: Guilhem Lavaux
// Modified by:
// Created: 28/06/98
// Copyright: (c) Guilhem Lavaux
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
// For compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"
#ifdef __BORLANDC__
#pragma hdrstop
#endif
#if wxUSE_STREAMS
#include "wx/txtstrm.h"
#ifndef WX_PRECOMP
#include "wx/crt.h"
#endif
#include <ctype.h>
// ----------------------------------------------------------------------------
// wxTextInputStream
// ----------------------------------------------------------------------------
#if wxUSE_UNICODE
// Unicode build constructor: reads text from the stream s, treating the
// characters of sep as separators and decoding the input with a clone of
// conv stored in m_conv.
wxTextInputStream::wxTextInputStream(wxInputStream &s,
                                     const wxString &sep,
                                     const wxMBConv& conv)
    : m_input(s),
      m_separators(sep),
      m_conv(conv.Clone())
{
    // Nothing has been read yet, so m_lastBytes contains no valid data.
    m_validEnd = 0;
    m_validBegin = 0;

#if SIZEOF_WCHAR_T == 2
    // No second half of a surrogate pair is pending.
    m_lastWChar = 0;
#endif // SIZEOF_WCHAR_T == 2
}
#else
// Non-Unicode build constructor: reads text from the stream s, treating the
// characters of sep as separators.
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
    : m_input(s),
      m_separators(sep)
{
    // Nothing has been read yet: clear the byte buffer and mark it as empty.
    m_lastBytes[0] = 0;
    m_validEnd = 0;
    m_validBegin = 0;
}
#endif
// Destroys the text stream; in Unicode builds this deletes the converter
// object stored in m_conv.
wxTextInputStream::~wxTextInputStream()
{
#if wxUSE_UNICODE
delete m_conv;
#endif // wxUSE_UNICODE
}
// Pushes all the bytes currently kept in m_lastBytes back to the underlying
// stream and marks the buffer as empty. Does nothing if the buffer doesn't
// contain any bytes.
void wxTextInputStream::UngetLast()
{
    if ( !m_validEnd )
        return;

    m_input.Ungetch(m_lastBytes, m_validEnd);

    m_validEnd = 0;
    m_validBegin = 0;
}
// Reads and decodes the next character from the underlying stream.
//
// In Unicode builds the raw bytes are accumulated in m_lastBytes and passed to
// the converter with one more byte at a time until it manages to decode
// something. m_validEnd is the number of bytes currently stored in
// m_lastBytes and m_validBegin is the number of them consumed by the
// character(s) already returned. The consumed bytes stay in the buffer until
// the next call so that UngetLast() can still push them back.
//
// In non-Unicode builds a single byte is read and returned as is.
//
// Returns 0 if no more bytes could be read from the stream or if nothing
// could be decoded even after filling the entire m_lastBytes buffer.
wxChar wxTextInputStream::GetChar()
{
#if wxUSE_UNICODE
#if SIZEOF_WCHAR_T == 2
    // Return the already read character remaining from the last call to this
    // function, if any.
    if ( m_lastWChar )
    {
        const wxChar wc = m_lastWChar;
        m_lastWChar = 0;
        return wc;
    }
#endif // SIZEOF_WCHAR_T == 2

    // If we have any non-decoded bytes left from the last call, shift them to
    // be at the beginning of the buffer.
    if ( m_validBegin < m_validEnd )
    {
        m_validEnd -= m_validBegin;
        memmove(m_lastBytes, m_lastBytes + m_validBegin, m_validEnd);
    }
    else // All bytes were already decoded and consumed.
    {
        m_validEnd = 0;
    }

    // We may need to decode up to 4 characters if we have input starting with
    // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
    // only succeed when 4 bytes are read -- and will yield 4 wide characters.
    wxChar wbuf[4];
    for ( size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++ )
    {
        if ( inlen >= m_validEnd )
        {
            // actually read the next character
            m_lastBytes[inlen] = m_input.GetC();

            if ( m_input.LastRead() <= 0 )
                return 0;

            m_validEnd++;
        }
        //else: Retry decoding what we already have in the buffer.

        switch ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) )
        {
            case 0:
                // this is a bug in converter object as it should either fail
                // or decode non-empty string to something non-empty
                wxFAIL_MSG("ToWChar() can't return 0 for non-empty input");
                break;

            case wxCONV_FAILED:
                // the buffer probably doesn't contain enough bytes to decode
                // as a complete character, try with more bytes
                break;

            default:
                // If we couldn't decode a single character during the last
                // loop iteration, but decoded more than one of them with just
                // one extra byte, the only explanation is that we were using a
                // wxConvAuto conversion recognizing the initial BOM and that
                // it couldn't detect the presence or absence of BOM so far,
                // but now finally has enough data to see that there is none.
                // As we must have fallen back to Latin-1 in this case, return
                // just the first byte and keep the other ones for the next
                // time.
                m_validBegin = 1;
                return wbuf[0];

#if SIZEOF_WCHAR_T == 2
            case 2:
                // When wchar_t uses UTF-16, we could have decoded a single
                // Unicode code point as 2 wchar_t characters and there is
                // nothing else to do here but to return the first one now and
                // remember the second one for the next call, as there is no
                // way to fit both of them into a single wxChar in this case.
                m_lastWChar = wbuf[1];

                // The fall through annotation must stay inside this #if:
                // otherwise it would directly follow the "return" of the
                // default branch above and annotate unreachable code.
                wxFALLTHROUGH;
#endif // SIZEOF_WCHAR_T == 2

            case 1:
                m_validBegin = inlen + 1;

                // we finally decoded a character
                return wbuf[0];
        }
    }

    // There should be no encoding which requires more than 10 bytes to decode
    // at least one character (the most actually seems to be 7: 3 for the
    // initial BOM, which is ignored, and 4 for the longest possible encoding
    // of a Unicode character in UTF-8), so something must be wrong with our
    // conversion but we have no way to signal it from here and just return 0
    // as if we reached the end of the stream.
    m_validBegin = 0;
    m_validEnd = sizeof(m_lastBytes);

    return 0;
#else
    m_lastBytes[0] = m_input.GetC();

    if ( m_input.LastRead() <= 0 )
    {
        m_validEnd = 0;
        return 0;
    }

    // Remember that we have exactly one byte which UngetLast() can put back.
    m_validEnd = 1;

    return m_lastBytes[0];
#endif
}
// Skips new line characters and the characters from m_separators.
//
// Returns the first character which is neither, or 0 as soon as GetChar()
// returns 0.
wxChar wxTextInputStream::NextNonSeparators()
{
    wxChar ch;
    do
    {
        ch = GetChar();
    }
    while ( ch &&
            (ch == wxT('\n') ||
             ch == wxT('\r') ||
             m_separators.Find(ch) >= 0) );

    return ch;
}
bool wxTextInputStream::EatEOL(const wxChar &c)
{
if (c == wxT('\n')) return true; // eat on UNIX
if (c == wxT('\r')) // eat on both Mac and DOS
{
wxChar c2 = GetChar();
if (!c2) return true; // end of stream reached, had enough :-)
if (c2 != wxT('\n')) UngetLast(); // Don't eat on Mac
return true;
}
return false;
}
// Reads the next word and parses it as an unsigned 64 bit integer in the
// given base (0 or 2..36).
//
// Returns 0 if the stream is not ok, no word could be read or the word could
// not be parsed.
wxUint64 wxTextInputStream::Read64(int base)
{
    wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") );

    if ( !m_input )
        return 0;

    const wxString token = ReadWord();

    wxUint64 value = 0;
    const bool parsed = !token.empty() && token.ToULongLong(&value, base);

    return parsed ? value : 0;
}
// Reads the next word and converts it to an unsigned 32 bit integer in the
// given base (0 or 2..36) using wxStrtoul().
//
// Returns 0 if the stream is not ok or no word could be read.
wxUint32 wxTextInputStream::Read32(int base)
{
    wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") );

    if ( !m_input )
        return 0;

    const wxString token = ReadWord();
    if ( !token.empty() )
        return wxStrtoul(token.c_str(), 0, base);

    return 0;
}
// Reads an unsigned integer using Read32() and truncates it to 16 bits.
wxUint16 wxTextInputStream::Read16(int base)
{
    const wxUint32 wide = Read32(base);
    return static_cast<wxUint16>(wide);
}
// Reads an unsigned integer using Read32() and truncates it to 8 bits.
wxUint8 wxTextInputStream::Read8(int base)
{
    const wxUint32 wide = Read32(base);
    return static_cast<wxUint8>(wide);
}
// Reads the next word and parses it as a signed 64 bit integer in the given
// base (0 or 2..36).
//
// Returns 0 if the stream is not ok, no word could be read or the word could
// not be parsed.
wxInt64 wxTextInputStream::Read64S(int base)
{
    wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") );

    if ( !m_input )
        return 0;

    const wxString token = ReadWord();

    wxInt64 value = 0;
    const bool parsed = !token.empty() && token.ToLongLong(&value, base);

    return parsed ? value : 0;
}
// Reads the next word and converts it to a signed 32 bit integer in the
// given base (0 or 2..36) using wxStrtol().
//
// Returns 0 if the stream is not ok or no word could be read.
wxInt32 wxTextInputStream::Read32S(int base)
{
    wxASSERT_MSG( !base || (base > 1 && base <= 36), wxT("invalid base") );

    if ( !m_input )
        return 0;

    const wxString token = ReadWord();
    if ( !token.empty() )
        return wxStrtol(token.c_str(), 0, base);

    return 0;
}
// Reads a signed integer using Read32S() and truncates it to 16 bits.
wxInt16 wxTextInputStream::Read16S(int base)
{
    const wxInt32 wide = Read32S(base);
    return static_cast<wxInt16>(wide);
}
// Reads a signed integer using Read32S() and truncates it to 8 bits.
wxInt8 wxTextInputStream::Read8S(int base)
{
    const wxInt32 wide = Read32S(base);
    return static_cast<wxInt8>(wide);
}
double wxTextInputStream::ReadDouble()
{
if(!m_input) return 0;
wxString word = ReadWord();
if(word.empty())
return 0;
return wxStrtod(word.c_str(), 0);
}
// Reads characters until an end of line sequence, a 0 returned by GetChar()
// or the end of the stream.
//
// Returns the characters read, without the end of line sequence itself.
wxString wxTextInputStream::ReadLine()
{
    wxString result;

    for ( ; !m_input.Eof(); )
    {
        const wxChar ch = GetChar();

        // Stop when nothing more can be read or when the line is finished;
        // EatEOL() also consumes the rest of the end of line sequence.
        if ( !ch || EatEOL(ch) )
            break;

        result += ch;
    }

    return result;
}
// Skips the leading separators and new lines and then reads characters until
// the next separator, end of line, 0 returned by GetChar() or end of stream.
//
// Returns the word read, which is empty if the stream is not ok or if no
// non-separator character could be read.
wxString wxTextInputStream::ReadWord()
{
    wxString result;

    if ( !m_input )
        return result;

    wxChar ch = NextNonSeparators();
    if ( !ch )
        return result;

    result += ch;

    while ( !m_input.Eof() )
    {
        ch = GetChar();

        // The order of the checks matters: EatEOL() may consume more input,
        // so it must only be called for characters which are not separators.
        if ( !ch || m_separators.Find(ch) >= 0 || EatEOL(ch) )
            break;

        result += ch;
    }

    return result;
}
// Extracts the next word, as returned by ReadWord(), into word.
wxTextInputStream& wxTextInputStream::operator>>(wxString& word)
{
    const wxString token = ReadWord();
    word = token;

    return *this;
}
// Extracts a single raw byte from the underlying stream, without decoding it.
//
// c is set to 0 if nothing could be read and to '\n' if the byte starts an
// end of line sequence, which is consumed entirely in this case.
wxTextInputStream& wxTextInputStream::operator>>(char& c)
{
    const char raw = m_input.GetC();
    c = m_input.LastRead() > 0 ? raw : 0;

    if ( EatEOL(c) )
        c = '\n';

    return *this;
}
#if wxUSE_UNICODE && wxWCHAR_T_IS_REAL_TYPE
// Extracts the next decoded character, as returned by GetChar(), into wc.
wxTextInputStream& wxTextInputStream::operator>>(wchar_t& wc)
{
    const wxChar decoded = GetChar();
    wc = decoded;

    return *this;
}
#endif // wxUSE_UNICODE
// Extracts a signed 16 bit integer, as returned by Read16S(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxInt16& i)
{
    const wxInt16 parsed = Read16S();
    i = parsed;

    return *this;
}
// Extracts a signed 32 bit integer, as returned by Read32S(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxInt32& i)
{
    const wxInt32 parsed = Read32S();
    i = parsed;

    return *this;
}
// Extracts a signed 64 bit integer, as returned by Read64S(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxInt64& i)
{
    const wxInt64 parsed = Read64S();
    i = parsed;

    return *this;
}
// Extracts an unsigned 16 bit integer, as returned by Read16(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxUint16& i)
{
    const wxUint16 parsed = Read16();
    i = parsed;

    return *this;
}
// Extracts an unsigned 32 bit integer, as returned by Read32(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxUint32& i)
{
    const wxUint32 parsed = Read32();
    i = parsed;

    return *this;
}
// Extracts an unsigned 64 bit integer, as returned by Read64(), into i.
wxTextInputStream& wxTextInputStream::operator>>(wxUint64& i)
{
    const wxUint64 parsed = Read64();
    i = parsed;

    return *this;
}
// Extracts a floating point number, as returned by ReadDouble(), into i.
wxTextInputStream& wxTextInputStream::operator>>(double& i)
{
    const double parsed = ReadDouble();
    i = parsed;

    return *this;
}
// Extracts a floating point number, as returned by ReadDouble(), into f,
// narrowing it to float.
wxTextInputStream& wxTextInputStream::operator>>(float& f)
{
    const double parsed = ReadDouble();
    f = static_cast<float>(parsed);

    return *this;
}
// Creates a text writer on top of the stream s using the given end of line
// mode. In Unicode builds the text is encoded using a clone of conv stored in
// m_conv.
#if wxUSE_UNICODE
wxTextOutputStream::wxTextOutputStream(wxOutputStream& s,
                                       wxEOL mode,
                                       const wxMBConv& conv)
    : m_output(s),
      m_conv(conv.Clone())
#else
wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode)
    : m_output(s)
#endif
{
    // wxEOL_NATIVE is never stored: it is replaced with the convention of
    // the platform we are being built for.
#if defined(__WINDOWS__)
    const wxEOL nativeMode = wxEOL_DOS;
#else
    const wxEOL nativeMode = wxEOL_UNIX;
#endif

    m_mode = mode == wxEOL_NATIVE ? nativeMode : mode;

#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2
    // No first half of a surrogate pair is pending.
    m_lastWChar = 0;
#endif // SIZEOF_WCHAR_T == 2
}
// Destroys the text stream; in Unicode builds this deletes the converter
// object stored in m_conv.
wxTextOutputStream::~wxTextOutputStream()
{
#if wxUSE_UNICODE
delete m_conv;
#endif // wxUSE_UNICODE
}
// Changes the end of line mode used for the subsequent output.
//
// wxEOL_NATIVE is replaced with wxEOL_DOS when building for Windows and with
// wxEOL_UNIX otherwise.
void wxTextOutputStream::SetMode(wxEOL mode)
{
    if ( mode != wxEOL_NATIVE )
    {
        m_mode = mode;
        return;
    }

#if defined(__WINDOWS__)
    m_mode = wxEOL_DOS;
#else
    m_mode = wxEOL_UNIX;
#endif
}
// Writes the decimal representation of the unsigned 64 bit integer i.
void wxTextOutputStream::Write64(wxUint64 i)
{
    const wxString digits = wxString::Format("%" wxLongLongFmtSpec "u", i);
    WriteString(digits);
}
// Writes the decimal representation of the unsigned 32 bit integer i.
void wxTextOutputStream::Write32(wxUint32 i)
{
    WriteString(wxString::Format(wxT("%u"), i));
}
// Writes the decimal representation of the unsigned 16 bit integer i.
void wxTextOutputStream::Write16(wxUint16 i)
{
    WriteString(wxString::Format(wxT("%u"), static_cast<unsigned>(i)));
}
// Writes the decimal representation of the unsigned 8 bit integer i.
void wxTextOutputStream::Write8(wxUint8 i)
{
    WriteString(wxString::Format(wxT("%u"), static_cast<unsigned>(i)));
}
void wxTextOutputStream::WriteDouble(double d)
{
wxString str;
str.Printf(wxT("%f"), d);
WriteString(str);
}
void wxTextOutputStream::WriteString(const wxString& string)
{
size_t len = string.length();
wxString out;
out.reserve(len);
for ( size_t i = 0; i < len; i++ )
{
const wxChar c = string[i];
if ( c == wxT('\n') )
{
switch ( m_mode )
{
case wxEOL_DOS:
out << wxT("\r\n");
continue;
case wxEOL_MAC:
out << wxT('\r');
continue;
default:
wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") );
wxFALLTHROUGH;
case wxEOL_UNIX:
// don't treat '\n' specially
;
}
}
out << c;
}
#if wxUSE_UNICODE
// FIXME-UTF8: use wxCharBufferWithLength if/when we have it
wxCharBuffer buffer = m_conv->cWC2MB(out.wc_str(), out.length(), &len);
m_output.Write(buffer, len);
#else
m_output.Write(out.c_str(), out.length() );
#endif
}
// Writes a single character to the output stream and returns this stream.
//
// When wchar_t is 2 bytes in a Unicode build, the character is converted
// using m_conv directly and a character which can't be converted on its own
// is remembered in m_lastWChar and converted together with the character
// passed to the next call. In all the other builds the character is simply
// written using WriteString().
wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c)
{
#if wxUSE_UNICODE
#if SIZEOF_WCHAR_T == 2
wxCharBuffer buffer;
size_t len;
if ( m_lastWChar )
{
// A character is pending from the previous call: convert both of them
// together and forget the pending one, whether this succeeds or not.
wxChar buf[2];
buf[0] = m_lastWChar;
buf[1] = c;
buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len);
m_lastWChar = 0;
}
else
{
buffer = m_conv->cWC2MB(&c, 1, &len);
}
if ( !len )
{
// Conversion failed, possibly because we have the first half of a
// surrogate character, so just store it and write it out when the
// second half is written to the stream too later.
//
// Notice that if we already had had a valid m_lastWChar, it is simply
// discarded here which is very bad, but there is no way to signal an
// error from here and this is not worse than the old code behaviour.
m_lastWChar = c;
}
else
{
// Write the converted bytes one by one, replacing each '\n' byte with the
// end of line sequence for the current mode.
// NOTE(review): the comparison is done on the bytes produced by the
// conversion, so a byte equal to '\n' inside a multi-byte character would
// presumably be replaced too -- confirm this can't happen for the
// supported conversions.
for ( size_t n = 0; n < len; n++ )
{
const char c2 = buffer[n];
if ( c2 == '\n' )
{
switch ( m_mode )
{
case wxEOL_DOS:
m_output.Write("\r\n", 2);
continue;
case wxEOL_MAC:
m_output.Write("\r", 1);
continue;
default:
wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") );
wxFALLTHROUGH;
case wxEOL_UNIX:
// don't treat '\n' specially
;
}
}
m_output.Write(&c2, 1);
}
}
#else // SIZEOF_WCHAR_T == 4
WriteString( wxString(&c, *m_conv, 1) );
#endif // SIZEOF_WCHAR_T == 2 or 4
#else
WriteString( wxString(&c, wxConvLocal, 1) );
#endif
return *this;
}
// Writes to the output stream the bytes, other than the trailing NUL, that
// the converter produces when converting an empty string.
// NOTE(review): presumably these bytes are the sequence returning a stateful
// encoding to its initial shift state -- confirm.
//
// Nothing is written if the conversion fails or produces nothing but the
// trailing NUL. This function does nothing in non-Unicode builds.
void wxTextOutputStream::Flush()
{
#if wxUSE_UNICODE
    const size_t len = m_conv->FromWChar(NULL, 0, L"", 1);

    // wxCONV_FAILED is (size_t)-1 and so would pass the "greater than" test
    // below: check for it explicitly to avoid using it as a buffer size.
    if ( len != wxCONV_FAILED && len > m_conv->GetMBNulLen() )
    {
        wxCharBuffer buf(len);
        m_conv->FromWChar(buf.data(), len, L"", 1);

        // Don't write out the trailing NUL itself.
        m_output.Write(buf, len - m_conv->GetMBNulLen());
    }
#endif // wxUSE_UNICODE
}
// Writes the string using WriteString() and returns this stream to allow
// chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(const wxString& string)
{
WriteString( string );
return *this;
}
// Writes a single character, converted to a string with
// wxString::FromAscii(), and returns this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(char c)
{
    const wxString text = wxString::FromAscii(c);
    WriteString( text );

    return *this;
}
#if wxUSE_UNICODE && wxWCHAR_T_IS_REAL_TYPE
// Writes a single wide character using PutChar() and returns this stream to
// allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wchar_t wc)
{
PutChar(wc);
return *this;
}
#endif // wxUSE_UNICODE
// Writes the signed 16 bit integer c by forwarding it to Write() and returns
// this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxInt16 c)
{
Write(c);
return *this;
}
// Writes the signed 32 bit integer c by forwarding it to Write() and returns
// this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxInt32 c)
{
Write(c);
return *this;
}
// Writes the signed 64 bit integer c by forwarding it to Write() and returns
// this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxInt64 c)
{
Write(c);
return *this;
}
// Writes the unsigned 16 bit integer c by forwarding it to Write() and
// returns this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxUint16 c)
{
Write(c);
return *this;
}
// Writes the unsigned 32 bit integer c by forwarding it to Write() and
// returns this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxUint32 c)
{
Write(c);
return *this;
}
// Writes the unsigned 64 bit integer c by forwarding it to Write() and
// returns this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(wxUint64 c)
{
Write(c);
return *this;
}
// Writes the double precision number f by forwarding it to Write() and
// returns this stream to allow chaining the calls.
wxTextOutputStream &wxTextOutputStream::operator<<(double f)
{
Write(f);
return *this;
}
// Writes the single precision number f by forwarding it to Write() and
// returns this stream to allow chaining the calls.
wxTextOutputStream& wxTextOutputStream::operator<<(float f)
{
Write(f);
return *this;
}
// Stream manipulator writing a new line character to the stream using
// PutChar() and returning whatever PutChar() returns.
wxTextOutputStream &endl( wxTextOutputStream &stream )
{
return stream.PutChar(wxT('\n'));
}
#endif
// wxUSE_STREAMS