Make wxTextStream classes work with surrogates under MSW

On the platforms using UTF-16 for wchar_t we can't read nor write Unicode data
one wchar_t at a time as a single half of a surrogate character can't be
converted to or from the encoding of the stream.

To fix this, we may need to store the last wchar_t already read from the
stream but not returned yet in wxTextInputStream::NextChar() and store,
without writing it, the wchar_t passed to wxTextOutputStream::PutChar() until
the second half of the surrogate is written.

See #17070.
This commit is contained in:
Vadim Zeitlin
2015-11-13 18:03:14 +01:00
parent 5cff8c1232
commit 823a2337f6
2 changed files with 109 additions and 3 deletions

View File

@@ -87,7 +87,16 @@ protected:
#if wxUSE_UNICODE #if wxUSE_UNICODE
wxMBConv *m_conv; wxMBConv *m_conv;
#endif
// The second half of a surrogate character when using UTF-16 for wchar_t:
// we can't return it immediately from GetChar() when we read a Unicode
// code point outside of the BMP, but we can't keep it in m_lastBytes
// neither because it can't separately decoded, so we have a separate 1
// wchar_t buffer just for this case.
#if SIZEOF_WCHAR_T == 2
wchar_t m_lastWChar;
#endif // SIZEOF_WCHAR_T == 2
#endif // wxUSE_UNICODE
bool EatEOL(const wxChar &c); bool EatEOL(const wxChar &c);
void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues
@@ -165,7 +174,13 @@ protected:
#if wxUSE_UNICODE #if wxUSE_UNICODE
wxMBConv *m_conv; wxMBConv *m_conv;
#endif
#if SIZEOF_WCHAR_T == 2
// The first half of a surrogate character if one was passed to PutChar()
// and couldn't be output when it was called the last time.
wchar_t m_lastWChar;
#endif // SIZEOF_WCHAR_T == 2
#endif // wxUSE_UNICODE
wxDECLARE_NO_COPY_CLASS(wxTextOutputStream); wxDECLARE_NO_COPY_CLASS(wxTextOutputStream);
}; };

View File

@@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
: m_input(s), m_separators(sep), m_conv(conv.Clone()) : m_input(s), m_separators(sep), m_conv(conv.Clone())
{ {
memset((void*)m_lastBytes, 0, 10); memset((void*)m_lastBytes, 0, 10);
#if SIZEOF_WCHAR_T == 2
m_lastWChar = 0;
#endif // SIZEOF_WCHAR_T == 2
} }
#else #else
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
@@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast()
wxChar wxTextInputStream::NextChar() wxChar wxTextInputStream::NextChar()
{ {
#if wxUSE_UNICODE #if wxUSE_UNICODE
#if SIZEOF_WCHAR_T == 2
// Return the already raed character remaining from the last call to this
// function, if any.
if ( m_lastWChar )
{
const wxChar wc = m_lastWChar;
m_lastWChar = 0;
return wc;
}
#endif // !SWIG_ONLY_SCRIPT_API
wxChar wbuf[2]; wxChar wbuf[2];
memset((void*)m_lastBytes, 0, 10); memset((void*)m_lastBytes, 0, 10);
for(size_t inlen = 0; inlen < 9; inlen++) for(size_t inlen = 0; inlen < 9; inlen++)
@@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar()
// if we couldn't decode a single character during the last // if we couldn't decode a single character during the last
// loop iteration we shouldn't be able to decode 2 or more of // loop iteration we shouldn't be able to decode 2 or more of
// them with an extra single byte, something fishy is going on // them with an extra single byte, something fishy is going on
// (except if we use UTF-16, see below)
wxFAIL_MSG("unexpected decoding result"); wxFAIL_MSG("unexpected decoding result");
wxFALLTHROUGH;// fall through nevertheless and return at least something return wxEOT;
#if SIZEOF_WCHAR_T == 2
case 2:
// When wchar_t uses UTF-16, we could have decoded a single
// Unicode code point as 2 wchar_t characters and there is
// nothing else to do here but to return the first one now and
// remember the second one for the next call, as there is no
// way to fit both of them into a single wxChar in this case.
m_lastWChar = wbuf[1];
#endif // !SWIG_ONLY_SCRIPT_API
wxFALLTHROUGH;
case 1: case 1:
// we finally decoded a character // we finally decoded a character
return wbuf[0]; return wbuf[0];
} }
@@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode)
m_mode = wxEOL_UNIX; m_mode = wxEOL_UNIX;
#endif #endif
} }
#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2
m_lastWChar = 0;
#endif // SIZEOF_WCHAR_T == 2
} }
wxTextOutputStream::~wxTextOutputStream() wxTextOutputStream::~wxTextOutputStream()
@@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string)
wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c) wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c)
{ {
#if wxUSE_UNICODE #if wxUSE_UNICODE
#if SIZEOF_WCHAR_T == 2
wxCharBuffer buffer;
size_t len;
if ( m_lastWChar )
{
wxChar buf[2];
buf[0] = m_lastWChar;
buf[1] = c;
buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len);
m_lastWChar = 0;
}
else
{
buffer = m_conv->cWC2MB(&c, 1, &len);
}
if ( !len )
{
// Conversion failed, possibly because we have the first half of a
// surrogate character, so just store it and write it out when the
// second half is written to the stream too later.
//
// Notice that if we already had had a valid m_lastWChar, it is simply
// discarded here which is very bad, but there is no way to signal an
// error from here and this is not worse than the old code behaviour.
m_lastWChar = c;
}
else
{
for ( size_t n = 0; n < len; n++ )
{
const char c = buffer[n];
if ( c == '\n' )
{
switch ( m_mode )
{
case wxEOL_DOS:
m_output.Write("\r\n", 2);
continue;
case wxEOL_MAC:
m_output.Write("\r", 1);
continue;
default:
wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") );
wxFALLTHROUGH;
case wxEOL_UNIX:
// don't treat '\n' specially
;
}
}
m_output.Write(&c, 1);
}
}
#else // SIZEOF_WCHAR_T == 4
WriteString( wxString(&c, *m_conv, 1) ); WriteString( wxString(&c, *m_conv, 1) );
#endif // SIZEOF_WCHAR_T == 2 or 4
#else #else
WriteString( wxString(&c, wxConvLocal, 1) ); WriteString( wxString(&c, wxConvLocal, 1) );
#endif #endif