Make wxTextStream classes work with surrogates under MSW
On the platforms using UTF-16 for wchar_t we can neither read nor write Unicode data one wchar_t at a time, as a single half of a surrogate pair can't be converted to or from the encoding of the stream. To fix this, we may need to store the last wchar_t already read from the stream but not returned yet in wxTextInputStream::NextChar() and store, without writing it, the wchar_t passed to wxTextOutputStream::PutChar() until the second half of the surrogate is written. See #17070.
This commit is contained in:
@@ -87,7 +87,16 @@ protected:
|
|||||||
|
|
||||||
#if wxUSE_UNICODE
|
#if wxUSE_UNICODE
|
||||||
wxMBConv *m_conv;
|
wxMBConv *m_conv;
|
||||||
#endif
|
|
||||||
|
// The second half of a surrogate character when using UTF-16 for wchar_t:
|
||||||
|
// we can't return it immediately from GetChar() when we read a Unicode
|
||||||
|
// code point outside of the BMP, but we can't keep it in m_lastBytes
|
||||||
|
// either because it can't be separately decoded, so we have a separate 1
|
||||||
|
// wchar_t buffer just for this case.
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
wchar_t m_lastWChar;
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
|
#endif // wxUSE_UNICODE
|
||||||
|
|
||||||
bool EatEOL(const wxChar &c);
|
bool EatEOL(const wxChar &c);
|
||||||
void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues
|
void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues
|
||||||
@@ -165,7 +174,13 @@ protected:
|
|||||||
|
|
||||||
#if wxUSE_UNICODE
|
#if wxUSE_UNICODE
|
||||||
wxMBConv *m_conv;
|
wxMBConv *m_conv;
|
||||||
#endif
|
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
// The first half of a surrogate character if one was passed to PutChar()
|
||||||
|
// and couldn't be output when it was called the last time.
|
||||||
|
wchar_t m_lastWChar;
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
|
#endif // wxUSE_UNICODE
|
||||||
|
|
||||||
wxDECLARE_NO_COPY_CLASS(wxTextOutputStream);
|
wxDECLARE_NO_COPY_CLASS(wxTextOutputStream);
|
||||||
};
|
};
|
||||||
|
@@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
|
|||||||
: m_input(s), m_separators(sep), m_conv(conv.Clone())
|
: m_input(s), m_separators(sep), m_conv(conv.Clone())
|
||||||
{
|
{
|
||||||
memset((void*)m_lastBytes, 0, 10);
|
memset((void*)m_lastBytes, 0, 10);
|
||||||
|
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
m_lastWChar = 0;
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
|
wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
|
||||||
@@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast()
|
|||||||
wxChar wxTextInputStream::NextChar()
|
wxChar wxTextInputStream::NextChar()
|
||||||
{
|
{
|
||||||
#if wxUSE_UNICODE
|
#if wxUSE_UNICODE
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
// Return the already read character remaining from the last call to this
|
||||||
|
// function, if any.
|
||||||
|
if ( m_lastWChar )
|
||||||
|
{
|
||||||
|
const wxChar wc = m_lastWChar;
|
||||||
|
m_lastWChar = 0;
|
||||||
|
return wc;
|
||||||
|
}
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
|
|
||||||
wxChar wbuf[2];
|
wxChar wbuf[2];
|
||||||
memset((void*)m_lastBytes, 0, 10);
|
memset((void*)m_lastBytes, 0, 10);
|
||||||
for(size_t inlen = 0; inlen < 9; inlen++)
|
for(size_t inlen = 0; inlen < 9; inlen++)
|
||||||
@@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar()
|
|||||||
// if we couldn't decode a single character during the last
|
// if we couldn't decode a single character during the last
|
||||||
// loop iteration we shouldn't be able to decode 2 or more of
|
// loop iteration we shouldn't be able to decode 2 or more of
|
||||||
// them with an extra single byte, something fishy is going on
|
// them with an extra single byte, something fishy is going on
|
||||||
|
// (except if we use UTF-16, see below)
|
||||||
wxFAIL_MSG("unexpected decoding result");
|
wxFAIL_MSG("unexpected decoding result");
|
||||||
wxFALLTHROUGH;// fall through nevertheless and return at least something
|
return wxEOT;
|
||||||
|
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
case 2:
|
||||||
|
// When wchar_t uses UTF-16, we could have decoded a single
|
||||||
|
// Unicode code point as 2 wchar_t characters and there is
|
||||||
|
// nothing else to do here but to return the first one now and
|
||||||
|
// remember the second one for the next call, as there is no
|
||||||
|
// way to fit both of them into a single wxChar in this case.
|
||||||
|
m_lastWChar = wbuf[1];
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
|
wxFALLTHROUGH;
|
||||||
|
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
// we finally decoded a character
|
// we finally decoded a character
|
||||||
return wbuf[0];
|
return wbuf[0];
|
||||||
}
|
}
|
||||||
@@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode)
|
|||||||
m_mode = wxEOL_UNIX;
|
m_mode = wxEOL_UNIX;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2
|
||||||
|
m_lastWChar = 0;
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2
|
||||||
}
|
}
|
||||||
|
|
||||||
wxTextOutputStream::~wxTextOutputStream()
|
wxTextOutputStream::~wxTextOutputStream()
|
||||||
@@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string)
|
|||||||
wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c)
|
wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c)
|
||||||
{
|
{
|
||||||
#if wxUSE_UNICODE
|
#if wxUSE_UNICODE
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
wxCharBuffer buffer;
|
||||||
|
size_t len;
|
||||||
|
if ( m_lastWChar )
|
||||||
|
{
|
||||||
|
wxChar buf[2];
|
||||||
|
buf[0] = m_lastWChar;
|
||||||
|
buf[1] = c;
|
||||||
|
buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len);
|
||||||
|
m_lastWChar = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
buffer = m_conv->cWC2MB(&c, 1, &len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !len )
|
||||||
|
{
|
||||||
|
// Conversion failed, possibly because we have the first half of a
|
||||||
|
// surrogate character, so just store it and write it out when the
|
||||||
|
// second half is written to the stream too later.
|
||||||
|
//
|
||||||
|
// Notice that if we already had had a valid m_lastWChar, it is simply
|
||||||
|
// discarded here which is very bad, but there is no way to signal an
|
||||||
|
// error from here and this is not worse than the old code behaviour.
|
||||||
|
m_lastWChar = c;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for ( size_t n = 0; n < len; n++ )
|
||||||
|
{
|
||||||
|
const char c = buffer[n];
|
||||||
|
if ( c == '\n' )
|
||||||
|
{
|
||||||
|
switch ( m_mode )
|
||||||
|
{
|
||||||
|
case wxEOL_DOS:
|
||||||
|
m_output.Write("\r\n", 2);
|
||||||
|
continue;
|
||||||
|
|
||||||
|
case wxEOL_MAC:
|
||||||
|
m_output.Write("\r", 1);
|
||||||
|
continue;
|
||||||
|
|
||||||
|
default:
|
||||||
|
wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") );
|
||||||
|
wxFALLTHROUGH;
|
||||||
|
|
||||||
|
case wxEOL_UNIX:
|
||||||
|
// don't treat '\n' specially
|
||||||
|
;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
m_output.Write(&c, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else // SIZEOF_WCHAR_T == 4
|
||||||
WriteString( wxString(&c, *m_conv, 1) );
|
WriteString( wxString(&c, *m_conv, 1) );
|
||||||
|
#endif // SIZEOF_WCHAR_T == 2 or 4
|
||||||
#else
|
#else
|
||||||
WriteString( wxString(&c, wxConvLocal, 1) );
|
WriteString( wxString(&c, wxConvLocal, 1) );
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user