From 823a2337f64b0e23f241abe543f073ea0b13f33a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 13 Nov 2015 18:03:14 +0100 Subject: [PATCH] Make wxTextStream classes work with surrogates under MSW On the platforms using UTF-16 for wchar_t we can't read nor write Unicode data one wchar_t at a time as a single half of a surrogate character can't be converted to or from the encoding of the stream. To fix this, we may need to store the last wchar_t already read from the stream but not returned yet in wxTextInputStream::NextChar() and store, without writing it, the wchar_t passed to wxTextOutputStream::PutChar() until the second half of the surrogate is written. See #17070. --- include/wx/txtstrm.h | 19 ++++++++- src/common/txtstrm.cpp | 93 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 465a277164..81b6d751b1 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -87,7 +87,16 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + + // The second half of a surrogate character when using UTF-16 for wchar_t: + // we can't return it immediately from GetChar() when we read a Unicode + // code point outside of the BMP, but we can't keep it in m_lastBytes + // either because it can't be separately decoded, so we have a separate 1 + // wchar_t buffer just for this case. +#if SIZEOF_WCHAR_T == 2 + wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE bool EatEOL(const wxChar &c); void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues @@ -165,7 +174,13 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + +#if SIZEOF_WCHAR_T == 2 + // The first half of a surrogate character if one was passed to PutChar() + // and couldn't be output when it was called the last time. 
+ wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE wxDECLARE_NO_COPY_CLASS(wxTextOutputStream); }; diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 97fd0c9072..3b2c44949f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, : m_input(s), m_separators(sep), m_conv(conv.Clone()) { memset((void*)m_lastBytes, 0, 10); + +#if SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } #else wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) @@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast() wxChar wxTextInputStream::NextChar() { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + // Return the already read character remaining from the last call to this + // function, if any. + if ( m_lastWChar ) + { + const wxChar wc = m_lastWChar; + m_lastWChar = 0; + return wc; + } +#endif // SIZEOF_WCHAR_T == 2 + wxChar wbuf[2]; memset((void*)m_lastBytes, 0, 10); for(size_t inlen = 0; inlen < 9; inlen++) @@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar() // if we couldn't decode a single character during the last // loop iteration we shouldn't be able to decode 2 or more of // them with an extra single byte, something fishy is going on + // (except if we use UTF-16, see below) wxFAIL_MSG("unexpected decoding result"); wxFALLTHROUGH;// fall through nevertheless and return at least something + return wxEOT; + +#if SIZEOF_WCHAR_T == 2 + case 2: + // When wchar_t uses UTF-16, we could have decoded a single + // Unicode code point as 2 wchar_t characters and there is + // nothing else to do here but to return the first one now and + // remember the second one for the next call, as there is no + // way to fit both of them into a single wxChar in this case. 
+ m_lastWChar = wbuf[1]; +#endif // SIZEOF_WCHAR_T == 2 + wxFALLTHROUGH; case 1: + // we finally decoded a character return wbuf[0]; } @@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode) m_mode = wxEOL_UNIX; #endif } + +#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } wxTextOutputStream::~wxTextOutputStream() @@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string) wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c) { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + wxCharBuffer buffer; + size_t len; + if ( m_lastWChar ) + { + wxChar buf[2]; + buf[0] = m_lastWChar; + buf[1] = c; + buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len); + m_lastWChar = 0; + } + else + { + buffer = m_conv->cWC2MB(&c, 1, &len); + } + + if ( !len ) + { + // Conversion failed, possibly because we have the first half of a + // surrogate character, so just store it and write it out when the + // second half is written to the stream too later. + // + // Notice that if we already had had a valid m_lastWChar, it is simply + // discarded here which is very bad, but there is no way to signal an + // error from here and this is not worse than the old code behaviour. + m_lastWChar = c; + } + else + { + for ( size_t n = 0; n < len; n++ ) + { + const char c = buffer[n]; + if ( c == '\n' ) + { + switch ( m_mode ) + { + case wxEOL_DOS: + m_output.Write("\r\n", 2); + continue; + + case wxEOL_MAC: + m_output.Write("\r", 1); + continue; + + default: + wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") ); + wxFALLTHROUGH; + + case wxEOL_UNIX: + // don't treat '\n' specially + ; + } + } + + m_output.Write(&c, 1); + } + } +#else // SIZEOF_WCHAR_T == 4 WriteString( wxString(&c, *m_conv, 1) ); +#endif // SIZEOF_WCHAR_T == 2 or 4 #else WriteString( wxString(&c, wxConvLocal, 1) ); #endif