From 048ba4b509e39f83c115a47cc115dff3c4d0e746 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 12 Nov 2015 02:39:36 +0100 Subject: [PATCH 1/6] Fail to convert wide string with incomplete surrogates to UTF-8 Correctly fail if the wide string being converted is UTF-16 encoded (which can only happen on platforms using 16 bit wchar_t, i.e. MSW) and ends in the middle of a surrogate pair. Notice that other conversions still wrongly encode invalid wchar_t sequences such as 0xd800 not followed by anything, this will need to be fixed in the future, but for now at least make it work for the most commonly used conversion. See #17070. --- src/common/strconv.cpp | 29 +++++++++++++++++++++++------ tests/mbconv/mbconvtest.cpp | 12 ++++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index f51660808a..c3191201c4 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -1122,13 +1122,30 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, wxUint32 code; #ifdef WC_UTF16 - // cast is ok for WC_UTF16 - if ( decode_utf16((const wxUint16 *)wp, code) == 2 ) + // Be careful here: decode_utf16() may need to read the next wchar_t + // but we might not have any left, so pass it a temporary buffer which + // always has 2 wide characters and take care to set its second element + // to 0, which is invalid as a second half of a surrogate, to ensure + // that we return an error when trying to convert a buffer ending with + // half of a surrogate. + wxUint16 tmp[2]; + tmp[0] = wp[0]; + tmp[1] = srcLen != 0 ? wp[1] : 0; + switch ( decode_utf16(tmp, code) ) { - // skip the next char too as we decoded a surrogate - wp++; - if ( srcLen != wxNO_LEN ) - srcLen--; + case 1: + // Nothing special to do, just a character from BMP. + break; + + case 2: + // skip the next char too as we decoded a surrogate + wp++; + if ( srcLen != wxNO_LEN ) + srcLen--; + break; + + case wxCONV_FAILED: + return wxCONV_FAILED; } #else // wchar_t is UTF-32 code = *wp & 0x7fffffff; diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 932ba3f416..10985fe3d5 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -203,6 +203,12 @@ private: void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); } void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); } + // Test that converting string with incomplete surrogates in them fails + // (surrogates are only used in UTF-16, i.e. when wchar_t is 16 bits). +#if SIZEOF_WCHAR_T == 2 + void UTF8_fail_broken_surrogates(); +#endif // SIZEOF_WCHAR_T == 2 + // implementation for the utf-8 tests (see comments below) void UTF8(const char *charSequence, const wchar_t *wideSequence); void UTF8PUA(const char *charSequence, const wchar_t *wideSequence); @@ -461,6 +467,12 @@ void MBConvTestCase::UTF8Tests() wxConvUTF8, 1 ); + +#if SIZEOF_WCHAR_T == 2 + // Can't use \ud800 as it's an invalid Unicode character. + const wchar_t wc = 0xd800; + CPPUNIT_ASSERT_EQUAL(wxCONV_FAILED, wxConvUTF8.FromWChar(NULL, 0, &wc, 1)); +#endif // SIZEOF_WCHAR_T == 2 } void MBConvTestCase::UTF16LETests() From 5cff8c1232c42266cdcd64c516915bc95292d087 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 13 Nov 2015 18:00:43 +0100 Subject: [PATCH 2/6] Fix return value of wxMBConvUTF32::cWC2MB() in presence of surrogates UTF-32 conversions only estimate, from above, the size of the output buffer needed, so the value returned from the first call to FromWChar(NULL) in cWC2MB() can be inexact for them and we need to return the value returned by the second call to FromWChar() doing the real conversion from cWC2MB() itself to ensure that we return the correct output length. See #17070. --- src/common/strconv.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index c3191201c4..3102294183 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -488,7 +488,12 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const // the input is not wxCharBuffer buf(dstLen + nulLen - 1); memset(buf.data() + dstLen, 0, nulLen); - if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) + + // Notice that return value of the call to FromWChar() here may be + // different from the one above as it could have overestimated the + // space needed, while what we get here is the exact length. + dstLen = FromWChar(buf.data(), dstLen, inBuff, inLen); + if ( dstLen != wxCONV_FAILED ) { if ( outLen ) { From 823a2337f64b0e23f241abe543f073ea0b13f33a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 13 Nov 2015 18:03:14 +0100 Subject: [PATCH 3/6] Make wxTextStream classes work with surrogates under MSW On the platforms using UTF-16 for wchar_t we can't read nor write Unicode data one wchar_t at a time as a single half of a surrogate character can't be converted to or from the encoding of the stream. To fix this, we may need to store the last wchar_t already read from the stream but not returned yet in wxTextInputStream::NextChar() and store, without writing it, the wchar_t passed to wxTextOutputStream::PutChar() until the second half of the surrogate is written. See #17070. --- include/wx/txtstrm.h | 19 ++++++++- src/common/txtstrm.cpp | 93 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 465a277164..81b6d751b1 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -87,7 +87,16 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + + // The second half of a surrogate character when using UTF-16 for wchar_t: + // we can't return it immediately from GetChar() when we read a Unicode + // code point outside of the BMP, but we can't keep it in m_lastBytes + // neither because it can't separately decoded, so we have a separate 1 + // wchar_t buffer just for this case. +#if SIZEOF_WCHAR_T == 2 + wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE bool EatEOL(const wxChar &c); void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues @@ -165,7 +174,13 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + +#if SIZEOF_WCHAR_T == 2 + // The first half of a surrogate character if one was passed to PutChar() + // and couldn't be output when it was called the last time. + wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE wxDECLARE_NO_COPY_CLASS(wxTextOutputStream); }; diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 97fd0c9072..3b2c44949f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, : m_input(s), m_separators(sep), m_conv(conv.Clone()) { memset((void*)m_lastBytes, 0, 10); + +#if SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } #else wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) @@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast() wxChar wxTextInputStream::NextChar() { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + // Return the already raed character remaining from the last call to this + // function, if any. + if ( m_lastWChar ) + { + const wxChar wc = m_lastWChar; + m_lastWChar = 0; + return wc; + } +#endif // !SWIG_ONLY_SCRIPT_API + wxChar wbuf[2]; memset((void*)m_lastBytes, 0, 10); for(size_t inlen = 0; inlen < 9; inlen++) @@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar() // if we couldn't decode a single character during the last // loop iteration we shouldn't be able to decode 2 or more of // them with an extra single byte, something fishy is going on + // (except if we use UTF-16, see below) wxFAIL_MSG("unexpected decoding result"); - wxFALLTHROUGH;// fall through nevertheless and return at least something + return wxEOT; + +#if SIZEOF_WCHAR_T == 2 + case 2: + // When wchar_t uses UTF-16, we could have decoded a single + // Unicode code point as 2 wchar_t characters and there is + // nothing else to do here but to return the first one now and + // remember the second one for the next call, as there is no + // way to fit both of them into a single wxChar in this case. + m_lastWChar = wbuf[1]; +#endif // !SWIG_ONLY_SCRIPT_API + wxFALLTHROUGH; case 1: + // we finally decoded a character return wbuf[0]; } @@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode) m_mode = wxEOL_UNIX; #endif } + +#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } wxTextOutputStream::~wxTextOutputStream() @@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string) wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c) { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + wxCharBuffer buffer; + size_t len; + if ( m_lastWChar ) + { + wxChar buf[2]; + buf[0] = m_lastWChar; + buf[1] = c; + buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len); + m_lastWChar = 0; + } + else + { + buffer = m_conv->cWC2MB(&c, 1, &len); + } + + if ( !len ) + { + // Conversion failed, possibly because we have the first half of a + // surrogate character, so just store it and write it out when the + // second half is written to the stream too later. + // + // Notice that if we already had had a valid m_lastWChar, it is simply + // discarded here which is very bad, but there is no way to signal an + // error from here and this is not worse than the old code behaviour. + m_lastWChar = c; + } + else + { + for ( size_t n = 0; n < len; n++ ) + { + const char c = buffer[n]; + if ( c == '\n' ) + { + switch ( m_mode ) + { + case wxEOL_DOS: + m_output.Write("\r\n", 2); + continue; + + case wxEOL_MAC: + m_output.Write("\r", 1); + continue; + + default: + wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") ); + wxFALLTHROUGH; + + case wxEOL_UNIX: + // don't treat '\n' specially + ; + } + } + + m_output.Write(&c, 1); + } + } +#else // SIZEOF_WCHAR_T == 4 WriteString( wxString(&c, *m_conv, 1) ); +#endif // SIZEOF_WCHAR_T == 2 or 4 #else WriteString( wxString(&c, wxConvLocal, 1) ); #endif From 37dd89a0da3ab22c2720d40a595134c0ab4b9978 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 13 Nov 2015 19:08:27 +0100 Subject: [PATCH 4/6] Don't rely on wxMBConv::cWC2MB(NULL) returning the exact byte count UTF-32 conversions use a useful optimization by avoiding the extra scan of the input wchar_t string in their FromWChar() and cWC2MB() implementation when they are only asked to compute the required buffer size without actually doing the conversion. However this means that for an input string containing UTF-16 surrogates (which is possible under MSW where wchar_t is 16 bits) the actual size of the output string can be smaller than that returned by FromWChar(NULL). Document that this may happen and avoid relying on the exact equality in the tests. See #17070. --- interface/wx/strconv.h | 11 ++++++++--- tests/mbconv/mbconvtest.cpp | 7 ++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/interface/wx/strconv.h b/interface/wx/strconv.h index 9d89c37e78..9388b49135 100644 --- a/interface/wx/strconv.h +++ b/interface/wx/strconv.h @@ -121,7 +121,7 @@ public: including the terminating @c NUL character(s). @return - The number of character written (or which would have been written + The number of characters written (or which would have been written if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. */ virtual size_t ToWChar(wchar_t* dst, size_t dstLen, const char* src, @@ -148,8 +148,13 @@ public: including the terminating @c NUL character. @return - The number of character written (or which would have been written - if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. + If @dst is non-@NULL, the number of characters actually written to + it. If @dst is @NULL, the returned value is at least equal to the + number of characters that would have been written out if it were + non-@NULL, but can be larger than it under the platforms using + UTF-16 as @c wchar_t encoding (this allows a useful optimization in + the implementation of this function for UTF-32). In any case, + @c wxCONV_FAILED is returned on conversion error. */ virtual size_t FromWChar(char* dst, size_t dstLen, const wchar_t* src, size_t srcLen = wxNO_LEN) const; diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 10985fe3d5..69044cc057 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -1096,15 +1096,16 @@ void MBConvTestCase::TestEncoder( memcpy( inputCopy.data(), wideBuffer, (wideChars*sizeof(wchar_t)) ); inputCopy.data()[wideChars] = 0; - // calculate the output size + // calculate the output size: notice that it can be greater than the real + // size as the converter is allowed to estimate the maximal size needed + // instead of computing it precisely size_t outputWritten = converter.WC2MB ( 0, (const wchar_t*)inputCopy.data(), 0 ); - // make sure the correct output length was calculated - CPPUNIT_ASSERT_EQUAL( multiBytes, outputWritten ); + CPPUNIT_ASSERT( outputWritten >= multiBytes ); // convert the string size_t guardBytes = 8; // to make sure we're not overrunning the output buffer From e570e8b6ac652a4dabd9af67c263157519c0f2c1 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Fri, 13 Nov 2015 19:17:37 +0100 Subject: [PATCH 5/6] Fix conversion from wchar_t string with surrogates to UTF-8 Correctly account for the second half of the surrogate in wxMBConvUTF8::FromWChar() implementation, this makes it actually work for the strings containing surrogates on the platforms using UTF-16 encoding for wchar_t (such as MSW). See #17070. --- src/common/strconv.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 3102294183..76c6df67a7 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -1419,7 +1419,12 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, #ifdef WC_UTF16 // cast is ok for WC_UTF16 size_t pa = decode_utf16((const wxUint16 *)psz, cc); + + // we could have consumed two input code units if we decoded a + // surrogate, so adjust the input pointer and, if necessary, the length psz += (pa == wxCONV_FAILED) ? 1 : pa; + if ( pa == 2 && !isNulTerminated ) + srcLen--; #else cc = (*psz++) & 0x7fffffff; #endif From 0c02d70fa57ff221b1ad4cb33f714c32b59c7d32 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Fri, 13 Nov 2015 19:19:28 +0100 Subject: [PATCH 6/6] Add a test checking that conversions involving surrogates work After the fixes in the previous commits conversions between wchar_t containing surrogates and UTF-{8,16,32} work correctly, so add a test ensuring that this is the case. Notice that other conversions are still broken in presence of surrogates. See #17070. --- tests/mbconv/mbconvtest.cpp | 82 +++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 69044cc057..4674a67e9a 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -81,6 +81,7 @@ private: CPPUNIT_TEST( FontmapTests ); CPPUNIT_TEST( BufSize ); CPPUNIT_TEST( FromWCharTests ); + CPPUNIT_TEST( NonBMPCharTests ); #ifdef HAVE_WCHAR_H CPPUNIT_TEST( UTF8_41 ); CPPUNIT_TEST( UTF8_7f ); @@ -116,6 +117,7 @@ private: void FontmapTests(); void BufSize(); void FromWCharTests(); + void NonBMPCharTests(); void IconvTests(); void Latin1Tests(); @@ -940,6 +942,86 @@ void MBConvTestCase::FromWCharTests() CPPUNIT_ASSERT_EQUAL( '!', mbuf[6]); } +void MBConvTestCase::NonBMPCharTests() +{ + // U+1F363 (UTF-16: D83C DF63, UTF-8: F0 9F 8D A3) sushi (emoji) + // U+732B (UTF-8: E7 8C AB) cat (kanji) + // U+1F408 (UTF-16: D83D DC08, UTF-8: F0 9F 90 88) cat (emoji) + // U+845B U+E0101 (UTF-16: 845B DB40 DD01, UTF-8: E8 91 9B F3 A0 84 81) (a kanji + an IVS) + const char u8[] = + "\xF0\x9F\x8D\xA3" /* U+1F363 */ + "\xE7\x8C\xAB\xF0\x9F\x90\x88" /* U+732B U+1F408 */ + "\xE8\x91\x9B\xF3\xA0\x84\x81"; /* U+845B U+E0101 */ + const wxChar16 u16[] = { + 0xD83C, 0xDF63, + 0x732B, 0xD83D, 0xDC08, + 0x845B, 0xDB40, 0xDD01, + 0}; + const wxChar32 u32[] = { + 0x1F363, + 0x732B, 0x1F408, + 0x845B, 0xE0101, + 0}; +#if SIZEOF_WCHAR_T == 2 + const wchar_t *const w = u16; + const size_t wchars = sizeof(u16)/sizeof(wxChar16) - 1; +#else + const wchar_t *const w = u32; + const size_t wchars = sizeof(u32)/sizeof(wxChar32) - 1; +#endif + { + // Notice that these tests can only be done with strict UTF-8 + // converter, the use of any MAP_INVALID_UTF8_XXX options currently + // completely breaks wxTextInputStream use. + TestDecoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); + TestEncoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); + } + { + char u16le[sizeof(u16)]; + for (size_t i = 0; i < sizeof(u16)/2; ++i) { + u16le[2*i] = (char)(unsigned char)(u16[i] & 0xFF); + u16le[2*i+1] = (char)(unsigned char)((u16[i] >> 8) & 0xFF); + } + wxMBConvUTF16LE conv; + TestDecoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); + TestEncoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); + } + { + char u16be[sizeof(u16)]; + for (size_t i = 0; i < sizeof(u16)/2; ++i) { + u16be[2*i] = (char)(unsigned char)((u16[i] >> 8) & 0xFF); + u16be[2*i+1] = (char)(unsigned char)(u16[i] & 0xFF); + } + wxMBConvUTF16BE conv; + TestDecoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); + TestEncoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); + } + { + char u32le[sizeof(u32)]; + for (size_t i = 0; i < sizeof(u32)/4; ++i) { + u32le[4*i] = (char)(unsigned char)(u32[i] & 0xFF); + u32le[4*i+1] = (char)(unsigned char)((u32[i] >> 8) & 0xFF); + u32le[4*i+2] = (char)(unsigned char)((u32[i] >> 16) & 0xFF); + u32le[4*i+3] = (char)(unsigned char)((u32[i] >> 24) & 0xFF); + } + wxMBConvUTF32LE conv; + TestDecoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); + TestEncoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); + } + { + char u32be[sizeof(u32)]; + for (size_t i = 0; i < sizeof(u32)/4; ++i) { + u32be[4*i] = (char)(unsigned char)((u32[i] >> 24) & 0xFF); + u32be[4*i+1] = (char)(unsigned char)((u32[i] >> 16) & 0xFF); + u32be[4*i+2] = (char)(unsigned char)((u32[i] >> 8) & 0xFF); + u32be[4*i+3] = (char)(unsigned char)(u32[i] & 0xFF); + } + wxMBConvUTF32BE conv; + TestDecoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4); + TestEncoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4); + } +} + WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name ); void MBConvTestCase::IconvTests()