diff --git a/include/wx/txtstrm.h b/include/wx/txtstrm.h index 465a277164..81b6d751b1 100644 --- a/include/wx/txtstrm.h +++ b/include/wx/txtstrm.h @@ -87,7 +87,16 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + + // The second half of a surrogate character when using UTF-16 for wchar_t: + // we can't return it immediately from GetChar() when we read a Unicode + // code point outside of the BMP, but we can't keep it in m_lastBytes + // neither because it can't separately decoded, so we have a separate 1 + // wchar_t buffer just for this case. +#if SIZEOF_WCHAR_T == 2 + wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE bool EatEOL(const wxChar &c); void UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues @@ -165,7 +174,13 @@ protected: #if wxUSE_UNICODE wxMBConv *m_conv; -#endif + +#if SIZEOF_WCHAR_T == 2 + // The first half of a surrogate character if one was passed to PutChar() + // and couldn't be output when it was called the last time. + wchar_t m_lastWChar; +#endif // SIZEOF_WCHAR_T == 2 +#endif // wxUSE_UNICODE wxDECLARE_NO_COPY_CLASS(wxTextOutputStream); }; diff --git a/interface/wx/strconv.h b/interface/wx/strconv.h index 9d89c37e78..9388b49135 100644 --- a/interface/wx/strconv.h +++ b/interface/wx/strconv.h @@ -121,7 +121,7 @@ public: including the terminating @c NUL character(s). @return - The number of character written (or which would have been written + The number of characters written (or which would have been written if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. */ virtual size_t ToWChar(wchar_t* dst, size_t dstLen, const char* src, @@ -148,8 +148,13 @@ public: including the terminating @c NUL character. @return - The number of character written (or which would have been written - if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. + If @dst is non-@NULL, the number of characters actually written to + it. If @dst is @NULL, the returned value is at least equal to the + number of characters that would have been written out if it were + non-@NULL, but can be larger than it under the platforms using + UTF-16 as @c wchar_t encoding (this allows a useful optimization in + the implementation of this function for UTF-32). In any case, + @c wxCONV_FAILED is returned on conversion error. */ virtual size_t FromWChar(char* dst, size_t dstLen, const wchar_t* src, size_t srcLen = wxNO_LEN) const; diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index f51660808a..76c6df67a7 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -488,7 +488,12 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const // the input is not wxCharBuffer buf(dstLen + nulLen - 1); memset(buf.data() + dstLen, 0, nulLen); - if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) + + // Notice that return value of the call to FromWChar() here may be + // different from the one above as it could have overestimated the + // space needed, while what we get here is the exact length. + dstLen = FromWChar(buf.data(), dstLen, inBuff, inLen); + if ( dstLen != wxCONV_FAILED ) { if ( outLen ) { @@ -1122,13 +1127,30 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen, wxUint32 code; #ifdef WC_UTF16 - // cast is ok for WC_UTF16 - if ( decode_utf16((const wxUint16 *)wp, code) == 2 ) + // Be careful here: decode_utf16() may need to read the next wchar_t + // but we might not have any left, so pass it a temporary buffer which + // always has 2 wide characters and take care to set its second element + // to 0, which is invalid as a second half of a surrogate, to ensure + // that we return an error when trying to convert a buffer ending with + // half of a surrogate. + wxUint16 tmp[2]; + tmp[0] = wp[0]; + tmp[1] = srcLen != 0 ? wp[1] : 0; + switch ( decode_utf16(tmp, code) ) { - // skip the next char too as we decoded a surrogate - wp++; - if ( srcLen != wxNO_LEN ) - srcLen--; + case 1: + // Nothing special to do, just a character from BMP. + break; + + case 2: + // skip the next char too as we decoded a surrogate + wp++; + if ( srcLen != wxNO_LEN ) + srcLen--; + break; + + case wxCONV_FAILED: + return wxCONV_FAILED; } #else // wchar_t is UTF-32 code = *wp & 0x7fffffff; @@ -1397,7 +1419,12 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, #ifdef WC_UTF16 // cast is ok for WC_UTF16 size_t pa = decode_utf16((const wxUint16 *)psz, cc); + + // we could have consumed two input code units if we decoded a + // surrogate, so adjust the input pointer and, if necessary, the length psz += (pa == wxCONV_FAILED) ? 1 : pa; + if ( pa == 2 && !isNulTerminated ) + srcLen--; #else cc = (*psz++) & 0x7fffffff; #endif diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 97fd0c9072..3b2c44949f 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s, : m_input(s), m_separators(sep), m_conv(conv.Clone()) { memset((void*)m_lastBytes, 0, 10); + +#if SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } #else wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep) @@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast() wxChar wxTextInputStream::NextChar() { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + // Return the already raed character remaining from the last call to this + // function, if any. + if ( m_lastWChar ) + { + const wxChar wc = m_lastWChar; + m_lastWChar = 0; + return wc; + } +#endif // !SWIG_ONLY_SCRIPT_API + wxChar wbuf[2]; memset((void*)m_lastBytes, 0, 10); for(size_t inlen = 0; inlen < 9; inlen++) @@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar() // if we couldn't decode a single character during the last // loop iteration we shouldn't be able to decode 2 or more of // them with an extra single byte, something fishy is going on + // (except if we use UTF-16, see below) wxFAIL_MSG("unexpected decoding result"); - wxFALLTHROUGH;// fall through nevertheless and return at least something + return wxEOT; + +#if SIZEOF_WCHAR_T == 2 + case 2: + // When wchar_t uses UTF-16, we could have decoded a single + // Unicode code point as 2 wchar_t characters and there is + // nothing else to do here but to return the first one now and + // remember the second one for the next call, as there is no + // way to fit both of them into a single wxChar in this case. + m_lastWChar = wbuf[1]; +#endif // !SWIG_ONLY_SCRIPT_API + wxFALLTHROUGH; case 1: + // we finally decoded a character return wbuf[0]; } @@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode) m_mode = wxEOL_UNIX; #endif } + +#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2 + m_lastWChar = 0; +#endif // SIZEOF_WCHAR_T == 2 } wxTextOutputStream::~wxTextOutputStream() @@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string) wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c) { #if wxUSE_UNICODE +#if SIZEOF_WCHAR_T == 2 + wxCharBuffer buffer; + size_t len; + if ( m_lastWChar ) + { + wxChar buf[2]; + buf[0] = m_lastWChar; + buf[1] = c; + buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len); + m_lastWChar = 0; + } + else + { + buffer = m_conv->cWC2MB(&c, 1, &len); + } + + if ( !len ) + { + // Conversion failed, possibly because we have the first half of a + // surrogate character, so just store it and write it out when the + // second half is written to the stream too later. + // + // Notice that if we already had had a valid m_lastWChar, it is simply + // discarded here which is very bad, but there is no way to signal an + // error from here and this is not worse than the old code behaviour. + m_lastWChar = c; + } + else + { + for ( size_t n = 0; n < len; n++ ) + { + const char c = buffer[n]; + if ( c == '\n' ) + { + switch ( m_mode ) + { + case wxEOL_DOS: + m_output.Write("\r\n", 2); + continue; + + case wxEOL_MAC: + m_output.Write("\r", 1); + continue; + + default: + wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") ); + wxFALLTHROUGH; + + case wxEOL_UNIX: + // don't treat '\n' specially + ; + } + } + + m_output.Write(&c, 1); + } + } +#else // SIZEOF_WCHAR_T == 4 WriteString( wxString(&c, *m_conv, 1) ); +#endif // SIZEOF_WCHAR_T == 2 or 4 #else WriteString( wxString(&c, wxConvLocal, 1) ); #endif diff --git a/tests/mbconv/mbconvtest.cpp b/tests/mbconv/mbconvtest.cpp index 932ba3f416..4674a67e9a 100644 --- a/tests/mbconv/mbconvtest.cpp +++ b/tests/mbconv/mbconvtest.cpp @@ -81,6 +81,7 @@ private: CPPUNIT_TEST( FontmapTests ); CPPUNIT_TEST( BufSize ); CPPUNIT_TEST( FromWCharTests ); + CPPUNIT_TEST( NonBMPCharTests ); #ifdef HAVE_WCHAR_H CPPUNIT_TEST( UTF8_41 ); CPPUNIT_TEST( UTF8_7f ); @@ -116,6 +117,7 @@ private: void FontmapTests(); void BufSize(); void FromWCharTests(); + void NonBMPCharTests(); void IconvTests(); void Latin1Tests(); @@ -203,6 +205,12 @@ private: void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); } void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); } + // Test that converting string with incomplete surrogates in them fails + // (surrogates are only used in UTF-16, i.e. when wchar_t is 16 bits). +#if SIZEOF_WCHAR_T == 2 + void UTF8_fail_broken_surrogates(); +#endif // SIZEOF_WCHAR_T == 2 + // implementation for the utf-8 tests (see comments below) void UTF8(const char *charSequence, const wchar_t *wideSequence); void UTF8PUA(const char *charSequence, const wchar_t *wideSequence); @@ -461,6 +469,12 @@ void MBConvTestCase::UTF8Tests() wxConvUTF8, 1 ); + +#if SIZEOF_WCHAR_T == 2 + // Can't use \ud800 as it's an invalid Unicode character. + const wchar_t wc = 0xd800; + CPPUNIT_ASSERT_EQUAL(wxCONV_FAILED, wxConvUTF8.FromWChar(NULL, 0, &wc, 1)); +#endif // SIZEOF_WCHAR_T == 2 } void MBConvTestCase::UTF16LETests() @@ -928,6 +942,86 @@ void MBConvTestCase::FromWCharTests() CPPUNIT_ASSERT_EQUAL( '!', mbuf[6]); } +void MBConvTestCase::NonBMPCharTests() +{ + // U+1F363 (UTF-16: D83C DF63, UTF-8: F0 9F 8D A3) sushi (emoji) + // U+732B (UTF-8: E7 8C AB) cat (kanji) + // U+1F408 (UTF-16: D83D DC08, UTF-8: F0 9F 90 88) cat (emoji) + // U+845B U+E0101 (UTF-16: 845B DB40 DD01, UTF-8: E8 91 9B F3 A0 84 81) (a kanji + an IVS) + const char u8[] = + "\xF0\x9F\x8D\xA3" /* U+1F363 */ + "\xE7\x8C\xAB\xF0\x9F\x90\x88" /* U+732B U+1F408 */ + "\xE8\x91\x9B\xF3\xA0\x84\x81"; /* U+845B U+E0101 */ + const wxChar16 u16[] = { + 0xD83C, 0xDF63, + 0x732B, 0xD83D, 0xDC08, + 0x845B, 0xDB40, 0xDD01, + 0}; + const wxChar32 u32[] = { + 0x1F363, + 0x732B, 0x1F408, + 0x845B, 0xE0101, + 0}; +#if SIZEOF_WCHAR_T == 2 + const wchar_t *const w = u16; + const size_t wchars = sizeof(u16)/sizeof(wxChar16) - 1; +#else + const wchar_t *const w = u32; + const size_t wchars = sizeof(u32)/sizeof(wxChar32) - 1; +#endif + { + // Notice that these tests can only be done with strict UTF-8 + // converter, the use of any MAP_INVALID_UTF8_XXX options currently + // completely breaks wxTextInputStream use. + TestDecoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); + TestEncoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1); + } + { + char u16le[sizeof(u16)]; + for (size_t i = 0; i < sizeof(u16)/2; ++i) { + u16le[2*i] = (char)(unsigned char)(u16[i] & 0xFF); + u16le[2*i+1] = (char)(unsigned char)((u16[i] >> 8) & 0xFF); + } + wxMBConvUTF16LE conv; + TestDecoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); + TestEncoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2); + } + { + char u16be[sizeof(u16)]; + for (size_t i = 0; i < sizeof(u16)/2; ++i) { + u16be[2*i] = (char)(unsigned char)((u16[i] >> 8) & 0xFF); + u16be[2*i+1] = (char)(unsigned char)(u16[i] & 0xFF); + } + wxMBConvUTF16BE conv; + TestDecoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); + TestEncoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2); + } + { + char u32le[sizeof(u32)]; + for (size_t i = 0; i < sizeof(u32)/4; ++i) { + u32le[4*i] = (char)(unsigned char)(u32[i] & 0xFF); + u32le[4*i+1] = (char)(unsigned char)((u32[i] >> 8) & 0xFF); + u32le[4*i+2] = (char)(unsigned char)((u32[i] >> 16) & 0xFF); + u32le[4*i+3] = (char)(unsigned char)((u32[i] >> 24) & 0xFF); + } + wxMBConvUTF32LE conv; + TestDecoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); + TestEncoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4); + } + { + char u32be[sizeof(u32)]; + for (size_t i = 0; i < sizeof(u32)/4; ++i) { + u32be[4*i] = (char)(unsigned char)((u32[i] >> 24) & 0xFF); + u32be[4*i+1] = (char)(unsigned char)((u32[i] >> 16) & 0xFF); + u32be[4*i+2] = (char)(unsigned char)((u32[i] >> 8) & 0xFF); + u32be[4*i+3] = (char)(unsigned char)(u32[i] & 0xFF); + } + wxMBConvUTF32BE conv; + TestDecoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4); + TestEncoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4); + } +} + WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name ); void MBConvTestCase::IconvTests() @@ -1084,15 +1178,16 @@ void MBConvTestCase::TestEncoder( memcpy( inputCopy.data(), wideBuffer, (wideChars*sizeof(wchar_t)) ); inputCopy.data()[wideChars] = 0; - // calculate the output size + // calculate the output size: notice that it can be greater than the real + // size as the converter is allowed to estimate the maximal size needed + // instead of computing it precisely size_t outputWritten = converter.WC2MB ( 0, (const wchar_t*)inputCopy.data(), 0 ); - // make sure the correct output length was calculated - CPPUNIT_ASSERT_EQUAL( multiBytes, outputWritten ); + CPPUNIT_ASSERT( outputWritten >= multiBytes ); // convert the string size_t guardBytes = 8; // to make sure we're not overrunning the output buffer