Merge wchar_t-surrogates branch

Fix several problems with surrogates in UTF-16-encoded wchar_t strings: correct bugs in UTF-8 and UTF-32 conversion code and add surrogates support to wxText{Input,Output}Stream classes. Closes #17070.
2015-11-13 20:35:01 +01:00
parent 88408d536f 0c02d70fa5
commit 1f347637a6
5 changed files with 249 additions and 16 deletions
--- a/include/wx/txtstrm.h
+++ b/include/wx/txtstrm.h
@@ -87,7 +87,16 @@ protected:

 #if wxUSE_UNICODE
    wxMBConv *m_conv;
-#endif
+
+    // The second half of a surrogate character when using UTF-16 for wchar_t:
+    // we can't return it immediately from GetChar() when we read a Unicode
+    // code point outside of the BMP, but we can't keep it in m_lastBytes
+    // either because it can't be separately decoded, so we have a separate 1
+    // wchar_t buffer just for this case.
+#if SIZEOF_WCHAR_T == 2
+    wchar_t m_lastWChar;
+#endif // SIZEOF_WCHAR_T == 2
+#endif // wxUSE_UNICODE

    bool   EatEOL(const wxChar &c);
    void   UngetLast(); // should be used instead of wxInputStream::Ungetch() because of Unicode issues
@@ -165,7 +174,13 @@ protected:

 #if wxUSE_UNICODE
    wxMBConv *m_conv;
-#endif
+
+#if SIZEOF_WCHAR_T == 2
+    // The first half of a surrogate character if one was passed to PutChar()
+    // and couldn't be output when it was called the last time.
+    wchar_t m_lastWChar;
+#endif // SIZEOF_WCHAR_T == 2
+#endif // wxUSE_UNICODE

    wxDECLARE_NO_COPY_CLASS(wxTextOutputStream);
 };
--- a/interface/wx/strconv.h
+++ b/interface/wx/strconv.h
@@ -121,7 +121,7 @@ public:
            including the terminating @c NUL character(s).

        @return
-            The number of character written (or which would have been written
+            The number of characters written (or which would have been written
            if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error.
    */
    virtual size_t ToWChar(wchar_t* dst, size_t dstLen, const char* src,
@@ -148,8 +148,13 @@ public:
            including the terminating @c NUL character.

        @return
-            The number of character written (or which would have been written
-            if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error.
+            If @a dst is non-@NULL, the number of characters actually written
+            to it. If @a dst is @NULL, the returned value is at least equal to
+            the number of characters that would have been written out if it
+            were non-@NULL, but can be larger than it under the platforms using
+            UTF-16 as @c wchar_t encoding (this allows a useful optimization in
+            the implementation of this function for UTF-32). In any case,
+            @c wxCONV_FAILED is returned on conversion error.
    */
    virtual size_t FromWChar(char* dst, size_t dstLen, const wchar_t* src,
                             size_t srcLen = wxNO_LEN) const;
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -488,7 +488,12 @@ wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
        // the input is not
        wxCharBuffer buf(dstLen + nulLen - 1);
        memset(buf.data() + dstLen, 0, nulLen);
-        if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
+
+        // Notice that the return value of the call to FromWChar() here may be
+        // different from the one above as it could have overestimated the
+        // space needed, while what we get here is the exact length.
+        dstLen = FromWChar(buf.data(), dstLen, inBuff, inLen);
+        if ( dstLen != wxCONV_FAILED )
        {
            if ( outLen )
            {
@@ -1122,13 +1127,30 @@ wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,

        wxUint32 code;
 #ifdef WC_UTF16
-        // cast is ok for WC_UTF16
-        if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
+        // Be careful here: decode_utf16() may need to read the next wchar_t
+        // but we might not have any left, so pass it a temporary buffer which
+        // always has 2 wide characters and take care to set its second element
+        // to 0, which is invalid as a second half of a surrogate, to ensure
+        // that we return an error when trying to convert a buffer ending with
+        // half of a surrogate.
+        wxUint16 tmp[2];
+        tmp[0] = wp[0];
+        tmp[1] = srcLen != 0 ? wp[1] : 0;
+        switch ( decode_utf16(tmp, code) )
        {
-            // skip the next char too as we decoded a surrogate
-            wp++;
-            if ( srcLen != wxNO_LEN )
-                srcLen--;
+            case 1:
+                // Nothing special to do, just a character from BMP.
+                break;
+
+            case 2:
+                // skip the next char too as we decoded a surrogate
+                wp++;
+                if ( srcLen != wxNO_LEN )
+                    srcLen--;
+                break;
+
+            case wxCONV_FAILED:
+                return wxCONV_FAILED;
        }
 #else // wchar_t is UTF-32
        code = *wp & 0x7fffffff;
@@ -1397,7 +1419,12 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
 #ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
+
+        // we could have consumed two input code units if we decoded a
+        // surrogate, so adjust the input pointer and, if necessary, the length
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
+        if ( pa == 2 && !isNulTerminated )
+            srcLen--;
 #else
        cc = (*psz++) & 0x7fffffff;
 #endif
--- a/src/common/txtstrm.cpp
+++ b/src/common/txtstrm.cpp
@@ -36,6 +36,10 @@ wxTextInputStream::wxTextInputStream(wxInputStream &s,
  : m_input(s), m_separators(sep), m_conv(conv.Clone())
 {
    memset((void*)m_lastBytes, 0, 10);
+
+#if SIZEOF_WCHAR_T == 2
+    m_lastWChar = 0;
+#endif // SIZEOF_WCHAR_T == 2
 }
 #else
 wxTextInputStream::wxTextInputStream(wxInputStream &s, const wxString &sep)
@@ -64,6 +68,17 @@ void wxTextInputStream::UngetLast()
 wxChar wxTextInputStream::NextChar()
 {
 #if wxUSE_UNICODE
+#if SIZEOF_WCHAR_T == 2
+    // Return the already read character remaining from the last call to this
+    // function, if any.
+    if ( m_lastWChar )
+    {
+        const wxChar wc = m_lastWChar;
+        m_lastWChar = 0;
+        return wc;
+    }
+#endif // SIZEOF_WCHAR_T == 2
+
    wxChar wbuf[2];
    memset((void*)m_lastBytes, 0, 10);
    for(size_t inlen = 0; inlen < 9; inlen++)
@@ -91,10 +106,23 @@ wxChar wxTextInputStream::NextChar()
                // if we couldn't decode a single character during the last
                // loop iteration we shouldn't be able to decode 2 or more of
                // them with an extra single byte, something fishy is going on
+                // (except if we use UTF-16, see below)
                wxFAIL_MSG("unexpected decoding result");
-                wxFALLTHROUGH;// fall through nevertheless and return at least something
+                return wxEOT;
+
+#if SIZEOF_WCHAR_T == 2
+            case 2:
+                // When wchar_t uses UTF-16, we could have decoded a single
+                // Unicode code point as 2 wchar_t characters and there is
+                // nothing else to do here but to return the first one now and
+                // remember the second one for the next call, as there is no
+                // way to fit both of them into a single wxChar in this case.
+                m_lastWChar = wbuf[1];
+#endif // SIZEOF_WCHAR_T == 2
+                wxFALLTHROUGH;

            case 1:
+
                // we finally decoded a character
                return wbuf[0];
        }
@@ -374,6 +402,10 @@ wxTextOutputStream::wxTextOutputStream(wxOutputStream& s, wxEOL mode)
        m_mode = wxEOL_UNIX;
 #endif
    }
+
+#if wxUSE_UNICODE && SIZEOF_WCHAR_T == 2
+    m_lastWChar = 0;
+#endif // wxUSE_UNICODE && SIZEOF_WCHAR_T == 2
 }

 wxTextOutputStream::~wxTextOutputStream()
@@ -480,7 +512,66 @@ void wxTextOutputStream::WriteString(const wxString& string)
 wxTextOutputStream& wxTextOutputStream::PutChar(wxChar c)
 {
 #if wxUSE_UNICODE
+#if SIZEOF_WCHAR_T == 2
+    wxCharBuffer buffer;
+    size_t len;
+    if ( m_lastWChar )
+    {
+        wxChar buf[2];
+        buf[0] = m_lastWChar;
+        buf[1] = c;
+        buffer = m_conv->cWC2MB(buf, WXSIZEOF(buf), &len);
+        m_lastWChar = 0;
+    }
+    else
+    {
+        buffer = m_conv->cWC2MB(&c, 1, &len);
+    }
+
+    if ( !len )
+    {
+        // Conversion failed, possibly because we have the first half of a
+        // surrogate character, so just store it and write it out when the
+        // second half is written to the stream too later.
+        //
+        // Notice that if we already had had a valid m_lastWChar, it is simply
+        // discarded here which is very bad, but there is no way to signal an
+        // error from here and this is not worse than the old code behaviour.
+        m_lastWChar = c;
+    }
+    else
+    {
+        for ( size_t n = 0; n < len; n++ )
+        {
+            const char c = buffer[n];
+            if ( c == '\n' )
+            {
+                switch ( m_mode )
+                {
+                    case wxEOL_DOS:
+                        m_output.Write("\r\n", 2);
+                        continue;
+
+                    case wxEOL_MAC:
+                        m_output.Write("\r", 1);
+                        continue;
+
+                    default:
+                        wxFAIL_MSG( wxT("unknown EOL mode in wxTextOutputStream") );
+                        wxFALLTHROUGH;
+
+                    case wxEOL_UNIX:
+                        // don't treat '\n' specially
+                        ;
+                }
+            }
+
+            m_output.Write(&c, 1);
+        }
+    }
+#else // SIZEOF_WCHAR_T == 4
    WriteString( wxString(&c, *m_conv, 1) );
+#endif // SIZEOF_WCHAR_T == 2 or 4
 #else
    WriteString( wxString(&c, wxConvLocal, 1) );
 #endif
--- a/tests/mbconv/mbconvtest.cpp
+++ b/tests/mbconv/mbconvtest.cpp
@@ -81,6 +81,7 @@ private:
        CPPUNIT_TEST( FontmapTests );
        CPPUNIT_TEST( BufSize );
        CPPUNIT_TEST( FromWCharTests );
+        CPPUNIT_TEST( NonBMPCharTests );
 #ifdef HAVE_WCHAR_H
        CPPUNIT_TEST( UTF8_41 );
        CPPUNIT_TEST( UTF8_7f );
@@ -116,6 +117,7 @@ private:
    void FontmapTests();
    void BufSize();
    void FromWCharTests();
+    void NonBMPCharTests();
    void IconvTests();
    void Latin1Tests();

@@ -203,6 +205,12 @@ private:
    void UTF8PUA_f4_80_82_a5() { UTF8PUA("\xf4\x80\x82\xa5", u1000a5); }
    void UTF8Octal_backslash245() { UTF8Octal("\\245", L"\\245"); }

+    // Test that converting strings with incomplete surrogates in them fails
+    // (surrogates are only used in UTF-16, i.e. when wchar_t is 16 bits).
+#if SIZEOF_WCHAR_T == 2
+    void UTF8_fail_broken_surrogates();
+#endif // SIZEOF_WCHAR_T == 2
+
    // implementation for the utf-8 tests (see comments below)
    void UTF8(const char *charSequence, const wchar_t *wideSequence);
    void UTF8PUA(const char *charSequence, const wchar_t *wideSequence);
@@ -461,6 +469,12 @@ void MBConvTestCase::UTF8Tests()
        wxConvUTF8,
        1
        );
+
+#if SIZEOF_WCHAR_T == 2
+    // Can't use \ud800 as it's an invalid Unicode character.
+    const wchar_t wc = 0xd800;
+    CPPUNIT_ASSERT_EQUAL(wxCONV_FAILED, wxConvUTF8.FromWChar(NULL, 0, &wc, 1));
+#endif // SIZEOF_WCHAR_T == 2
 }

 void MBConvTestCase::UTF16LETests()
@@ -928,6 +942,86 @@ void MBConvTestCase::FromWCharTests()
    CPPUNIT_ASSERT_EQUAL( '!', mbuf[6]);
 }

+void MBConvTestCase::NonBMPCharTests()
+{
+    // U+1F363 (UTF-16: D83C DF63, UTF-8: F0 9F 8D A3) sushi (emoji)
+    // U+732B (UTF-8: E7 8C AB) cat (kanji)
+    // U+1F408 (UTF-16: D83D DC08, UTF-8: F0 9F 90 88) cat (emoji)
+    // U+845B U+E0101 (UTF-16: 845B DB40 DD01, UTF-8: E8 91 9B F3 A0 84 81) (a kanji + an IVS)
+    const char u8[] =
+        "\xF0\x9F\x8D\xA3" /* U+1F363 */
+        "\xE7\x8C\xAB\xF0\x9F\x90\x88" /* U+732B U+1F408 */
+        "\xE8\x91\x9B\xF3\xA0\x84\x81"; /* U+845B U+E0101 */
+    const wxChar16 u16[] = {
+        0xD83C, 0xDF63,
+        0x732B, 0xD83D, 0xDC08,
+        0x845B, 0xDB40, 0xDD01,
+        0};
+    const wxChar32 u32[] = {
+        0x1F363,
+        0x732B, 0x1F408,
+        0x845B, 0xE0101,
+        0};
+#if SIZEOF_WCHAR_T == 2
+    const wchar_t *const w = u16;
+    const size_t wchars = sizeof(u16)/sizeof(wxChar16) - 1;
+#else
+    const wchar_t *const w = u32;
+    const size_t wchars = sizeof(u32)/sizeof(wxChar32) - 1;
+#endif
+    {
+        // Notice that these tests can only be done with strict UTF-8
+        // converter, the use of any MAP_INVALID_UTF8_XXX options currently
+        // completely breaks wxTextInputStream use.
+        TestDecoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1);
+        TestEncoder(w, wchars, u8, sizeof(u8)-1, wxConvUTF8, 1);
+    }
+    {
+        char u16le[sizeof(u16)];
+        for (size_t i = 0; i < sizeof(u16)/2; ++i) {
+            u16le[2*i]   = (char)(unsigned char)(u16[i] & 0xFF);
+            u16le[2*i+1] = (char)(unsigned char)((u16[i] >> 8) & 0xFF);
+        }
+        wxMBConvUTF16LE conv;
+        TestDecoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2);
+        TestEncoder(w, wchars, u16le, sizeof(u16le)-2, conv, 2);
+    }
+    {
+        char u16be[sizeof(u16)];
+        for (size_t i = 0; i < sizeof(u16)/2; ++i) {
+            u16be[2*i]   = (char)(unsigned char)((u16[i] >> 8) & 0xFF);
+            u16be[2*i+1] = (char)(unsigned char)(u16[i] & 0xFF);
+        }
+        wxMBConvUTF16BE conv;
+        TestDecoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2);
+        TestEncoder(w, wchars, u16be, sizeof(u16be)-2, conv, 2);
+    }
+    {
+        char u32le[sizeof(u32)];
+        for (size_t i = 0; i < sizeof(u32)/4; ++i) {
+            u32le[4*i]   = (char)(unsigned char)(u32[i] & 0xFF);
+            u32le[4*i+1] = (char)(unsigned char)((u32[i] >> 8) & 0xFF);
+            u32le[4*i+2] = (char)(unsigned char)((u32[i] >> 16) & 0xFF);
+            u32le[4*i+3] = (char)(unsigned char)((u32[i] >> 24) & 0xFF);
+        }
+        wxMBConvUTF32LE conv;
+        TestDecoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4);
+        TestEncoder(w, wchars, u32le, sizeof(u32le)-4, conv, 4);
+    }
+    {
+        char u32be[sizeof(u32)];
+        for (size_t i = 0; i < sizeof(u32)/4; ++i) {
+            u32be[4*i]   = (char)(unsigned char)((u32[i] >> 24) & 0xFF);
+            u32be[4*i+1] = (char)(unsigned char)((u32[i] >> 16) & 0xFF);
+            u32be[4*i+2] = (char)(unsigned char)((u32[i] >> 8) & 0xFF);
+            u32be[4*i+3] = (char)(unsigned char)(u32[i] & 0xFF);
+        }
+        wxMBConvUTF32BE conv;
+        TestDecoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4);
+        TestEncoder(w, wchars, u32be, sizeof(u32be)-4, conv, 4);
+    }
+}
+
 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name );

 void MBConvTestCase::IconvTests()
@@ -1084,15 +1178,16 @@ void MBConvTestCase::TestEncoder(
    memcpy( inputCopy.data(), wideBuffer, (wideChars*sizeof(wchar_t)) );
    inputCopy.data()[wideChars] = 0;

-    // calculate the output size
+    // calculate the output size: notice that it can be greater than the real
+    // size as the converter is allowed to estimate the maximal size needed
+    // instead of computing it precisely
    size_t outputWritten = converter.WC2MB
        (
        0,
        (const wchar_t*)inputCopy.data(),
        0
        );
-    // make sure the correct output length was calculated
-    CPPUNIT_ASSERT_EQUAL( multiBytes, outputWritten );
+    CPPUNIT_ASSERT( outputWritten >= multiBytes );

    // convert the string
    size_t guardBytes = 8; // to make sure we're not overrunning the output buffer