From 5a92181ac1515006d464ec4d6b39465ada146df6 Mon Sep 17 00:00:00 2001
From: ARATA Mizuki <minorinoki@gmail.com>
Date: Sun, 21 Feb 2016 14:38:17 +0100
Subject: [PATCH] Fix the length returned by UTF-32 conversion for non-BMP
 input

Don't optimize the computation of the required length: the gain is tiny, but
it causes big problems for strings containing surrogates, for which the actual
result is shorter than the length returned, leaving extra NUL bytes at the end
of the converted buffer.

This is similar to 3410aa372fe142bdb8e047eb303bf70b7adba039 (see #16298) but
for UTF-32 and not UTF-16.

Closes #17070.
---
 docs/changes.txt          |  1 +
 src/common/strconv.cpp    | 42 +++++++++++----------------------------
 tests/strings/unicode.cpp | 13 ++++++++++++
 3 files changed, 26 insertions(+), 30 deletions(-)
diff --git a/docs/changes.txt b/docs/changes.txt
index a7653a1844..e95fd326ab 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -211,6 +211,7 @@ wxMSW:
 - Notify shell about the changes done by wxMimeTypesManager (Maarten Bent).
 - Fix wxPrintf() and friends when using MinGW with ANSI stdio option.
 - Fix strike-through support in wxFont with GDI+ (David Vanderson).
+- Fix UTF-32 conversion for non-BMP characters (ARATA Mizuki).
 
 wxOSX/Cocoa:
 
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index 0390dc3795..867d663f99 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -1887,18 +1887,6 @@ wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
     if ( srcLen == wxNO_LEN )
         srcLen = wxWcslen(src) + 1;
 
-    if ( !dst )
-    {
-        // optimization: return maximal space which could be needed for this
-        // string instead of the exact amount which could be less if there are
-        // any surrogates in the input
-        //
-        // we consider that surrogates are rare enough to make it worthwhile to
-        // avoid running the loop below at the cost of slightly extra memory
-        // consumption
-        return srcLen * BYTES_PER_CHAR;
-    }
-
     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
     size_t outLen = 0;
     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
@@ -1909,10 +1897,13 @@ wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
 
         outLen += BYTES_PER_CHAR;
 
-        if ( outLen > dstLen )
-            return wxCONV_FAILED;
+        if ( outBuff )
+        {
+            if ( outLen > dstLen )
+                return wxCONV_FAILED;
 
-        *outBuff++ = ch;
+            *outBuff++ = ch;
+        }
     }
 
     return outLen;
@@ -1965,18 +1956,6 @@ wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
     if ( srcLen == wxNO_LEN )
         srcLen = wxWcslen(src) + 1;
 
-    if ( !dst )
-    {
-        // optimization: return maximal space which could be needed for this
-        // string instead of the exact amount which could be less if there are
-        // any surrogates in the input
-        //
-        // we consider that surrogates are rare enough to make it worthwhile to
-        // avoid running the loop below at the cost of slightly extra memory
-        // consumption
-        return srcLen*BYTES_PER_CHAR;
-    }
-
     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
     size_t outLen = 0;
     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
@@ -1987,10 +1966,13 @@ wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
 
         outLen += BYTES_PER_CHAR;
 
-        if ( outLen > dstLen )
-            return wxCONV_FAILED;
+        if ( outBuff )
+        {
+            if ( outLen > dstLen )
+                return wxCONV_FAILED;
 
-        *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
+            *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
+        }
     }
 
     return outLen;
diff --git a/tests/strings/unicode.cpp b/tests/strings/unicode.cpp
index 565bd7b22b..20751f23fe 100644
--- a/tests/strings/unicode.cpp
+++ b/tests/strings/unicode.cpp
@@ -403,6 +403,19 @@ void UnicodeTestCase::ConversionUTF16()
     wxMBConvUTF16BE().cMB2WC("\xd8\x03\xdc\x01\0" /* OLD TURKIC LETTER YENISEI A */, wxNO_LEN, &len);
     CPPUNIT_ASSERT_EQUAL( 1, len );
 #endif // UTF-32 internal representation
+
+#if SIZEOF_WCHAR_T == 2
+    // Verify that the length of UTF-32 string is correct even when converting
+    // to it from a longer UTF-16 string with surrogates.
+
+    // Construct CAT FACE U+1F431 without using \U which is not supported by
+    // ancient compilers and without using \u with surrogates which is
+    // (correctly) flagged as an error by the newer ones.
+    wchar_t ws[2];
+    ws[0] = 0xd83d;
+    ws[1] = 0xdc31;
+    CPPUNIT_ASSERT_EQUAL( 4, wxMBConvUTF32BE().FromWChar(NULL, 0, ws, 2) );
+#endif // UTF-16 internal representation
 }
 
 void UnicodeTestCase::ConversionUTF32()