From ab858b58053c925c7165322fe1975355f1fe8596 Mon Sep 17 00:00:00 2001
From: Stefan Csomor <csomor@advancedconcepts.ch>
Date: Mon, 13 Jul 2020 12:05:29 +0200
Subject: [PATCH] Produce NFD in wxConvFileName::FromWChar() on macOS

This ensures that fn_str() returns the string in the expected,
decomposed, format.

Also simplify the code by removing workarounds for old systems which are
no longer supported, and make explicit the fact that, under macOS,
ToWChar() has always produced NFC.
---
 include/wx/osx/core/private/strconv_cf.h | 31 +++++++--
 src/common/strconv.cpp                   |  2 +-
 src/osx/core/strconv_cf.cpp              | 84 +++++-------------------
 3 files changed, 42 insertions(+), 75 deletions(-)

diff --git a/include/wx/osx/core/private/strconv_cf.h b/include/wx/osx/core/private/strconv_cf.h
index e39a9218ca..e61b5a00f8 100644
--- a/include/wx/osx/core/private/strconv_cf.h
+++ b/include/wx/osx/core/private/strconv_cf.h
@@ -290,35 +290,44 @@ inline CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
 class wxMBConv_cf : public wxMBConv
 {
 public:
+    enum NormalizationForm
+    {
+        None = 0x00,
+        FromWChar_D = 0x01,
+        ToWChar_C = 0x02
+    };
+
     wxMBConv_cf()
     {
-        Init(CFStringGetSystemEncoding()) ;
+        Init(CFStringGetSystemEncoding(), ToWChar_C) ;
     }
 
     wxMBConv_cf(const wxMBConv_cf& conv) : wxMBConv()
     {
         m_encoding = conv.m_encoding;
+        m_normalization = conv.m_normalization;
     }
 
 #if wxUSE_FONTMAP
-    wxMBConv_cf(const char* name)
+    wxMBConv_cf(const char* name, NormalizationForm normalization = ToWChar_C)
     {
-        Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
+        Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) , normalization) ;
     }
 #endif
 
-    wxMBConv_cf(wxFontEncoding encoding)
+    wxMBConv_cf(wxFontEncoding encoding, NormalizationForm normalization = ToWChar_C )
     {
-        Init( wxCFStringEncFromFontEnc(encoding) );
+        Init( wxCFStringEncFromFontEnc(encoding) , normalization);
     }
 
     virtual ~wxMBConv_cf()
     {
     }
 
-    void Init( CFStringEncoding encoding)
+    void Init( CFStringEncoding encoding, NormalizationForm normalization )
     {
         m_encoding = encoding ;
+        m_normalization = normalization;
     }
 
     virtual size_t ToWChar(wchar_t * dst, size_t dstSize, const char * src, size_t srcSize = wxNO_LEN) const wxOVERRIDE;
@@ -333,9 +342,19 @@ public:
     }
 
 private:
+    NormalizationForm m_normalization ;
     CFStringEncoding m_encoding ;
 };
 
+// This "decomposing" converter is used as wxConvFileName in wxOSX: FromWChar() yields NFD, ToWChar() still yields NFC.
+class wxMBConvD_cf : public wxMBConv_cf
+{
+public:
+    wxMBConvD_cf(wxFontEncoding encoding) : wxMBConv_cf(encoding, (NormalizationForm) (ToWChar_C | FromWChar_D) )
+    {
+    }
+};
+
 // corresponding class for holding UniChars (native unicode characters)
 
 class WXDLLIMPEXP_BASE wxMacUniCharBuffer
diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp
index ba04f3c144..ba25dae157 100644
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -3307,7 +3307,7 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
 // It is important to use this conversion object under Darwin as it ensures
 // that Unicode strings are (re)composed correctly even though xnu kernel uses
 // decomposed form internally (at least for the file names).
-static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
+static wxMBConvD_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
 #endif
 
 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
diff --git a/src/osx/core/strconv_cf.cpp b/src/osx/core/strconv_cf.cpp
index 18f232bdbe..dfdca0fbc7 100644
--- a/src/osx/core/strconv_cf.cpp
+++ b/src/osx/core/strconv_cf.cpp
@@ -89,13 +89,16 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_cf(wxFontEncoding encoding)
         if ( theString == NULL )
             return wxCONV_FAILED;
 
-        // Ensure that the string is in canonical composed form (NFC): this is
-        // important because Darwin uses decomposed form (NFD) for e.g. file
-        // names but we want to use NFC internally.
-        wxCFRef<CFMutableStringRef>
+        if ( m_normalization & ToWChar_C )
+        {
+            // Ensure that the string is in canonical composed form (NFC): this is
+            // important because Darwin uses decomposed form (NFD) for e.g. file
+            // names but we want to use NFC internally.
+            wxCFRef<CFMutableStringRef>
             cfMutableString(CFStringCreateMutableCopy(NULL, 0, theString));
-        CFStringNormalize(cfMutableString, kCFStringNormalizationFormC);
-        theString = cfMutableString;
+            CFStringNormalize(cfMutableString, kCFStringNormalizationFormC);
+            theString = cfMutableString;
+        }
 
         /* NOTE: The string content includes the NULL element if the source string did
          * That means we have to do nothing special because the destination will have
@@ -103,13 +106,6 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_cf(wxFontEncoding encoding)
          * in the count iff it was included in the source count.
          */
 
-
-/* If we're compiling against Tiger headers we can support direct conversion
- * to UTF32.  If we are then run against a pre-Tiger system, the encoding
- * won't be available so we'll defer to the string->UTF-16->UTF-32 conversion.
- */
-        if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
-        {
             CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
             CFIndex usedBufLen;
 
@@ -135,34 +131,7 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_cf(wxFontEncoding encoding)
             // CFStringGetBytes does exactly the right thing when buffer
             // pointer is NULL and returns the number of bytes required
             return usedBufLen / sizeof(wchar_t);
-        }
-        else
-        {
-            // NOTE: Includes NULL iff source did
-            /* NOTE: This is an approximation.  The eventual UTF-32 will
-             * possibly have less elements but certainly not more.
-             */
-            size_t returnSize = CFStringGetLength(theString);
 
-            if (dstSize == 0 || dst == NULL)
-            {
-                return returnSize;
-            }
-
-            // Convert the entire string.. too hard to figure out how many UTF-16 we'd need
-            // for an undersized UTF-32 destination buffer.
-            CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
-            UniChar *szUniCharBuffer = new UniChar[fullStringRange.length];
-
-            CFStringGetCharacters(theString, fullStringRange, szUniCharBuffer);
-
-            wxMBConvUTF16 converter;
-            returnSize = converter.ToWChar( dst, dstSize, (const char*)szUniCharBuffer, fullStringRange.length );
-            delete [] szUniCharBuffer;
-
-            return returnSize;
-        }
-        // NOTREACHED
     }
 
     size_t wxMBConv_cf::FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcSize) const
@@ -175,44 +144,23 @@ WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_cf(wxFontEncoding encoding)
         // Temporary CFString
         wxCFRef<CFStringRef> theString;
 
-/* If we're compiling against Tiger headers we can support direct conversion
- * from UTF32.  If we are then run against a pre-Tiger system, the encoding
- * won't be available so we'll defer to the UTF-32->UTF-16->string conversion.
- */
-        if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
-        {
             theString = wxCFRef<CFStringRef>(CFStringCreateWithBytes(
                     kCFAllocatorDefault,
                     reinterpret_cast<const UInt8*>(src),
                     srcSize * sizeof(wchar_t),
                     wxCFStringEncodingWcharT,
                     false));
-        }
-        else
-        {
-            wxMBConvUTF16 converter;
-            size_t cbUniBuffer = converter.FromWChar( NULL, 0, src, srcSize );
-            wxASSERT(cbUniBuffer % sizeof(UniChar));
-
-            // Will be free'd by kCFAllocatorMalloc when CFString is released
-            UniChar *tmpUniBuffer = (UniChar*)malloc(cbUniBuffer);
-
-            cbUniBuffer = converter.FromWChar( (char*) tmpUniBuffer, cbUniBuffer, src, srcSize );
-            wxASSERT(cbUniBuffer % sizeof(UniChar));
-
-            theString = wxCFRef<CFStringRef>(CFStringCreateWithCharactersNoCopy(
-                        kCFAllocatorDefault,
-                        tmpUniBuffer,
-                        cbUniBuffer / sizeof(UniChar),
-                        kCFAllocatorMalloc
-                    ));
-
-        }
 
         wxCHECK(theString != NULL, wxCONV_FAILED);
 
-        CFIndex usedBufLen;
+        if ( m_normalization & FromWChar_D )
+        {
+            wxCFRef<CFMutableStringRef> normalizedFormD = CFStringCreateMutableCopy(kCFAllocatorDefault,0,theString);
+            CFStringNormalize(normalizedFormD, kCFStringNormalizationFormD);
+            theString = normalizedFormD;
+        }
 
+        CFIndex usedBufLen;
         CFIndex charsConverted = CFStringGetBytes(
                 theString,
                 CFRangeMake(0, CFStringGetLength(theString)),