Submit patch based on Michael W.'s invalid-UTF8-to-PUA patch.

I added UTF8_TO_OCTAL and made that the default for filename conversion under GTK2. More adaptation, e.g. to G_FILENAME_ENCODING, needs to be done. git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@33099 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2005-03-27 17:23:15 +00:00
parent 894d74dcc2
commit ea8ce907e1
2 changed files with 162 additions and 45 deletions
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -104,8 +104,18 @@ public:
 // Converter between UTF-8 encoded char strings and wchar_t strings. The
 // option passed to the constructor selects how MB2WC() treats input bytes
 // that do not form valid UTF-8.
 class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv
 {
 public:
    // Handling of invalid UTF-8 input. The PUA and OCTAL values are tested
    // as bits of m_options by the conversion functions.
    enum { 
        // no mapping: MB2WC() fails with (size_t)-1 on an invalid sequence
        MAP_INVALID_UTF8_NOT = 0,
        // each offending byte b is converted to the character
        // wxUnicodePUA + b, and WC2MB() turns such characters back into b
        MAP_INVALID_UTF8_TO_PUA = 1,
        // each offending byte is written as a backslash followed by its
        // octal value; WC2MB() reads the three characters after a backslash
        // as an octal number and emits that single byte
        MAP_INVALID_UTF8_TO_OCTAL = 2
    };
    // options: one of the MAP_INVALID_UTF8_* values above; by default
    // invalid input is not mapped
    wxMBConvUTF8(int options = MAP_INVALID_UTF8_NOT) : m_options(options) { }
    virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
    virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
 private:
    // the MAP_INVALID_UTF8_* value given to the constructor
    int m_options;
 };
 // ----------------------------------------------------------------------------
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -55,6 +55,9 @@
 #include <ctype.h>
 #include <string.h>
 #include <stdlib.h>
 #ifdef HAVE_LANGINFO_H
  #include <langinfo.h>
 #endif
 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
    #define wxHAVE_WIN32_MB2WC
@@ -361,34 +364,44 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
 // files are accidentally written in an encoding which is not
 // the system encoding. Typically, the system encoding will be
-// UTF8 but there might be files stored in ISO8859-1 in disk. 
+// UTF8 but there might be files stored in ISO8859-1 on disk. 
 // ----------------------------------------------------------------------------
 class wxConvBrokenFileNames: public wxMBConvLibc
 {
 public:
    // the embedded UTF-8 converter is set up to map invalid bytes to
    // backslash + octal escapes instead of failing
    wxConvBrokenFileNames() : m_utf8conv(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL) { }
    // both conversions forward to m_utf8conv when UseUTF8() returns true
    // and to the wxMBConvLibc implementation otherwise
    virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
    virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
    // true if nl_langinfo(CODESET) reports "UTF-8"; always false when
    // langinfo.h or CODESET is not available
    inline bool UseUTF8() const;
 private:
    // converter used for the UTF-8 case
    wxMBConvUTF8 m_utf8conv;
 };
 // Tells whether the character set reported by nl_langinfo(CODESET) is
 // exactly "UTF-8". On builds without langinfo.h or without CODESET the
 // answer is always false.
 bool wxConvBrokenFileNames::UseUTF8() const
 {
 #if defined(HAVE_LANGINFO_H) && defined(CODESET)
    const char * const localeCharset = nl_langinfo(CODESET);
    if (strcmp(localeCharset, "UTF-8") != 0)
        return false;

    return true;
 #else
    return false;
 #endif
 }
 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
 {
-#if 0
+    if (UseUTF8())
-    if (we find some invalid characters)
+        return m_utf8conv.MB2WC( outputBuf, psz, outputSize );
    {
       Convert to Unicode range.
    }
    else
-#endif
+        return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
    return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
 }
 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
 {
-#if 0
+    if (UseUTF8())
-    Convert back from Unicode range.
+        return m_utf8conv.WC2MB( outputBuf, psz, outputSize );
-#endif
+    else
-    return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
+        return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
 }
 // ----------------------------------------------------------------------------
@@ -602,12 +615,17 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 // utf8_max[i] is the largest value that fits in a UTF-8 sequence with i
 // continuation bytes: WC2MB() picks the first i with cc <= utf8_max[i] and
 // emits i + 1 bytes, MB2WC() uses the table to detect illegal encodings.
 static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 // Start of the 256-character range used with MAP_INVALID_UTF8_TO_PUA: an
 // invalid input byte b is represented by the character wxUnicodePUA + b.
 const wxUint32 wxUnicodePUA = 0x100000;
 // One past the last character of that range (exclusive upper bound).
 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 {
    size_t len = 0;
    while (*psz && ((!buf) || (len < n)))
    {
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
@@ -625,7 +643,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
            if (!cnt)
            {
                // invalid UTF-8 sequence
-                return (size_t)-1;
+                invalid = true;
            }
            else
            {
@@ -633,32 +651,96 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
-                    cc = *psz++;
+                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
-                        return (size_t)-1;
+                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
-                if (res <= utf8_max[ocnt])
+                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
 #ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
 #else // !WC_UTF16
                    if (buf)
                        *buf++ = res;
                    len++;
 #endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
 #ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
 #else
                        if (buf)
                            *buf++ = wxUnicodePUA + (unsigned char)*opsz;
                        opsz++;
                        len++;
 #endif
                    }
                }
                else
                if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        wchar_t str[6];
                        wxSnprintf( str, 5, L"\\%o", (int) (unsigned char) *opsz );
                        if (buf)
                            *buf++ = str[0];
                        if (buf)
                            *buf++ = str[1];
                        if (buf)
                            *buf++ = str[2];
                        if (buf)
                            *buf++ = str[3];
                        opsz++;
                        len += 4;
                    }
                }
                else
                {
                    return (size_t)-1;
                }
 #ifdef WC_UTF16
                // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                size_t pa = encode_utf16(res, (wxUint16 *)buf);
                if (pa == (size_t)-1)
                  return (size_t)-1;
                if (buf)
                    buf += pa;
                len += pa;
 #else // !WC_UTF16
                if (buf)
                    *buf++ = res;
                len++;
 #endif // WC_UTF16/!WC_UTF16
            }
        }
    }
@@ -681,24 +763,49 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 #else
        cc=(*psz++) & 0x7fffffff;
 #endif
-        unsigned cnt;
+        if ((m_options & MAP_INVALID_UTF8_TO_PUA)
-        for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
+            && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd)
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
-                *buf++ = (char) cc;
+                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        } 
        else
        if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
            && cc == L'\\')
        {
            wchar_t str[4];
            str[0] = *psz; psz++;
            str[1] = *psz; psz++;
            str[2] = *psz; psz++;
            str[3] = 0;
            int octal;
            wxSscanf( str, L"%o", &octal );
            if (buf)
                *buf++ = (char) octal;
            len++;
        }
        else
        {
-            len += cnt + 1;
+            unsigned cnt;
-            if (buf)
+            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
-                *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
+                // plain ASCII char
-                while (cnt--)
+                if (buf)
-                    *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
+                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }
@@ -708,9 +815,6 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
    return len;
 }
 // ----------------------------------------------------------------------------
 // UTF-16
 // ----------------------------------------------------------------------------
@@ -2627,6 +2731,7 @@ static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
 static wxMBConvUTF7 wxConvUTF7Obj;
 static wxMBConvUTF8 wxConvUTF8Obj;
 static wxConvBrokenFileNames wxConvBrokenFileNamesObj;
 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
@@ -2636,9 +2741,11 @@ WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
 #ifdef __WXOSX__
-                                                    wxConvUTF8Obj;
+                                    wxConvUTF8Obj;
 #elif __WXGTK20__
                                    wxConvBrokenFileNamesObj;
 #else
-                                                    wxConvLibcObj;
+                                    wxConvLibcObj;
 #endif