From fa8a482593938f04ea6cae1c39f8eb0faf9acb6a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 02:34:20 +0100 Subject: [PATCH 1/6] Don't lose Unicode data in wxMessageOutputStderr Ensure that we do output the string contents even if we have to encode it in UTF-8 instead of the current locale encoding -- this is still better than not outputting anything at all. Closes #17358. --- src/common/msgout.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/msgout.cpp b/src/common/msgout.cpp index 48587c5624..9003413e57 100644 --- a/src/common/msgout.cpp +++ b/src/common/msgout.cpp @@ -148,13 +148,8 @@ wxString wxMessageOutputStderr::AppendLineFeedIfNeeded(const wxString& str) void wxMessageOutputStderr::Output(const wxString& str) { const wxString strWithLF = AppendLineFeedIfNeeded(str); - const wxWX2MBbuf buf = strWithLF.mb_str(); - - if ( buf ) - fprintf(m_fp, "%s", (const char*) buf); - else // print at least something - fprintf(m_fp, "%s", (const char*) strWithLF.ToAscii()); + fprintf(m_fp, "%s", (const char*) strWithLF.mb_str(wxConvWhateverWorks)); fflush(m_fp); } From 837e6d186d850ee01e5fd6fadd106a0aa02acd81 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 02:34:20 +0100 Subject: [PATCH 2/6] Don't lose Unicode data when outputting wxString to std::ostream Fall back to UTF-8 rather than not outputting anything at all if the string is not representable in the current locale encoding. Even if we did try to handle this error by setting failbit, chances of anybody checking for it (especially on e.g. std::cout) were very low and the only possible workaround in practice would have been attempting to output the string in UTF-8 anyhow, so just do it ourselves. See #17358. 
--- src/common/string.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/common/string.cpp b/src/common/string.cpp index 930b488162..feb6d8e501 100644 --- a/src/common/string.cpp +++ b/src/common/string.cpp @@ -191,13 +191,7 @@ static wxStrCacheStatsDumper s_showCacheStats; wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str) { #if wxUSE_UNICODE && !wxUSE_UNICODE_UTF8 - const wxScopedCharBuffer buf(str.AsCharBuf()); - if ( !buf ) - os.clear(wxSTD ios_base::failbit); - else - os << buf.data(); - - return os; + return os << wxConvWhateverWorks.cWX2MB(str); #else return os << str.AsInternal(); #endif From a11456c0780de09b771ff5fef73a8dbcad3a2cf7 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 02:41:28 +0100 Subject: [PATCH 3/6] Add wxWhateverWorksConv and use it for file names under Unix This ensures that we can create output files with Unicode names even when they're not representable in the current locale encoding, notably when the current locale has never been changed and is still the default "C" one, not supporting anything else other than 7 bit ASCII. Credits for the new class name go to Woody Allen. 
--- include/wx/strconv.h | 39 +++++++++++++++++++++++++++++++++++++++ interface/wx/strconv.h | 24 ++++++++++++++++++++++++ src/common/strconv.cpp | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index 12346c9763..9aa87f8014 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -543,6 +543,39 @@ private: wxMBConv *m_convReal; }; +// ---------------------------------------------------------------------------- +// wxWhateverWorksConv: use whatever encoding works for the input +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxWhateverWorksConv : public wxMBConv +{ +public: + wxWhateverWorksConv() + { + } + + // Try to interpret the string as UTF-8, if it fails fall back to the + // current locale encoding (wxConvLibc) and if this fails as well, + // interpret it as wxConvISO8859_1 (which is used because it never fails + // and this conversion is used when we really, really must produce + // something on output). + virtual size_t + ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + + // Try to encode the string using the current locale encoding (wxConvLibc) + // and fall back to UTF-8 (which never fails) if it doesn't work. Note that + // we never use wxConvISO8859_1 here as we prefer to fall back on UTF-8 + // even for the strings containing only code points representable in 8859-1. 
+ virtual size_t + FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + + virtual wxMBConv *Clone() const wxOVERRIDE + { + return new wxWhateverWorksConv(); + } +}; // ---------------------------------------------------------------------------- // declare predefined conversion objects @@ -578,6 +611,12 @@ WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8) WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7) #define wxConvUTF7 wxGet_wxConvUTF7() +// conversion used when we may not afford to lose data when outputting Unicode +// strings (should be avoided in the other direction as it can misinterpret the +// input encoding) +WX_DECLARE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks) +#define wxConvWhateverWorks wxGet_wxConvWhateverWorks() + // conversion used for the file names on the systems where they're not Unicode // (basically anything except Windows) // diff --git a/interface/wx/strconv.h b/interface/wx/strconv.h index 9388b49135..86e765d548 100644 --- a/interface/wx/strconv.h +++ b/interface/wx/strconv.h @@ -483,6 +483,30 @@ public: bool IsOk() const; }; +/** + Conversion object always producing non-empty output for non-empty input. + + Conversions done using this object never lose data, at the cost of possibly + producing the output in an unwanted encoding or misinterpreting input + encoding. + + To be precise, converting Unicode to multibyte strings using this object + tries to use the current locale encoding first but if this doesn't work, it + falls back to using UTF-8. In the other direction, UTF-8 is tried first, + then the current locale encoding and if this fails too, input is + interpreted as using ISO 8859-1, which never fails. + + It is almost always @e wrong to use this converter for multibyte-to-Unicode + direction as the program should know which encoding the input data is + supposed to use and use the appropriate converter instead. 
However it may + be useful in the Unicode-to-multibyte direction if the goal is to produce + the output in the current locale encoding if possible, but still output + something, instead of nothing at all, even if the Unicode string is not + representable in this encoding. + + @since 3.1.0 + */ +extern wxMBConv& wxConvWhateverWorks; /** diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 9e7da47925..2756870d55 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -3286,6 +3286,40 @@ bool wxCSConv::IsUTF8() const #endif +// ============================================================================ +// wxWhateverWorksConv +// ============================================================================ + +size_t +wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen, + const char *src, size_t srcLen) const +{ + size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen); + if ( rc != wxCONV_FAILED ) + return rc; + + rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen); + if ( rc != wxCONV_FAILED ) + return rc; + + rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen); + + return rc; +} + +size_t +wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen, + const wchar_t *src, size_t srcLen) const +{ + size_t rc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen); + if ( rc != wxCONV_FAILED ) + return rc; + + rc = wxConvUTF8.FromWChar(dst, dstLen, src, srcLen); + + return rc; +} + #if wxUSE_UNICODE wxWCharBuffer wxSafeConvertMB2WX(const char *s) @@ -3330,6 +3364,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) #undef wxConvLibc #undef wxConvUTF8 #undef wxConvUTF7 +#undef wxConvWhateverWorks #undef wxConvLocal #undef wxConvISO8859_1 @@ -3369,6 +3404,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) // empty statement (and hope that no compilers warns about this) WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;); WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;); +WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, 
;); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM)); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1)); @@ -3387,5 +3423,5 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = #ifdef __DARWIN__ &wxConvMacUTF8DObj; #else // !__DARWIN__ - wxGet_wxConvLibcPtr(); + wxGet_wxConvWhateverWorksPtr(); #endif // __DARWIN__/!__DARWIN__ From 8eac125e8659f0b5f45babc3cf6afd308e160f75 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 02:28:39 +0100 Subject: [PATCH 4/6] Revert "Temporarily ensure that the file functions test uses UTF-8" This reverts commit a44bcb474660fe73bd8a0080646b8f44d9d2bacf which is not necessary any more since the fixes to file name conversions in the previous commit. --- tests/file/filefn.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/file/filefn.cpp b/tests/file/filefn.cpp index 77a6dc2bfb..659823806e 100644 --- a/tests/file/filefn.cpp +++ b/tests/file/filefn.cpp @@ -78,10 +78,6 @@ private: wxString m_fileNameNonASCII; wxString m_fileNameWork; -#ifndef __DARWIN__ - wxMBConv* m_convFNOld; -#endif - wxDECLARE_NO_COPY_CLASS(FileFunctionsTestCase); }; @@ -98,16 +94,6 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( FileFunctionsTestCase, "FileFunctionsTest void FileFunctionsTestCase::setUp() { - // Under Unix we need to use UTF-8 for the tests using non-ASCII filenames - // and this is not necessarily the case because the tests don't call - // setlocale(LC_ALL, ""), so ensure it explicitly. This is just a temporary - // hack until we find the solution to make the library work with Unicode - // filenames irrespectively of the current locale. 
-#ifndef __DARWIN__ - m_convFNOld = wxConvFileName; - wxConvFileName = &wxConvUTF8; -#endif - // Initialize local data wxFileName fn1(wxFileName::GetTempDir(), wxT("wx_file_mask.txt")); @@ -137,10 +123,6 @@ void FileFunctionsTestCase::tearDown() { wxRemoveFile(m_fileNameWork); } - -#ifndef __DARWIN__ - wxConvFileName = m_convFNOld; -#endif } void FileFunctionsTestCase::GetTempFolder() From 956edbb309239bd676715990d0ccda7561f6a844 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 02:57:20 +0100 Subject: [PATCH 5/6] Reimplement wxSafeConvertXXX() functions using wxWhateverWorksConv These functions were almost but not quite identical to it: wxSafeConvertMB2WX() tried the current locale encoding before UTF-8 while wxConvWhateverWorks tries UTF-8 first and then the current locale encoding. The latter behaviour is more correct as valid UTF-8 could be misinterpreted as some legacy multibyte encoding otherwise, so get rid of this difference and just forward these functions to wxConvWhateverWorks. 
--- include/wx/strconv.h | 13 ++++++++----- src/common/strconv.cpp | 30 ------------------------------ 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index 9aa87f8014..470632ca4a 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -687,12 +687,15 @@ extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI; // function which would crash if we passed NULL to it), so these functions // always return a valid pointer if their argument is non-NULL - // this function safety is achieved by trying wxConvLibc first, wxConvUTF8 - // next if it fails and, finally, wxConvISO8859_1 which always succeeds - extern WXDLLIMPEXP_BASE wxWCharBuffer wxSafeConvertMB2WX(const char *s); + inline wxWCharBuffer wxSafeConvertMB2WX(const char *s) + { + return wxConvWhateverWorks.cMB2WC(s); + } - // this function uses wxConvLibc and wxConvUTF8 if it fails - extern WXDLLIMPEXP_BASE wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws); + inline wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) + { + return wxConvWhateverWorks.cWC2MB(ws); + } #else // ANSI // no conversions to do #define wxConvertWX2MB(s) (s) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 2756870d55..0390dc3795 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -3320,36 +3320,6 @@ wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen, return rc; } -#if wxUSE_UNICODE - -wxWCharBuffer wxSafeConvertMB2WX(const char *s) -{ - if ( !s ) - return wxWCharBuffer(); - - wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s)); - if ( !wbuf ) - wbuf = wxConvUTF8.cMB2WX(s); - if ( !wbuf ) - wbuf = wxConvISO8859_1.cMB2WX(s); - - return wbuf; -} - -wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) -{ - if ( !ws ) - return wxCharBuffer(); - - wxCharBuffer buf(wxConvLibc.cWX2MB(ws)); - if ( !buf ) - buf = wxConvUTF8.cWX2MB(ws); - - return buf; -} - -#endif // wxUSE_UNICODE - // ---------------------------------------------------------------------------- // 
globals // ---------------------------------------------------------------------------- From 704055f200d97f327a8ee5212762b41bf1d6d503 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 19 Feb 2016 03:10:47 +0100 Subject: [PATCH 6/6] Don't lose data when converting wxExecute() arguments under Unix Use wxConvWhateverWorks when converting the command line given as a string to individual arguments: we already used wxSafeConvertWX2MB() when converting the arguments specified as an array, but not here. Closes #16206. --- src/unix/utilsunx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unix/utilsunx.cpp b/src/unix/utilsunx.cpp index 2a403b1a4d..a0db944983 100644 --- a/src/unix/utilsunx.cpp +++ b/src/unix/utilsunx.cpp @@ -400,7 +400,7 @@ public: for ( int i = 0; i < m_argc; i++ ) { - m_argv[i] = wxStrdup(args[i]); + m_argv[i] = wxStrdup(args[i].mb_str(wxConvWhateverWorks)); } }