Merge pull request #219 from vadz/whatever-conv

Fall back on UTF-8 when converting Unicode to multibyte fails

This is not ideal, but better than just losing data entirely.
This commit is contained in:
VZ
2016-02-19 17:10:52 +01:00
7 changed files with 100 additions and 57 deletions

View File

@@ -543,6 +543,39 @@ private:
wxMBConv *m_convReal; wxMBConv *m_convReal;
}; };
// ----------------------------------------------------------------------------
// wxWhateverWorksConv: use whatever encoding works for the input
// ----------------------------------------------------------------------------
// Converter which tries several encodings in turn instead of giving up as
// soon as the first one fails; the order differs for each direction and is
// described at the corresponding method below.
class WXDLLIMPEXP_BASE wxWhateverWorksConv : public wxMBConv
{
public:
// Nothing to initialize: this class declares no data members of its own.
wxWhateverWorksConv()
{
}
// Multibyte-to-wide direction.
//
// Try to interpret the string as UTF-8, if it fails fall back to the
// current locale encoding (wxConvLibc) and if this fails as well,
// interpret it as wxConvISO8859_1 (which is used because it never fails
// and this conversion is used when we really, really must produce
// something on output).
virtual size_t
ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
// Wide-to-multibyte direction.
//
// Try to encode the string using the current locale encoding (wxConvLibc)
// and fall back to UTF-8 (which never fails) if it doesn't work. Note that
// we never use wxConvISO8859_1 here as we prefer to fall back on UTF-8
// even for the strings containing only code points representable in
// ISO 8859-1.
virtual size_t
FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
// Return a newly allocated converter of this same class.
// NOTE(review): presumably the caller is responsible for deleting the
// returned object -- confirm against the wxMBConv::Clone() contract.
virtual wxMBConv *Clone() const wxOVERRIDE
{
return new wxWhateverWorksConv();
}
};
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// declare predefined conversion objects // declare predefined conversion objects
@@ -578,6 +611,12 @@ WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7) WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
#define wxConvUTF7 wxGet_wxConvUTF7() #define wxConvUTF7 wxGet_wxConvUTF7()
// conversion used when we may not afford to lose data when outputting Unicode
// strings (should be avoided in the other direction as it can misinterpret the
// input encoding)
WX_DECLARE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks)
#define wxConvWhateverWorks wxGet_wxConvWhateverWorks()
// conversion used for the file names on the systems where they're not Unicode // conversion used for the file names on the systems where they're not Unicode
// (basically anything except Windows) // (basically anything except Windows)
// //
@@ -648,12 +687,15 @@ extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI;
// function which would crash if we passed NULL to it), so these functions // function which would crash if we passed NULL to it), so these functions
// always return a valid pointer if their argument is non-NULL // always return a valid pointer if their argument is non-NULL
// this function safety is achieved by trying wxConvLibc first, wxConvUTF8 inline wxWCharBuffer wxSafeConvertMB2WX(const char *s)
// next if it fails and, finally, wxConvISO8859_1 which always succeeds {
extern WXDLLIMPEXP_BASE wxWCharBuffer wxSafeConvertMB2WX(const char *s); return wxConvWhateverWorks.cMB2WC(s);
}
// this function uses wxConvLibc and wxConvUTF8 if it fails inline wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
extern WXDLLIMPEXP_BASE wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws); {
return wxConvWhateverWorks.cWC2MB(ws);
}
#else // ANSI #else // ANSI
// no conversions to do // no conversions to do
#define wxConvertWX2MB(s) (s) #define wxConvertWX2MB(s) (s)

View File

@@ -483,6 +483,30 @@ public:
bool IsOk() const; bool IsOk() const;
}; };
/**
Conversion object always producing non-empty output for non-empty input.
Conversions done using this object never lose data, at the cost of possibly
producing the output in an unwanted encoding or misinterpreting input
encoding.
To be precise, converting Unicode to multibyte strings using this object
tries to use the current locale encoding first but if this doesn't work, it
falls back to using UTF-8. In the other direction, UTF-8 is tried first,
then the current locale encoding and if this fails too, input is
interpreted as using ISO 8859-1, which never fails.
It is almost always @e wrong to use this converter for multibyte-to-Unicode
direction as the program should know which encoding the input data is
supposed to use and use the appropriate converter instead. However it may
be useful in the Unicode-to-multibyte direction if the goal is to produce
the output in the current locale encoding if possible, but still output
something, instead of nothing at all, even if the Unicode string is not
representable in this encoding.
@since 3.1.0
*/
extern wxMBConv& wxConvWhateverWorks;
/** /**

View File

@@ -148,13 +148,8 @@ wxString wxMessageOutputStderr::AppendLineFeedIfNeeded(const wxString& str)
void wxMessageOutputStderr::Output(const wxString& str) void wxMessageOutputStderr::Output(const wxString& str)
{ {
const wxString strWithLF = AppendLineFeedIfNeeded(str); const wxString strWithLF = AppendLineFeedIfNeeded(str);
const wxWX2MBbuf buf = strWithLF.mb_str();
if ( buf )
fprintf(m_fp, "%s", (const char*) buf);
else // print at least something
fprintf(m_fp, "%s", (const char*) strWithLF.ToAscii());
fprintf(m_fp, "%s", (const char*) strWithLF.mb_str(wxConvWhateverWorks));
fflush(m_fp); fflush(m_fp);
} }

View File

@@ -3286,36 +3286,40 @@ bool wxCSConv::IsUTF8() const
#endif #endif
#if wxUSE_UNICODE // ============================================================================
// wxWhateverWorksConv
// ============================================================================
wxWCharBuffer wxSafeConvertMB2WX(const char *s) size_t
wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{ {
if ( !s ) size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen);
return wxWCharBuffer(); if ( rc != wxCONV_FAILED )
return rc;
wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s)); rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen);
if ( !wbuf ) if ( rc != wxCONV_FAILED )
wbuf = wxConvUTF8.cMB2WX(s); return rc;
if ( !wbuf )
wbuf = wxConvISO8859_1.cMB2WX(s);
return wbuf; rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen);
return rc;
} }
wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws) size_t
wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{ {
if ( !ws ) size_t rc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen);
return wxCharBuffer(); if ( rc != wxCONV_FAILED )
return rc;
wxCharBuffer buf(wxConvLibc.cWX2MB(ws)); rc = wxConvUTF8.FromWChar(dst, dstLen, src, srcLen);
if ( !buf )
buf = wxConvUTF8.cWX2MB(ws);
return buf; return rc;
} }
#endif // wxUSE_UNICODE
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// globals // globals
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
@@ -3330,6 +3334,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
#undef wxConvLibc #undef wxConvLibc
#undef wxConvUTF8 #undef wxConvUTF8
#undef wxConvUTF7 #undef wxConvUTF7
#undef wxConvWhateverWorks
#undef wxConvLocal #undef wxConvLocal
#undef wxConvISO8859_1 #undef wxConvISO8859_1
@@ -3369,6 +3374,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
// empty statement (and hope that no compilers warns about this) // empty statement (and hope that no compilers warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;); WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;); WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, ;);
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM)); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1)); WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
@@ -3387,5 +3393,5 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__ #ifdef __DARWIN__
&wxConvMacUTF8DObj; &wxConvMacUTF8DObj;
#else // !__DARWIN__ #else // !__DARWIN__
wxGet_wxConvLibcPtr(); wxGet_wxConvWhateverWorksPtr();
#endif // __DARWIN__/!__DARWIN__ #endif // __DARWIN__/!__DARWIN__

View File

@@ -191,13 +191,7 @@ static wxStrCacheStatsDumper s_showCacheStats;
wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str) wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str)
{ {
#if wxUSE_UNICODE && !wxUSE_UNICODE_UTF8 #if wxUSE_UNICODE && !wxUSE_UNICODE_UTF8
const wxScopedCharBuffer buf(str.AsCharBuf()); return os << wxConvWhateverWorks.cWX2MB(str);
if ( !buf )
os.clear(wxSTD ios_base::failbit);
else
os << buf.data();
return os;
#else #else
return os << str.AsInternal(); return os << str.AsInternal();
#endif #endif

View File

@@ -400,7 +400,7 @@ public:
for ( int i = 0; i < m_argc; i++ ) for ( int i = 0; i < m_argc; i++ )
{ {
m_argv[i] = wxStrdup(args[i]); m_argv[i] = wxStrdup(args[i].mb_str(wxConvWhateverWorks));
} }
} }

View File

@@ -78,10 +78,6 @@ private:
wxString m_fileNameNonASCII; wxString m_fileNameNonASCII;
wxString m_fileNameWork; wxString m_fileNameWork;
#ifndef __DARWIN__
wxMBConv* m_convFNOld;
#endif
wxDECLARE_NO_COPY_CLASS(FileFunctionsTestCase); wxDECLARE_NO_COPY_CLASS(FileFunctionsTestCase);
}; };
@@ -98,16 +94,6 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( FileFunctionsTestCase, "FileFunctionsTest
void FileFunctionsTestCase::setUp() void FileFunctionsTestCase::setUp()
{ {
// Under Unix we need to use UTF-8 for the tests using non-ASCII filenames
// and this is not necessarily the case because the tests don't call
// setlocale(LC_ALL, ""), so ensure it explicitly. This is just a temporary
// hack until we find the solution to make the library work with Unicode
// filenames irrespectively of the current locale.
#ifndef __DARWIN__
m_convFNOld = wxConvFileName;
wxConvFileName = &wxConvUTF8;
#endif
// Initialize local data // Initialize local data
wxFileName fn1(wxFileName::GetTempDir(), wxT("wx_file_mask.txt")); wxFileName fn1(wxFileName::GetTempDir(), wxT("wx_file_mask.txt"));
@@ -137,10 +123,6 @@ void FileFunctionsTestCase::tearDown()
{ {
wxRemoveFile(m_fileNameWork); wxRemoveFile(m_fileNameWork);
} }
#ifndef __DARWIN__
wxConvFileName = m_convFNOld;
#endif
} }
void FileFunctionsTestCase::GetTempFolder() void FileFunctionsTestCase::GetTempFolder()