Merge pull request #219 from vadz/whatever-conv

Fall back on UTF-8 when converting Unicode to multibyte fails

This is not ideal, but better than just losing data entirely.
This commit is contained in:
VZ
2016-02-19 17:10:52 +01:00
7 changed files with 100 additions and 57 deletions

View File

@@ -543,6 +543,39 @@ private:
wxMBConv *m_convReal;
};
// ----------------------------------------------------------------------------
// wxWhateverWorksConv: use whatever encoding works for the input
// ----------------------------------------------------------------------------
class WXDLLIMPEXP_BASE wxWhateverWorksConv : public wxMBConv
{
public:
// Nothing to initialize: the class declares no data members.
wxWhateverWorksConv()
{
}
// Try to interpret the string as UTF-8, if it fails fall back to the
// current locale encoding (wxConvLibc) and if this fails as well,
// interpret it as wxConvISO8859_1 (which is used because it never fails
// and this conversion is used when we really, really must produce
// something on output).
//
// Returns the result of the first of these conversions which doesn't
// return wxCONV_FAILED or, if the first two failed, whatever the
// wxConvISO8859_1 conversion returns.
virtual size_t
ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
// Try to encode the string using the current locale encoding (wxConvLibc)
// and fall back to UTF-8 (which never fails) if it doesn't work. Note that
// we never use wxConvISO8859_1 here as we prefer to fall back on UTF-8
// even for the strings containing only code points representable in
// ISO 8859-1.
//
// Returns the result of the wxConvLibc conversion unless it is
// wxCONV_FAILED, in which case the result of the UTF-8 conversion is
// returned instead.
virtual size_t
FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
// Return a new heap-allocated converter of this same class; there is no
// state to copy.
virtual wxMBConv *Clone() const wxOVERRIDE
{
return new wxWhateverWorksConv();
}
};
// ----------------------------------------------------------------------------
// declare predefined conversion objects
@@ -578,6 +611,12 @@ WX_DECLARE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8)
WX_DECLARE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7)
#define wxConvUTF7 wxGet_wxConvUTF7()
// conversion used when we may not afford to lose data when outputting Unicode
// strings (should be avoided in the other direction as it can misinterpret the
// input encoding)
WX_DECLARE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks)
#define wxConvWhateverWorks wxGet_wxConvWhateverWorks()
// conversion used for the file names on the systems where they're not Unicode
// (basically anything except Windows)
//
@@ -648,12 +687,15 @@ extern WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI;
// function which would crash if we passed NULL to it), so these functions
// always return a valid pointer if their argument is non-NULL
// this function safety is achieved by trying wxConvLibc first, wxConvUTF8
// next if it fails and, finally, wxConvISO8859_1 which always succeeds
extern WXDLLIMPEXP_BASE wxWCharBuffer wxSafeConvertMB2WX(const char *s);
// Convert the multibyte string s to wide characters by forwarding to
// wxConvWhateverWorks.cMB2WC(), i.e. using the wxWhateverWorksConv converter.
inline wxWCharBuffer wxSafeConvertMB2WX(const char *s)
{
return wxConvWhateverWorks.cMB2WC(s);
}
// this function uses wxConvLibc and wxConvUTF8 if it fails
extern WXDLLIMPEXP_BASE wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws);
// Convert the wide character string ws to multibyte by forwarding to
// wxConvWhateverWorks.cWC2MB(), i.e. using the wxWhateverWorksConv converter.
inline wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
{
return wxConvWhateverWorks.cWC2MB(ws);
}
#else // ANSI
// no conversions to do
#define wxConvertWX2MB(s) (s)

View File

@@ -483,6 +483,30 @@ public:
bool IsOk() const;
};
/**
Conversion object always producing non-empty output for non-empty input.
Conversions done using this object never lose data, at the cost of possibly
producing the output in an unwanted encoding or misinterpreting input
encoding.
To be precise, converting Unicode to multibyte strings using this object
tries to use the current locale encoding first but if this doesn't work, it
falls back to using UTF-8. In the other direction, UTF-8 is tried first,
then the current locale encoding and if this fails too, input is
interpreted as using ISO 8859-1, which never fails.
It is almost always @e wrong to use this converter for the multibyte-to-Unicode
direction as the program should know which encoding the input data is
supposed to use and use the appropriate converter instead. However it may
be useful in the Unicode-to-multibyte direction if the goal is to produce
the output in the current locale encoding if possible, but still output
something, instead of nothing at all, even if the Unicode string is not
representable in this encoding.
@since 3.1.0
*/
extern wxMBConv& wxConvWhateverWorks;
/**

View File

@@ -148,13 +148,8 @@ wxString wxMessageOutputStderr::AppendLineFeedIfNeeded(const wxString& str)
void wxMessageOutputStderr::Output(const wxString& str)
{
const wxString strWithLF = AppendLineFeedIfNeeded(str);
const wxWX2MBbuf buf = strWithLF.mb_str();
if ( buf )
fprintf(m_fp, "%s", (const char*) buf);
else // print at least something
fprintf(m_fp, "%s", (const char*) strWithLF.ToAscii());
fprintf(m_fp, "%s", (const char*) strWithLF.mb_str(wxConvWhateverWorks));
fflush(m_fp);
}

View File

@@ -3286,36 +3286,40 @@ bool wxCSConv::IsUTF8() const
#endif
#if wxUSE_UNICODE
// ============================================================================
// wxWhateverWorksConv
// ============================================================================
wxWCharBuffer wxSafeConvertMB2WX(const char *s)
size_t
wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
if ( !s )
return wxWCharBuffer();
size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen);
if ( rc != wxCONV_FAILED )
return rc;
wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
if ( !wbuf )
wbuf = wxConvUTF8.cMB2WX(s);
if ( !wbuf )
wbuf = wxConvISO8859_1.cMB2WX(s);
rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen);
if ( rc != wxCONV_FAILED )
return rc;
return wbuf;
rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen);
return rc;
}
wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
size_t
wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( !ws )
return wxCharBuffer();
size_t rc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen);
if ( rc != wxCONV_FAILED )
return rc;
wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
if ( !buf )
buf = wxConvUTF8.cWX2MB(ws);
rc = wxConvUTF8.FromWChar(dst, dstLen, src, srcLen);
return buf;
return rc;
}
#endif // wxUSE_UNICODE
// ----------------------------------------------------------------------------
// globals
// ----------------------------------------------------------------------------
@@ -3330,6 +3334,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvWhateverWorks
#undef wxConvLocal
#undef wxConvISO8859_1
@@ -3369,6 +3374,7 @@ wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
// empty statement (and hope that no compilers warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, ;);
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
@@ -3387,5 +3393,5 @@ WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
&wxConvMacUTF8DObj;
#else // !__DARWIN__
wxGet_wxConvLibcPtr();
wxGet_wxConvWhateverWorksPtr();
#endif // __DARWIN__/!__DARWIN__

View File

@@ -191,13 +191,7 @@ static wxStrCacheStatsDumper s_showCacheStats;
wxSTD ostream& operator<<(wxSTD ostream& os, const wxCStrData& str)
{
#if wxUSE_UNICODE && !wxUSE_UNICODE_UTF8
const wxScopedCharBuffer buf(str.AsCharBuf());
if ( !buf )
os.clear(wxSTD ios_base::failbit);
else
os << buf.data();
return os;
return os << wxConvWhateverWorks.cWX2MB(str);
#else
return os << str.AsInternal();
#endif

View File

@@ -400,7 +400,7 @@ public:
for ( int i = 0; i < m_argc; i++ )
{
m_argv[i] = wxStrdup(args[i]);
m_argv[i] = wxStrdup(args[i].mb_str(wxConvWhateverWorks));
}
}

View File

@@ -78,10 +78,6 @@ private:
wxString m_fileNameNonASCII;
wxString m_fileNameWork;
#ifndef __DARWIN__
wxMBConv* m_convFNOld;
#endif
wxDECLARE_NO_COPY_CLASS(FileFunctionsTestCase);
};
@@ -98,16 +94,6 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( FileFunctionsTestCase, "FileFunctionsTest
void FileFunctionsTestCase::setUp()
{
// Under Unix we need to use UTF-8 for the tests using non-ASCII filenames
// and this is not necessarily the case because the tests don't call
// setlocale(LC_ALL, ""), so ensure it explicitly. This is just a temporary
// hack until we find the solution to make the library work with Unicode
// filenames irrespectively of the current locale.
#ifndef __DARWIN__
m_convFNOld = wxConvFileName;
wxConvFileName = &wxConvUTF8;
#endif
// Initialize local data
wxFileName fn1(wxFileName::GetTempDir(), wxT("wx_file_mask.txt"));
@@ -137,10 +123,6 @@ void FileFunctionsTestCase::tearDown()
{
wxRemoveFile(m_fileNameWork);
}
#ifndef __DARWIN__
wxConvFileName = m_convFNOld;
#endif
}
void FileFunctionsTestCase::GetTempFolder()