Merge branch 'utf8-text-stream'
Really fix reading from UTF-8 text streams. Closes #14720. See https://github.com/wxWidgets/wxWidgets/pull/1304
This commit is contained in:
@@ -32,6 +32,8 @@ class WXDLLIMPEXP_FWD_BASE wxString;
|
|||||||
// don't let the fact that the existing classes implement MB2WC/WC2MB() instead
|
// don't let the fact that the existing classes implement MB2WC/WC2MB() instead
|
||||||
// confuse you.
|
// confuse you.
|
||||||
//
|
//
|
||||||
|
// For many encodings you must override GetMaxCharLen().
|
||||||
|
//
|
||||||
// You also have to implement Clone() to allow copying the conversions
|
// You also have to implement Clone() to allow copying the conversions
|
||||||
// polymorphically.
|
// polymorphically.
|
||||||
//
|
//
|
||||||
@@ -118,6 +120,10 @@ public:
|
|||||||
wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); }
|
wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); }
|
||||||
#endif // Unicode/ANSI
|
#endif // Unicode/ANSI
|
||||||
|
|
||||||
|
// return the maximum number of bytes that can be required to encode a
|
||||||
|
// single character in this encoding, e.g. 4 for UTF-8
|
||||||
|
virtual size_t GetMaxCharLen() const { return 1; }
|
||||||
|
|
||||||
// this function is used in the implementation of cMB2WC() to distinguish
|
// this function is used in the implementation of cMB2WC() to distinguish
|
||||||
// between the following cases:
|
// between the following cases:
|
||||||
//
|
//
|
||||||
@@ -254,6 +260,8 @@ public:
|
|||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
|
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF7; }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF7; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@@ -341,6 +349,8 @@ public:
|
|||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
|
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvStrictUTF8(); }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvStrictUTF8(); }
|
||||||
|
|
||||||
// NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
|
// NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
|
||||||
@@ -365,6 +375,8 @@ public:
|
|||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
|
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF8(m_options); }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF8(m_options); }
|
||||||
|
|
||||||
// NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
|
// NB: other mapping modes are not, strictly speaking, UTF-8, so we can't
|
||||||
@@ -405,6 +417,7 @@ public:
|
|||||||
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16LE; }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16LE; }
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -419,6 +432,7 @@ public:
|
|||||||
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16BE; }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16BE; }
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -451,6 +465,7 @@ public:
|
|||||||
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32LE; }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32LE; }
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -465,6 +480,7 @@ public:
|
|||||||
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
virtual size_t FromWChar(char *dst, size_t dstLen,
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32BE; }
|
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32BE; }
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -566,6 +582,10 @@ public:
|
|||||||
FromWChar(char *dst, size_t dstLen,
|
FromWChar(char *dst, size_t dstLen,
|
||||||
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
|
||||||
|
|
||||||
|
// Use the value for UTF-8 here to make sure we try to decode up to 4 bytes
|
||||||
|
// as UTF-8 before giving up.
|
||||||
|
virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; }
|
||||||
|
|
||||||
virtual wxMBConv *Clone() const wxOVERRIDE
|
virtual wxMBConv *Clone() const wxOVERRIDE
|
||||||
{
|
{
|
||||||
return new wxWhateverWorksConv();
|
return new wxWhateverWorksConv();
|
||||||
|
@@ -48,6 +48,26 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual wxMBConv* Clone() const = 0;
|
virtual wxMBConv* Clone() const = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
This function must be overridden in the derived classes to return the
|
||||||
|
maximum length, in bytes, of a single Unicode character representation
|
||||||
|
in this encoding.
|
||||||
|
|
||||||
|
As a consequence, the conversion object must be able to decode any
|
||||||
|
valid sequence of bytes in the corresponding encoding if it's at least
|
||||||
|
that many bytes long, but may fail if it is shorter. For example, for
|
||||||
|
UTF-8 the maximum character length is 4, as 3 bytes or less may be
|
||||||
|
insufficient to represent a Unicode character in UTF-8, but 4 are
|
||||||
|
always enough.
|
||||||
|
|
||||||
|
For compatibility reasons, this method is not pure virtual and returns
|
||||||
|
1 by default in the base class, however it should always be overridden
|
||||||
|
in the derived classes.
|
||||||
|
|
||||||
|
@since 3.1.3
|
||||||
|
*/
|
||||||
|
virtual size_t GetMaxCharLen() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
This function returns 1 for most of the multibyte encodings in which the
|
This function returns 1 for most of the multibyte encodings in which the
|
||||||
string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for
|
string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for
|
||||||
|
@@ -309,6 +309,13 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
|||||||
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||||
if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
|
if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
|
||||||
{
|
{
|
||||||
|
// we may need more bytes before we can decode the input, don't switch
|
||||||
|
// to the fall-back conversion in this case as it would prevent us from
|
||||||
|
// decoding UTF-8 input when fed it byte by byte, as done by
|
||||||
|
// wxTextInputStream, for example
|
||||||
|
if ( srcLen < m_conv->GetMaxCharLen() )
|
||||||
|
return wxCONV_FAILED;
|
||||||
|
|
||||||
// if the conversion failed but we didn't really detect anything and
|
// if the conversion failed but we didn't really detect anything and
|
||||||
// simply tried UTF-8 by default, retry it using the fall-back
|
// simply tried UTF-8 by default, retry it using the fall-back
|
||||||
if ( m_encDefault != wxFONTENCODING_MAX )
|
if ( m_encDefault != wxFONTENCODING_MAX )
|
||||||
|
@@ -303,12 +303,24 @@ wxString wxTextInputStream::ReadLine()
|
|||||||
{
|
{
|
||||||
wxString line;
|
wxString line;
|
||||||
|
|
||||||
while ( !m_input.Eof() )
|
for ( ;; )
|
||||||
{
|
{
|
||||||
wxChar c = GetChar();
|
wxChar c = GetChar();
|
||||||
if (!c)
|
if ( m_input.Eof() )
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
if (!c)
|
||||||
|
{
|
||||||
|
// If we failed to get a character and the stream is not at EOF, it
|
||||||
|
// can only mean that decoding the stream contents using our
|
||||||
|
// conversion object failed. In this case, we must signal an error
|
||||||
|
// at the stream level, as otherwise the code using this function
|
||||||
|
// would never know that something went wrong and would continue
|
||||||
|
// calling it again and again, resulting in an infinite loop.
|
||||||
|
m_input.Reset(wxSTREAM_READ_ERROR);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (EatEOL(c))
|
if (EatEOL(c))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@@ -613,14 +613,7 @@ static bool ReadAll(wxInputStream *is, wxArrayString& output)
|
|||||||
// the stream could be already at EOF or in wxSTREAM_BROKEN_PIPE state
|
// the stream could be already at EOF or in wxSTREAM_BROKEN_PIPE state
|
||||||
is->Reset();
|
is->Reset();
|
||||||
|
|
||||||
// Notice that wxTextInputStream doesn't work correctly with wxConvAuto
|
wxTextInputStream tis(*is);
|
||||||
// currently, see #14720, so use the current locale conversion explicitly
|
|
||||||
// under assumption that any external program should be using it too.
|
|
||||||
wxTextInputStream tis(*is, " \t"
|
|
||||||
#if wxUSE_UNICODE
|
|
||||||
, wxConvLibc
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
|
|
||||||
for ( ;; )
|
for ( ;; )
|
||||||
{
|
{
|
||||||
|
@@ -516,3 +516,28 @@ void ExecTestCase::TestOverlappedSyncExecute()
|
|||||||
CPPUNIT_ASSERT_EQUAL( SLEEP_END_STRING, longSleepOutput.Last() );
|
CPPUNIT_ASSERT_EQUAL( SLEEP_END_STRING, longSleepOutput.Last() );
|
||||||
#endif // !__WINDOWS__
|
#endif // !__WINDOWS__
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef __UNIX__
|
||||||
|
|
||||||
|
// This test is disabled by default because it must be run in French locale,
|
||||||
|
// i.e. with explicit LC_ALL=fr_FR.UTF-8 and only works with GNU ls, which
|
||||||
|
// produces the expected output.
|
||||||
|
TEST_CASE("wxExecute::RedirectUTF8", "[exec][unicode][.]")
|
||||||
|
{
|
||||||
|
wxArrayString output;
|
||||||
|
REQUIRE( wxExecute("/bin/ls --version", output) == 0 );
|
||||||
|
|
||||||
|
for ( size_t n = 0; n < output.size(); ++n )
|
||||||
|
{
|
||||||
|
// It seems unlikely that this part of the output will change for GNU
|
||||||
|
// ls, so check for its presence as a sign that the program output was
|
||||||
|
// decoded correctly.
|
||||||
|
if ( output[n].find(wxString::FromUTF8("vous \xc3\xaates libre")) != wxString::npos )
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
INFO("output was:\n" << wxJoin(output, '\n'));
|
||||||
|
FAIL("Expected output fragment not found.");
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __UNIX__
|
||||||
|
Reference in New Issue
Block a user