diff --git a/include/wx/strconv.h b/include/wx/strconv.h index 207a235605..d344a5a911 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -32,6 +32,8 @@ class WXDLLIMPEXP_FWD_BASE wxString; // don't let the fact that the existing classes implement MB2WC/WC2MB() instead // confuse you. // +// For many encodings you must override GetMaxCharLen(). +// // You also have to implement Clone() to allow copying the conversions // polymorphically. // @@ -118,6 +120,10 @@ public: wxWCharBuffer cWX2WC(const char *psz) const { return cMB2WC(psz); } #endif // Unicode/ANSI + // return the maximum number of bytes that can be required to encode a + // single character in this encoding, e.g. 4 for UTF-8 + virtual size_t GetMaxCharLen() const { return 1; } + // this function is used in the implementation of cMB2WC() to distinguish // between the following cases: // @@ -254,6 +260,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF7; } private: @@ -341,6 +349,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvStrictUTF8(); } // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't @@ -365,6 +375,8 @@ public: virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF8(m_options); } // NB: other mapping modes are not, strictly speaking, UTF-8, so we can't @@ -405,6 +417,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16LE; } }; @@ -419,6 +432,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF16BE; } }; @@ -451,6 +465,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32LE; } }; @@ -465,6 +480,7 @@ public: const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConvUTF32BE; } }; @@ -566,6 +582,10 @@ public: FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE; + // Use the value for UTF-8 here to make sure we try to decode up to 4 bytes + // as UTF-8 before giving up. + virtual size_t GetMaxCharLen() const wxOVERRIDE { return 4; } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxWhateverWorksConv(); diff --git a/interface/wx/strconv.h b/interface/wx/strconv.h index 00dfb359e6..7a2055495f 100644 --- a/interface/wx/strconv.h +++ b/interface/wx/strconv.h @@ -48,6 +48,26 @@ public: */ virtual wxMBConv* Clone() const = 0; + /** + This function must be overridden in the derived classes to return the + maximum length, in bytes, of a single Unicode character representation + in this encoding. + + As a consequence, the conversion object must be able to decode any + valid sequence of bytes in the corresponding encoding if it's at least + that many bytes long, but may fail if it is shorter. For example, for + UTF-8 the maximum character length is 4, as 3 bytes or less may be + insufficient to represent a Unicode character in UTF-8, but 4 are + always enough. + + For compatibility reasons, this method is not pure virtual and returns + 1 by default in the base class, however it should be always overridden + in the derived classes. + + @since 3.1.3 + */ + virtual size_t GetMaxCharLen() const; + /** This function returns 1 for most of the multibyte encodings in which the string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index a2e831e47b..952b4455f5 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -309,6 +309,13 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) { + // we may need more bytes before we can decode the input, don't switch + // to the fall-back conversion in this case as it would prevent us from + // decoding UTF-8 input when fed it byte by byte, as done by + // wxTextInputStream, for example + if ( srcLen < m_conv->GetMaxCharLen() ) + return wxCONV_FAILED; + // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back if ( m_encDefault != wxFONTENCODING_MAX ) diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 814efb62b0..c38f7c29ab 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -303,12 +303,24 @@ wxString wxTextInputStream::ReadLine() { wxString line; - while ( !m_input.Eof() ) + for ( ;; ) { wxChar c = GetChar(); - if (!c) + if ( m_input.Eof() ) break; + if (!c) + { + // If we failed to get a character and the stream is not at EOF, it + // can only mean that decoding the stream contents using our + // conversion object failed. In this case, we must signal an error + // at the stream level, as otherwise the code using this function + // would never know that something went wrong and would continue + // calling it again and again, resulting in an infinite loop. + m_input.Reset(wxSTREAM_READ_ERROR); + break; + } + if (EatEOL(c)) break; diff --git a/src/common/utilscmn.cpp b/src/common/utilscmn.cpp index b1f9ffed46..2206f3d895 100644 --- a/src/common/utilscmn.cpp +++ b/src/common/utilscmn.cpp @@ -613,14 +613,7 @@ static bool ReadAll(wxInputStream *is, wxArrayString& output) // the stream could be already at EOF or in wxSTREAM_BROKEN_PIPE state is->Reset(); - // Notice that wxTextInputStream doesn't work correctly with wxConvAuto - // currently, see #14720, so use the current locale conversion explicitly - // under assumption that any external program should be using it too. - wxTextInputStream tis(*is, " \t" -#if wxUSE_UNICODE - , wxConvLibc -#endif - ); + wxTextInputStream tis(*is); for ( ;; ) { diff --git a/tests/exec/exec.cpp b/tests/exec/exec.cpp index f350376043..bb82e4c059 100644 --- a/tests/exec/exec.cpp +++ b/tests/exec/exec.cpp @@ -516,3 +516,28 @@ void ExecTestCase::TestOverlappedSyncExecute() CPPUNIT_ASSERT_EQUAL( SLEEP_END_STRING, longSleepOutput.Last() ); #endif // !__WINDOWS__ } + +#ifdef __UNIX__ + +// This test is disabled by default because it must be run in French locale, +// i.e. with explicit LC_ALL=fr_FR.UTF-8 and only works with GNU ls, which +// produces the expected output. +TEST_CASE("wxExecute::RedirectUTF8", "[exec][unicode][.]") +{ + wxArrayString output; + REQUIRE( wxExecute("/bin/ls --version", output) == 0 ); + + for ( size_t n = 0; n < output.size(); ++n ) + { + // It seems unlikely that this part of the output will change for GNU + // ls, so check for its presence as a sign that the program output was + // decoded correctly. + if ( output[n].find(wxString::FromUTF8("vous \xc3\xaates libre")) != wxString::npos ) + return; + } + + INFO("output was:\n" << wxJoin(output, '\n')); + FAIL("Expected output fragment not found."); +} + +#endif // __UNIX__