diff --git a/include/wx/convauto.h b/include/wx/convauto.h index 3da6c6adc4..d7ed45592f 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -78,6 +78,8 @@ public: virtual size_t GetMBNulLen() const wxOVERRIDE { return m_conv->GetMBNulLen(); } + virtual bool IsUTF8() const wxOVERRIDE { return m_conv && m_conv->IsUTF8(); } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxConvAuto(*this); } // return the BOM type of this buffer @@ -91,6 +93,14 @@ public: return m_bomType; } + wxFontEncoding GetEncoding() const; + + // Return true if the fall-back encoding is used + bool IsUsingFallbackEncoding() const + { + return m_ownsConv && m_bomType == wxBOM_None; + } + private: // common part of all ctors void Init() diff --git a/include/wx/private/unicode.h b/include/wx/private/unicode.h new file mode 100644 index 0000000000..6c81c23504 --- /dev/null +++ b/include/wx/private/unicode.h @@ -0,0 +1,16 @@ +///////////////////////////////////////////////////////////////////////////// +// Name: wx/private/unicode.h +// Purpose: Unicode private declarations +// Author: Pavel Tyunin +// Created: 2020-10-06 +// Copyright: (c) 2020 Pavel Tyunin +// Licence: wxWindows licence +///////////////////////////////////////////////////////////////////////////// + +#ifndef _WX_PRIVATE_UNICODEH__ +#define _WX_PRIVATE_UNICODEH__ + +// this table gives the length of the UTF-8 encoding from its first character: +extern const unsigned char tableUtf8Lengths[256]; + +#endif // _WX_PRIVATE_UNICODEH__ diff --git a/include/wx/stringops.h b/include/wx/stringops.h index 150554f341..dd46e6616c 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4); } - // table of offsets to skip forward when iterating over UTF-8 sequence - static const unsigned char ms_utf8IterTable[256]; + // returns offset to skip forward when iterating over UTF-8 sequence + static unsigned 
char GetUTF8IterOffset(unsigned char c); template static void IncIter(Iterator& i) { wxASSERT( IsValidUtf8LeadByte(*i) ); - i += ms_utf8IterTable[(unsigned char)*i]; + i += GetUTF8IterOffset(*i); } template @@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 static size_t GetUtf8CharLength(char c) { wxASSERT( IsValidUtf8LeadByte(c) ); - return ms_utf8IterTable[(unsigned char)c]; + return GetUTF8IterOffset(c); } // decodes single UTF-8 character from UTF-8 string diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h index 7ddfb26927..324b5b24e7 100644 --- a/interface/wx/convauto.h +++ b/interface/wx/convauto.h @@ -146,6 +146,22 @@ public: */ wxBOM GetBOM() const; + /** + Return the detected encoding. + + Returns @c wxFONTENCODING_MAX if called before the first use. + + @since 3.1.5 + */ + wxFontEncoding GetEncoding() const; + + /** + Check if the fall-back encoding is used. + + @since 3.1.5 + */ + bool IsUsingFallbackEncoding() const; + /** Return a pointer to the characters that makes up this BOM. 
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 952b4455f5..708096de5d 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -23,6 +23,7 @@ #endif #include "wx/convauto.h" +#include "wx/private/unicode.h" // we use latin1 by default as it seems the least bad choice: the files we need // to detect input of don't always come from the user system (they are often @@ -266,6 +267,23 @@ bool wxConvAuto::InitFromInput(const char *src, size_t len) return true; } +// checks if the non-empty input is a truncated, so far valid, UTF-8 sequence +static bool wxCanBeUTF8SequencePrefix(const char *src, size_t len) +{ + size_t i = 0; + unsigned char l = tableUtf8Lengths[(unsigned char)src[i]]; + if ( !l ) + return false; // invalid leading byte + while ( --l ) + { + if ( ++i == len ) + return true; // truncated but valid so far + if ( (src[i] & 0xC0) != 0x80 ) + return false; // invalid continuation byte + } + return false; // complete sequence, nothing is missing +} + size_t wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen) const @@ -307,25 +325,28 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // try to convert using the auto-detected encoding size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); - if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) + if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None && !m_ownsConv ) { // we may need more bytes before we can decode the input, don't switch // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example - if ( srcLen < m_conv->GetMaxCharLen() ) + // up to 2 extra bytes are needed for inputs that start with null bytes + // that look like the start of UTF-32BE BOM, but can be in UTF-8 too + size_t nNull = 0; + if ( srcLen != wxNO_LEN && srcLen >= 2 && !src[0] ) + nNull = ( src[1]? 
1 : 2 ); + if ( srcLen < nNull + m_conv->GetMaxCharLen() && + wxCanBeUTF8SequencePrefix(src + nNull, srcLen - nNull) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back + if ( m_encDefault == wxFONTENCODING_DEFAULT ) + self->m_encDefault = GetFallbackEncoding(); if ( m_encDefault != wxFONTENCODING_MAX ) { - if ( m_ownsConv ) - delete m_conv; - - self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT - ? GetFallbackEncoding() - : m_encDefault); + self->m_conv = new wxCSConv(m_encDefault); self->m_ownsConv = true; rc = m_conv->ToWChar(dst, dstLen, src, srcLen); @@ -351,3 +372,32 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, return m_conv->FromWChar(dst, dstLen, src, srcLen); } + +wxFontEncoding wxConvAuto::GetEncoding() const +{ + switch ( m_bomType ) + { + case wxBOM_UTF32BE: + return wxFONTENCODING_UTF32BE; + case wxBOM_UTF32LE: + return wxFONTENCODING_UTF32LE; + case wxBOM_UTF16BE: + return wxFONTENCODING_UTF16BE; + case wxBOM_UTF16LE: + return wxFONTENCODING_UTF16LE; + case wxBOM_UTF8: + return wxFONTENCODING_UTF8; + + case wxBOM_Unknown: + case wxBOM_None: + if ( !m_conv ) + return wxFONTENCODING_MAX; + else if ( !m_ownsConv ) + return wxFONTENCODING_UTF8; + else + return m_encDefault; + } + + wxFAIL_MSG( "unknown BOM type" ); + return wxFONTENCODING_MAX; +} diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index ba25dae157..23c2b0a545 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -46,6 +46,7 @@ #include "wx/encconv.h" #include "wx/fontmap.h" +#include "wx/private/unicode.h" #ifdef __DARWIN__ #include "wx/osx/core/private/strconv_cf.h" @@ -921,7 +922,7 @@ const wxUint32 wxUnicodePUA = 0x100000; const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; // this table gives the length of the UTF-8 encoding from its first character: -const unsigned char tableUtf8Lengths[256] = { +extern const unsigned char 
tableUtf8Lengths[256] = { // single-byte sequences (ASCII): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index 85629406a3..84017ae523 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -23,6 +23,8 @@ #include "wx/stringops.h" #endif +#include "wx/private/unicode.h" + // =========================================================================== // implementation // =========================================================================== @@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& // UTF-8 sequences lengths // --------------------------------------------------------------------------- -const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { - // single-byte sequences (ASCII): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F - - // these are invalid, we use step 1 to skip - // over them (should never happen): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF - 1, 1, // C0,C1 - - // two-byte sequences: - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF - - // three-byte sequences: - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF - - // four-byte sequences: - 4, 
4, 4, 4, 4, // F0..F4 - - // these are invalid again (5- or 6-byte - // sequences and sequences for code points - // above U+10FFFF, as restricted by RFC 3629): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF -}; +unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c) +{ + unsigned char l = tableUtf8Lengths[c]; + if ( !l ) // zero means an invalid lead byte: skip it as a single byte + l = 1; + return l; +} // --------------------------------------------------------------------------- // UTF-8 operations @@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len) { // if the string is not NULL-terminated, verify we have enough // bytes in it left for current character's encoding: - if ( c + ms_utf8IterTable[*c] > end ) + if ( c + GetUTF8IterOffset(*c) > end ) return false; } @@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch) { Utf8CharBuffer once(EncodeChar(ch)); // the IncIter() table can be used to determine the length of ch's encoding: - size_t len = ms_utf8IterTable[(unsigned char)once.data[0]]; + size_t len = GetUTF8IterOffset(once.data[0]); wxCharBuffer buf(n * len); char *ptr = buf.data(); diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index c38f7c29ab..1332bb3cf0 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar() m_validEnd = 0; } - // We may need to decode up to 4 characters if we have input starting with - // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will - // only succeed when 4 bytes are read -- and will yield 4 wide characters. - wxChar wbuf[4]; + // We may need to decode up to 6 characters if we have input starting with + // 2 null bytes (like in UTF-32BE BOM), and then 3 bytes that look like + // the start of UTF-8 sequence, as decoding it will only succeed when + // 6 bytes are read -- and will yield 6 wide characters. 
+ wxChar wbuf[6]; for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) { if ( inlen >= m_validEnd ) @@ -134,12 +135,13 @@ wxChar wxTextInputStream::GetChar() // one extra byte, the only explanation is that we were using a // wxConvAuto conversion recognizing the initial BOM and that // it couldn't detect the presence or absence of BOM so far, - // but now finally has enough data to see that there is none. - // As we must have fallen back to Latin-1 in this case, return - // just the first byte and keep the other ones for the next - // time. - m_validBegin = 1; - return wbuf[0]; + // but now finally has enough data to see that there is none, or + // it was trying to decode the data as a UTF-8 sequence, but now + // recognized that it's not valid UTF-8 and switched to fallback. + // We don't know how long the first character is or if it's decoded + // as 1 or 2 wchar_t characters, so we need to start with 1 byte again. + inlen = static_cast<size_t>(-1); // becomes 0 after the loop's inlen++ + break; #if SIZEOF_WCHAR_T == 2 case 2: diff --git a/src/common/ustring.cpp b/src/common/ustring.cpp index 6e1768064b..531ee41b9c 100644 --- a/src/common/ustring.cpp +++ b/src/common/ustring.cpp @@ -15,6 +15,7 @@ #endif #include "wx/ustring.h" +#include "wx/private/unicode.h" #ifndef WX_PRECOMP #include "wx/crt.h" @@ -67,41 +68,6 @@ wxUString &wxUString::assignFromAscii( const char *str, size_type n ) // UTF-8 // ---------------------------------------------------------------------------- -// this table gives the length of the UTF-8 encoding from its first character: -const unsigned char tableUtf8Lengths[256] = { - // single-byte sequences (ASCII): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, // 60..6F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F - - // these are invalid: - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF - 0, 0, // C0,C1 - - // two-byte sequences: - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF - - // three-byte sequences: - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF - - // four-byte sequences: - 4, 4, 4, 4, 4, // F0..F4 - - // these are invalid again (5- or 6-byte - // sequences and sequences for code points - // above U+10FFFF, as restricted by RFC 3629): - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF -}; - wxUString &wxUString::assignFromUTF8( const char *str ) { if (!str) diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 6a5d5791d1..12e19c21ed 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -34,7 +34,9 @@ public: private: CPPUNIT_TEST_SUITE( ConvAutoTestCase ); + CPPUNIT_TEST( Init ); CPPUNIT_TEST( Empty ); + CPPUNIT_TEST( Encode ); CPPUNIT_TEST( Short ); CPPUNIT_TEST( None ); CPPUNIT_TEST( UTF32LE ); @@ -42,22 +44,53 @@ private: CPPUNIT_TEST( UTF16LE ); CPPUNIT_TEST( UTF16BE ); CPPUNIT_TEST( UTF8 ); + CPPUNIT_TEST( UTF8NoBom ); + CPPUNIT_TEST( Fallback ); + CPPUNIT_TEST( FallbackMultibyte ); + CPPUNIT_TEST( FallbackShort ); CPPUNIT_TEST( StreamUTF8NoBOM ); CPPUNIT_TEST( StreamUTF8 ); CPPUNIT_TEST( StreamUTF16LE ); CPPUNIT_TEST( StreamUTF16BE ); CPPUNIT_TEST( StreamUTF32LE ); CPPUNIT_TEST( StreamUTF32BE ); + CPPUNIT_TEST( StreamFallback ); + CPPUNIT_TEST( StreamFallbackMultibyte ); CPPUNIT_TEST_SUITE_END(); + // expected converter state, UTF-8 without BOM by default + struct ConvState + { + ConvState( wxBOM bom = wxBOM_None, + wxFontEncoding enc = wxFONTENCODING_UTF8, + 
bool fallback = false ) + : m_bom(bom), m_enc(enc), m_fallback(fallback) {} + + void Check(const wxConvAuto& conv) const + { + CPPUNIT_ASSERT( conv.GetBOM() == m_bom ); + CPPUNIT_ASSERT( conv.GetEncoding() == m_enc ); + CPPUNIT_ASSERT( conv.IsUsingFallbackEncoding() == m_fallback ); + CPPUNIT_ASSERT( conv.IsUTF8() == (m_enc == wxFONTENCODING_UTF8) ); + } + + wxBOM m_bom; + wxFontEncoding m_enc; + bool m_fallback; + }; + // real test function: check that converting the src multibyte string to // wide char using wxConvAuto yields wch as the first result // // the length of the string may need to be passed explicitly if it has // embedded NULs, otherwise it's not necessary - void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN); + void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN, + ConvState st = ConvState(), + wxFontEncoding fe = wxFONTENCODING_DEFAULT); + void Init(); void Empty(); + void Encode(); void Short(); void None(); void UTF32LE(); @@ -65,12 +98,17 @@ private: void UTF16LE(); void UTF16BE(); void UTF8(); + void UTF8NoBom(); + void Fallback(); + void FallbackMultibyte(); + void FallbackShort(); // test whether two lines of text are converted properly from a stream void TestTextStream(const char *src, size_t srclength, const wxString& line1, - const wxString& line2); + const wxString& line2, + wxFontEncoding fe = wxFONTENCODING_DEFAULT); void StreamUTF8NoBOM(); void StreamUTF8(); @@ -78,6 +116,8 @@ private: void StreamUTF16BE(); void StreamUTF32LE(); void StreamUTF32BE(); + void StreamFallback(); + void StreamFallbackMultibyte(); }; // register in the unnamed registry so that these tests are run by default @@ -90,16 +130,36 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(ConvAutoTestCase, "ConvAutoTestCase"); // tests // ---------------------------------------------------------------------------- -void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len) +void ConvAutoTestCase::TestFirstChar(const char 
*src, wchar_t wch, size_t len, + ConvState st, wxFontEncoding fe) { - wxWCharBuffer wbuf = wxConvAuto().cMB2WC(src, len, NULL); + wxConvAuto conv(fe); + wxWCharBuffer wbuf = conv.cMB2WC(src, len, NULL); CPPUNIT_ASSERT( wbuf ); CPPUNIT_ASSERT_EQUAL( wch, *wbuf ); + st.Check(conv); +} + +void ConvAutoTestCase::Init() +{ + ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(wxConvAuto()); } void ConvAutoTestCase::Empty() { - CPPUNIT_ASSERT( !wxConvAuto().cMB2WC("") ); + wxConvAuto conv; + CPPUNIT_ASSERT( !conv.cMB2WC("") ); + ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(conv); +} + +void ConvAutoTestCase::Encode() +{ + wxConvAuto conv; + wxString str = wxString::FromUTF8("\xd0\x9f\xe3\x81\x82"); + wxCharBuffer buf = conv.cWC2MB(str.wc_str()); + CPPUNIT_ASSERT( buf ); + CPPUNIT_ASSERT_EQUAL( str, wxString::FromUTF8(buf) ); + ConvState(wxBOM_Unknown, wxFONTENCODING_UTF8).Check(conv); } void ConvAutoTestCase::Short() @@ -114,38 +174,71 @@ void ConvAutoTestCase::None() void ConvAutoTestCase::UTF32LE() { - TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8); + TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8, ConvState(wxBOM_UTF32LE, wxFONTENCODING_UTF32LE)); } void ConvAutoTestCase::UTF32BE() { - TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8); + TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8, ConvState(wxBOM_UTF32BE, wxFONTENCODING_UTF32BE)); } void ConvAutoTestCase::UTF16LE() { - TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4); + TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4, ConvState(wxBOM_UTF16LE, wxFONTENCODING_UTF16LE)); } void ConvAutoTestCase::UTF16BE() { - TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4); + TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4, ConvState(wxBOM_UTF16BE, wxFONTENCODING_UTF16BE)); } void ConvAutoTestCase::UTF8() { #ifdef wxHAVE_U_ESCAPE - TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f'); + TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f', wxNO_LEN, ConvState(wxBOM_UTF8, wxFONTENCODING_UTF8)); #endif } +void ConvAutoTestCase::UTF8NoBom() +{ +#ifdef 
wxHAVE_U_ESCAPE + TestFirstChar("\xd0\x9f\xe3\x81\x82", L'\u041f', wxNO_LEN, ConvState(wxBOM_None, wxFONTENCODING_UTF8)); +#endif +} + +void ConvAutoTestCase::Fallback() +{ +#ifdef wxHAVE_U_ESCAPE + TestFirstChar("\xbf", L'\u041f', wxNO_LEN, + ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), + wxFONTENCODING_ISO8859_5); +#endif +} + +void ConvAutoTestCase::FallbackMultibyte() +{ +#ifdef wxHAVE_U_ESCAPE + TestFirstChar("\x84\x50", L'\u041f', wxNO_LEN, + ConvState(wxBOM_None, wxFONTENCODING_CP932, true), + wxFONTENCODING_CP932); +#endif +} + +void ConvAutoTestCase::FallbackShort() +{ + TestFirstChar("\x61\xc4", 'a', 2, + ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), + wxFONTENCODING_ISO8859_5); +} + void ConvAutoTestCase::TestTextStream(const char *src, size_t srclength, const wxString& line1, - const wxString& line2) + const wxString& line2, + wxFontEncoding fe) { wxMemoryInputStream instream(src, srclength); - wxTextInputStream text(instream); + wxTextInputStream text(instream, wxT(" \t"), wxConvAuto(fe)); CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() ); CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() ); @@ -166,16 +259,8 @@ const wxString line2 = wxString::FromUTF8("\xce\xb2"); void ConvAutoTestCase::StreamUTF8NoBOM() { - // currently this test doesn't work because without the BOM wxConvAuto - // decides that the string is in Latin-1 after finding the first (but not - // the two subsequent ones which are part of the same UTF-8 sequence!) 
- // 8-bit character - // - // FIXME: we need to fix this at wxTextInputStream level, see #11570 -#if 0 TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2", 7, line1, line2); -#endif } void ConvAutoTestCase::StreamUTF8() @@ -210,4 +295,17 @@ void ConvAutoTestCase::StreamUTF32BE() 20, line1, line2); } +void ConvAutoTestCase::StreamFallback() +{ + TestTextStream("\x61\xbf\x0A\xe0", + 4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"), + wxFONTENCODING_ISO8859_5); +} + +void ConvAutoTestCase::StreamFallbackMultibyte() +{ + TestTextStream("\x61\x82\xa0\x0A\x83\xc0", + 6, line1, line2, wxFONTENCODING_CP932); +} + #endif // wxUSE_UNICODE diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index edb6eaa8a2..c6497b23f2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") REQUIRE( tis.GetChar() == 0x00 ); CHECK( tis.GetInputStream().Eof() ); } + + // Two null bytes that look like the start of UTF-32BE BOM, + // followed by 4 byte UTF-8 sequence. + // Needs wxConvAuto to not switch to fallback on <6 bytes. + SECTION("UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString::FromUTF8((char*)buf, sizeof(buf)) + .tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // Two null bytes that look like the start of UTF-32BE BOM, + // then 3 bytes that look like the start of UTF-8 sequence. + // Needs 6 character output buffer in GetChar(). 
+ SECTION("almost-UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1), + sizeof(buf)).tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } } #endif // wxUSE_UNICODE