diff --git a/docs/changes.txt b/docs/changes.txt index bc778c5463..6099ceade0 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -435,6 +435,7 @@ All: - wxDateTime timezone functions now dynamic (no caching). - Added wxHttp::GetCookie and wxHttp::HasCookies (dodge). - Added support for unique volume names to wxFileName (Neno Ganchev). +- Correct bugs when using wxTextInputStream with wxConvAuto (Leon Buikstra). Unix: diff --git a/include/wx/convauto.h b/include/wx/convauto.h index b3dde4c73a..3a2e2e4809 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -75,6 +75,7 @@ private: // all currently recognized BOM values enum BOMType { + BOM_Unknown = -1, BOM_None, BOM_UTF32BE, BOM_UTF32LE, @@ -107,7 +108,10 @@ private: // create the correct conversion object for the BOM present in the // beginning of the buffer; adjust the buffer to skip the BOM if found - void InitFromInput(const char **src, size_t *len); + // + // return false if the buffer is too short to allow us to determine if we + // have BOM or not + bool InitFromInput(const char **src, size_t *len); // adjust src and len to skip over the BOM (identified by m_bomType) at the // start of the buffer diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index f4e394d04d..c9ff7df9f6 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -26,6 +26,7 @@ #if wxUSE_WCHAR_T #ifndef WX_PRECOMP + #include "wx/wx.h" #endif //WX_PRECOMP #include "wx/convauto.h" @@ -52,55 +53,86 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) /* static */ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) { - if ( srcLen < 2 ) - { - // minimal BOM is 2 bytes so bail out immediately and simplify the code - // below which wouldn't need to check for length for UTF-16 cases - return BOM_None; - } - // examine the buffer for BOM presence // - // see http://www.unicode.org/faq/utf_bom.html#BOM - switch ( *src++ ) + // quoting from http://www.unicode.org/faq/utf_bom.html#BOM: + // + // Bytes Encoding Form + // + // 00 00 FE FF UTF-32, big-endian + // FF FE 00 00 UTF-32, little-endian + // FE FF UTF-16, big-endian + // FF FE UTF-16, little-endian + // EF BB BF UTF-8 + // + // as some BOMs are prefixes of other ones we may need to read more bytes + // to disambiguate them + + switch ( srcLen ) { - case '\0': - // could only be big endian UTF-32 (00 00 FE FF) - if ( srcLen >= 4 && - src[0] == '\0' && - src[1] == '\xfe' && - src[2] == '\xff' ) + case 0: + return BOM_Unknown; + + case 1: + if ( src[0] == '\x00' || src[0] == '\xFF' || + src[0] == '\xFE' || src[0] == '\xEF') { - return BOM_UTF32BE; + // this could be a BOM but we don't know yet + return BOM_Unknown; } break; - case '\xfe': - // could only be big endian UTF-16 (FE FF) - if ( *src++ == '\xff' ) + case 2: + case 3: + if ( src[0] == '\xEF' && src[1] == '\xBB' ) { + if ( srcLen == 3 ) + return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; + + return BOM_Unknown; + } + + if ( src[0] == '\xFE' && src[1] == '\xFF' ) return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) + { + // if the next byte is 0, it could be an UTF-32LE BOM but if it + // isn't we can be sure it's UTF-16LE + if ( srcLen == 3 && src[2] != '\x00' ) + return BOM_UTF16LE; + + return BOM_Unknown; } + + if ( src[0] == '\x00' && src[1] == '\x00' ) + { + // this could only be UTF-32BE + if ( srcLen == 3 && src[2] == '\xFE' ) + return BOM_Unknown; + } + break; - case '\xff': - // could be either little endian UTF-16 or UTF-32, both start - // with FF FE - if ( *src++ == '\xfe' ) - { - return srcLen >= 4 && src[0] == '\0' && src[1] == '\0' - ? BOM_UTF32LE - : BOM_UTF16LE; - } - break; - - case '\xef': - // is this UTF-8 BOM (EF BB BF)? - if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' ) - { + default: + // we have at least 4 characters so we may finally decide whether + // we have a BOM or not + if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) return BOM_UTF8; - } - break; + + if ( src[0] == '\x00' && src[1] == '\x00' && + src[2] == '\xFE' && src[3] == '\xFF' ) + return BOM_UTF32BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' && + src[2] == '\x00' && src[3] == '\x00' ) + return BOM_UTF32LE; + + if ( src[0] == '\xFE' && src[1] == '\xFF' ) + return BOM_UTF16BE; + + if ( src[0] == '\xFF' && src[1] == '\xFE' ) + return BOM_UTF16LE; } return BOM_None; @@ -112,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType) switch ( bomType ) { + case BOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + break; + + case BOM_None: + // use the default + break; + case BOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; @@ -137,12 +177,16 @@ void wxConvAuto::InitFromBOM(BOMType bomType) break; default: - wxFAIL_MSG( wxT("unexpected BOM type") ); - // fall through: still need to create something + wxFAIL_MSG( "unknown BOM type" ); + } - case BOM_None: - InitWithUTF8(); - m_consumedBOM = true; // as there is nothing to consume + if ( !m_conv ) + { + // we end up here if there is no BOM or we didn't recognize it somehow + // (this shouldn't happen but still don't crash if it does), so use the + // default encoding + InitWithUTF8(); + m_consumedBOM = true; // as there is nothing to consume } } @@ -151,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { + case BOM_Unknown: + wxFAIL_MSG( "shouldn't be called for this BOM type" ); + return; + + case BOM_None: + ofs = 0; + break; + case BOM_UTF32BE: case BOM_UTF32LE: ofs = 4; @@ -166,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const break; default: - wxFAIL_MSG( wxT("unexpected BOM type") ); - // fall through: still need to create something - - case BOM_None: - ofs = 0; + wxFAIL_MSG( "unknown BOM type" ); + return; } *src += ofs; @@ -178,11 +227,16 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const *len -= ofs; } -void wxConvAuto::InitFromInput(const char **src, size_t *len) +bool wxConvAuto::InitFromInput(const char **src, size_t *len) { m_bomType = DetectBOM(*src, *len); + if ( m_bomType == BOM_Unknown ) + return false; + InitFromBOM(m_bomType); SkipBOM(src, len); + + return true; } size_t @@ -195,16 +249,20 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // dst as typically we're first called with NULL dst to calculate the // needed buffer size wxConvAuto *self = const_cast(this); + + if ( !m_conv ) { - self->InitFromInput(&src, &srcLen); - if ( dst ) - self->m_consumedBOM = true; + if ( !self->InitFromInput(&src, &srcLen) ) + { + // there is not enough data to determine whether we have a BOM or + // not, so fail for now -- the caller is supposed to call us again + // with more data + return wxCONV_FAILED; + } } - - if ( !m_consumedBOM && dst ) + else if ( !m_consumedBOM && dst ) { - self->m_consumedBOM = true; SkipBOM(&src, &srcLen); } @@ -228,6 +286,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, } } + if (rc != wxCONV_FAILED && dst && !m_consumedBOM) + self->m_consumedBOM = true; return rc; } @@ -245,4 +305,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, } #endif // wxUSE_WCHAR_T - diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index 830bc985ba..9f9669faa1 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -76,7 +76,7 @@ wxChar wxTextInputStream::NextChar() return wxEOT; if ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1) - != wxCONV_FAILED ) + == 1 ) return wbuf[0]; } // there should be no encoding which requires more than nine bytes for one character... diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index fbd7042012..0c6292f625 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -19,11 +19,11 @@ #if wxUSE_WCHAR_T -#ifndef WX_PRECOMP -#endif // WX_PRECOMP - #include "wx/convauto.h" +#include "wx/mstream.h" +#include "wx/txtstrm.h" + // ---------------------------------------------------------------------------- // test class // ---------------------------------------------------------------------------- @@ -43,6 +43,12 @@ private: CPPUNIT_TEST( UTF16LE ); CPPUNIT_TEST( UTF16BE ); CPPUNIT_TEST( UTF8 ); + CPPUNIT_TEST( StreamUTF8NoBOM ); + CPPUNIT_TEST( StreamUTF8 ); + CPPUNIT_TEST( StreamUTF16LE ); + CPPUNIT_TEST( StreamUTF16BE ); + CPPUNIT_TEST( StreamUTF32LE ); + CPPUNIT_TEST( StreamUTF32BE ); CPPUNIT_TEST_SUITE_END(); // real test function: check that converting the src multibyte string to @@ -57,6 +63,19 @@ private: void UTF16LE(); void UTF16BE(); void UTF8(); + + // test whether two lines of text are converted properly from a stream + void TestTextStream(const char *src, + size_t srclength, + const wxString& line1, + const wxString& line2); + + void StreamUTF8NoBOM(); + void StreamUTF8(); + void StreamUTF16LE(); + void StreamUTF16BE(); + void StreamUTF32LE(); + void StreamUTF32BE(); }; // register in the unnamed registry so that these tests are run by default @@ -118,5 +137,76 @@ void ConvAutoTestCase::UTF8() #endif } +void ConvAutoTestCase::TestTextStream(const char *src, + size_t srclength, + const wxString& line1, + const wxString& line2) +{ + wxMemoryInputStream instream(src, srclength); + wxTextInputStream text(instream); + + CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() ); + CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() ); +} + +// the first line of the teststring used in the following functions is an +// 'a' followed by a Japanese hiragana A (u+3042). +// The second line is a single Greek beta (u+03B2). There is no blank line +// at the end. + +namespace +{ + +const wxString line1 = wxString::FromUTF8("a\xe3\x81\x82"); +const wxString line2 = wxString::FromUTF8("\xce\xb2"); + +} // anonymous namespace + +void ConvAutoTestCase::StreamUTF8NoBOM() +{ + // currently this test doesn't work because without the BOM wxConvAuto + // decides that the string is in Latin-1 after finding the first (but not + // the two subsequent ones which are part of the same UTF-8 sequence!) + // 8-bit character + // + // FIXME: we need to fix this at wxTextInputStream level, see #11570 +#if 0 + TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2", + 7, line1, line2); +#endif +} + +void ConvAutoTestCase::StreamUTF8() +{ + TestTextStream("\xEF\xBB\xBF\x61\xE3\x81\x82\x0A\xCE\xB2", + 10, line1, line2); +} + +void ConvAutoTestCase::StreamUTF16LE() +{ + TestTextStream("\xFF\xFE\x61\x00\x42\x30\x0A\x00\xB2\x03", + 10, line1, line2); +} + +void ConvAutoTestCase::StreamUTF16BE() +{ + TestTextStream("\xFE\xFF\x00\x61\x30\x42\x00\x0A\x03\xB2", + 10, line1, line2); +} + +void ConvAutoTestCase::StreamUTF32LE() +{ + TestTextStream("\xFF\xFE\0\0\x61\x00\0\0\x42\x30\0\0\x0A" + "\x00\0\0\xB2\x03\0\0", + 20, line1, line2); +} + +void ConvAutoTestCase::StreamUTF32BE() +{ + TestTextStream("\0\0\xFE\xFF\0\0\x00\x61\0\0\x30\x42\0\0\x00\x0A" + "\0\0\x03\xB2", + 20, line1, line2); +} + #endif // wxUSE_WCHAR_T