Fix wxConvAuto behaviour when it is used by wxTextInputStream.
wxConvAuto implicitly assumed that the chunk of data passed to it for translation was big enough to allow it to at least detect the BOM from it. However, this isn't necessarily the case, and it never is with wxTextInputStream, which reads the bytes one by one. Fix this by waiting until we have enough data to be able to detect the BOM. This still doesn't fix the problem with streams without a BOM, and the corresponding unit test still fails -- it will need to be fixed at the level of wxTextInputStream itself later, but correctly handling the cases when a BOM is present is already better than before. See #11570. git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@63064 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -435,6 +435,7 @@ All:
|
||||
- wxDateTime timezone functions now dynamic (no caching).
|
||||
- Added wxHttp::GetCookie and wxHttp::HasCookies (dodge).
|
||||
- Added support for unique volume names to wxFileName (Neno Ganchev).
|
||||
- Correct bugs when using wxTextInputStream with wxConvAuto (Leon Buikstra).
|
||||
|
||||
Unix:
|
||||
|
||||
|
@@ -75,6 +75,7 @@ private:
|
||||
// all currently recognized BOM values
|
||||
enum BOMType
|
||||
{
|
||||
BOM_Unknown = -1,
|
||||
BOM_None,
|
||||
BOM_UTF32BE,
|
||||
BOM_UTF32LE,
|
||||
@@ -107,7 +108,10 @@ private:
|
||||
|
||||
// create the correct conversion object for the BOM present in the
|
||||
// beginning of the buffer; adjust the buffer to skip the BOM if found
|
||||
void InitFromInput(const char **src, size_t *len);
|
||||
//
|
||||
// return false if the buffer is too short to allow us to determine if we
|
||||
// have BOM or not
|
||||
bool InitFromInput(const char **src, size_t *len);
|
||||
|
||||
// adjust src and len to skip over the BOM (identified by m_bomType) at the
|
||||
// start of the buffer
|
||||
|
@@ -26,6 +26,7 @@
|
||||
#if wxUSE_WCHAR_T
|
||||
|
||||
#ifndef WX_PRECOMP
|
||||
#include "wx/wx.h"
|
||||
#endif //WX_PRECOMP
|
||||
|
||||
#include "wx/convauto.h"
|
||||
@@ -52,55 +53,86 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
|
||||
/* static */
|
||||
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
||||
{
|
||||
if ( srcLen < 2 )
|
||||
{
|
||||
// minimal BOM is 2 bytes so bail out immediately and simplify the code
|
||||
// below which wouldn't need to check for length for UTF-16 cases
|
||||
return BOM_None;
|
||||
}
|
||||
|
||||
// examine the buffer for BOM presence
|
||||
//
|
||||
// see http://www.unicode.org/faq/utf_bom.html#BOM
|
||||
switch ( *src++ )
|
||||
// quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
|
||||
//
|
||||
// Bytes Encoding Form
|
||||
//
|
||||
// 00 00 FE FF UTF-32, big-endian
|
||||
// FF FE 00 00 UTF-32, little-endian
|
||||
// FE FF UTF-16, big-endian
|
||||
// FF FE UTF-16, little-endian
|
||||
// EF BB BF UTF-8
|
||||
//
|
||||
// as some BOMs are prefixes of other ones we may need to read more bytes
|
||||
// to disambiguate them
|
||||
|
||||
switch ( srcLen )
|
||||
{
|
||||
case '\0':
|
||||
// could only be big endian UTF-32 (00 00 FE FF)
|
||||
if ( srcLen >= 4 &&
|
||||
src[0] == '\0' &&
|
||||
src[1] == '\xfe' &&
|
||||
src[2] == '\xff' )
|
||||
case 0:
|
||||
return BOM_Unknown;
|
||||
|
||||
case 1:
|
||||
if ( src[0] == '\x00' || src[0] == '\xFF' ||
|
||||
src[0] == '\xFE' || src[0] == '\xEF')
|
||||
{
|
||||
return BOM_UTF32BE;
|
||||
// this could be a BOM but we don't know yet
|
||||
return BOM_Unknown;
|
||||
}
|
||||
break;
|
||||
|
||||
case '\xfe':
|
||||
// could only be big endian UTF-16 (FE FF)
|
||||
if ( *src++ == '\xff' )
|
||||
case 2:
|
||||
case 3:
|
||||
if ( src[0] == '\xEF' && src[1] == '\xBB' )
|
||||
{
|
||||
if ( srcLen == 3 )
|
||||
return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
|
||||
|
||||
return BOM_Unknown;
|
||||
}
|
||||
|
||||
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
||||
return BOM_UTF16BE;
|
||||
|
||||
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
||||
{
|
||||
// if the next byte is 0, it could be an UTF-32LE BOM but if it
|
||||
// isn't we can be sure it's UTF-16LE
|
||||
if ( srcLen == 3 && src[2] != '\x00' )
|
||||
return BOM_UTF16LE;
|
||||
|
||||
return BOM_Unknown;
|
||||
}
|
||||
|
||||
if ( src[0] == '\x00' && src[1] == '\x00' )
|
||||
{
|
||||
// this could only be UTF-32BE
|
||||
if ( srcLen == 3 && src[2] == '\xFE' )
|
||||
return BOM_Unknown;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case '\xff':
|
||||
// could be either little endian UTF-16 or UTF-32, both start
|
||||
// with FF FE
|
||||
if ( *src++ == '\xfe' )
|
||||
{
|
||||
return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
|
||||
? BOM_UTF32LE
|
||||
: BOM_UTF16LE;
|
||||
}
|
||||
break;
|
||||
|
||||
case '\xef':
|
||||
// is this UTF-8 BOM (EF BB BF)?
|
||||
if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
|
||||
{
|
||||
default:
|
||||
// we have at least 4 characters so we may finally decide whether
|
||||
// we have a BOM or not
|
||||
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
|
||||
return BOM_UTF8;
|
||||
}
|
||||
break;
|
||||
|
||||
if ( src[0] == '\x00' && src[1] == '\x00' &&
|
||||
src[2] == '\xFE' && src[3] == '\xFF' )
|
||||
return BOM_UTF32BE;
|
||||
|
||||
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
|
||||
src[2] == '\x00' && src[3] == '\x00' )
|
||||
return BOM_UTF32LE;
|
||||
|
||||
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
||||
return BOM_UTF16BE;
|
||||
|
||||
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
||||
return BOM_UTF16LE;
|
||||
}
|
||||
|
||||
return BOM_None;
|
||||
@@ -112,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
||||
|
||||
switch ( bomType )
|
||||
{
|
||||
case BOM_Unknown:
|
||||
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
||||
break;
|
||||
|
||||
case BOM_None:
|
||||
// use the default
|
||||
break;
|
||||
|
||||
case BOM_UTF32BE:
|
||||
m_conv = new wxMBConvUTF32BE;
|
||||
m_ownsConv = true;
|
||||
@@ -137,10 +177,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
||||
break;
|
||||
|
||||
default:
|
||||
wxFAIL_MSG( wxT("unexpected BOM type") );
|
||||
// fall through: still need to create something
|
||||
wxFAIL_MSG( "unknown BOM type" );
|
||||
}
|
||||
|
||||
case BOM_None:
|
||||
if ( !m_conv )
|
||||
{
|
||||
// we end up here if there is no BOM or we didn't recognize it somehow
|
||||
// (this shouldn't happen but still don't crash if it does), so use the
|
||||
// default encoding
|
||||
InitWithUTF8();
|
||||
m_consumedBOM = true; // as there is nothing to consume
|
||||
}
|
||||
@@ -151,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
|
||||
int ofs;
|
||||
switch ( m_bomType )
|
||||
{
|
||||
case BOM_Unknown:
|
||||
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
||||
return;
|
||||
|
||||
case BOM_None:
|
||||
ofs = 0;
|
||||
break;
|
||||
|
||||
case BOM_UTF32BE:
|
||||
case BOM_UTF32LE:
|
||||
ofs = 4;
|
||||
@@ -166,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
|
||||
break;
|
||||
|
||||
default:
|
||||
wxFAIL_MSG( wxT("unexpected BOM type") );
|
||||
// fall through: still need to create something
|
||||
|
||||
case BOM_None:
|
||||
ofs = 0;
|
||||
wxFAIL_MSG( "unknown BOM type" );
|
||||
return;
|
||||
}
|
||||
|
||||
*src += ofs;
|
||||
@@ -178,11 +227,16 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
|
||||
*len -= ofs;
|
||||
}
|
||||
|
||||
void wxConvAuto::InitFromInput(const char **src, size_t *len)
|
||||
bool wxConvAuto::InitFromInput(const char **src, size_t *len)
|
||||
{
|
||||
m_bomType = DetectBOM(*src, *len);
|
||||
if ( m_bomType == BOM_Unknown )
|
||||
return false;
|
||||
|
||||
InitFromBOM(m_bomType);
|
||||
SkipBOM(src, len);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t
|
||||
@@ -195,16 +249,20 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
// dst as typically we're first called with NULL dst to calculate the
|
||||
// needed buffer size
|
||||
wxConvAuto *self = const_cast<wxConvAuto *>(this);
|
||||
|
||||
|
||||
if ( !m_conv )
|
||||
{
|
||||
self->InitFromInput(&src, &srcLen);
|
||||
if ( dst )
|
||||
self->m_consumedBOM = true;
|
||||
}
|
||||
|
||||
if ( !m_consumedBOM && dst )
|
||||
if ( !self->InitFromInput(&src, &srcLen) )
|
||||
{
|
||||
// there is not enough data to determine whether we have a BOM or
|
||||
// not, so fail for now -- the caller is supposed to call us again
|
||||
// with more data
|
||||
return wxCONV_FAILED;
|
||||
}
|
||||
}
|
||||
else if ( !m_consumedBOM && dst )
|
||||
{
|
||||
self->m_consumedBOM = true;
|
||||
SkipBOM(&src, &srcLen);
|
||||
}
|
||||
|
||||
@@ -228,6 +286,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
}
|
||||
}
|
||||
|
||||
if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
|
||||
self->m_consumedBOM = true;
|
||||
return rc;
|
||||
}
|
||||
|
||||
@@ -245,4 +305,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
|
||||
}
|
||||
|
||||
#endif // wxUSE_WCHAR_T
|
||||
|
||||
|
@@ -76,7 +76,7 @@ wxChar wxTextInputStream::NextChar()
|
||||
return wxEOT;
|
||||
|
||||
if ( m_conv->ToWChar(wbuf, WXSIZEOF(wbuf), m_lastBytes, inlen + 1)
|
||||
!= wxCONV_FAILED )
|
||||
== 1 )
|
||||
return wbuf[0];
|
||||
}
|
||||
// there should be no encoding which requires more than nine bytes for one character...
|
||||
|
@@ -19,11 +19,11 @@
|
||||
|
||||
#if wxUSE_WCHAR_T
|
||||
|
||||
#ifndef WX_PRECOMP
|
||||
#endif // WX_PRECOMP
|
||||
|
||||
#include "wx/convauto.h"
|
||||
|
||||
#include "wx/mstream.h"
|
||||
#include "wx/txtstrm.h"
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// test class
|
||||
// ----------------------------------------------------------------------------
|
||||
@@ -43,6 +43,12 @@ private:
|
||||
CPPUNIT_TEST( UTF16LE );
|
||||
CPPUNIT_TEST( UTF16BE );
|
||||
CPPUNIT_TEST( UTF8 );
|
||||
CPPUNIT_TEST( StreamUTF8NoBOM );
|
||||
CPPUNIT_TEST( StreamUTF8 );
|
||||
CPPUNIT_TEST( StreamUTF16LE );
|
||||
CPPUNIT_TEST( StreamUTF16BE );
|
||||
CPPUNIT_TEST( StreamUTF32LE );
|
||||
CPPUNIT_TEST( StreamUTF32BE );
|
||||
CPPUNIT_TEST_SUITE_END();
|
||||
|
||||
// real test function: check that converting the src multibyte string to
|
||||
@@ -57,6 +63,19 @@ private:
|
||||
void UTF16LE();
|
||||
void UTF16BE();
|
||||
void UTF8();
|
||||
|
||||
// test whether two lines of text are converted properly from a stream
|
||||
void TestTextStream(const char *src,
|
||||
size_t srclength,
|
||||
const wxString& line1,
|
||||
const wxString& line2);
|
||||
|
||||
void StreamUTF8NoBOM();
|
||||
void StreamUTF8();
|
||||
void StreamUTF16LE();
|
||||
void StreamUTF16BE();
|
||||
void StreamUTF32LE();
|
||||
void StreamUTF32BE();
|
||||
};
|
||||
|
||||
// register in the unnamed registry so that these tests are run by default
|
||||
@@ -118,5 +137,76 @@ void ConvAutoTestCase::UTF8()
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::TestTextStream(const char *src,
|
||||
size_t srclength,
|
||||
const wxString& line1,
|
||||
const wxString& line2)
|
||||
{
|
||||
wxMemoryInputStream instream(src, srclength);
|
||||
wxTextInputStream text(instream);
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() );
|
||||
CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() );
|
||||
}
|
||||
|
||||
// the first line of the test string used in the following functions is an
|
||||
// 'a' followed by a Japanese hiragana A (u+3042).
|
||||
// The second line is a single Greek beta (u+03B2). There is no blank line
|
||||
// at the end.
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
const wxString line1 = wxString::FromUTF8("a\xe3\x81\x82");
|
||||
const wxString line2 = wxString::FromUTF8("\xce\xb2");
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
void ConvAutoTestCase::StreamUTF8NoBOM()
|
||||
{
|
||||
// currently this test doesn't work because without the BOM wxConvAuto
|
||||
// decides that the string is in Latin-1 after finding the first (but not
|
||||
// the two subsequent ones which are part of the same UTF-8 sequence!)
|
||||
// 8-bit character
|
||||
//
|
||||
// FIXME: we need to fix this at wxTextInputStream level, see #11570
|
||||
#if 0
|
||||
TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2",
|
||||
7, line1, line2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF8()
|
||||
{
|
||||
TestTextStream("\xEF\xBB\xBF\x61\xE3\x81\x82\x0A\xCE\xB2",
|
||||
10, line1, line2);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF16LE()
|
||||
{
|
||||
TestTextStream("\xFF\xFE\x61\x00\x42\x30\x0A\x00\xB2\x03",
|
||||
10, line1, line2);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF16BE()
|
||||
{
|
||||
TestTextStream("\xFE\xFF\x00\x61\x30\x42\x00\x0A\x03\xB2",
|
||||
10, line1, line2);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF32LE()
|
||||
{
|
||||
TestTextStream("\xFF\xFE\0\0\x61\x00\0\0\x42\x30\0\0\x0A"
|
||||
"\x00\0\0\xB2\x03\0\0",
|
||||
20, line1, line2);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF32BE()
|
||||
{
|
||||
TestTextStream("\0\0\xFE\xFF\0\0\x00\x61\0\0\x30\x42\0\0\x00\x0A"
|
||||
"\0\0\x03\xB2",
|
||||
20, line1, line2);
|
||||
}
|
||||
|
||||
#endif // wxUSE_WCHAR_T
|
||||
|
||||
|
Reference in New Issue
Block a user