Fix wxConvAuto behaviour when it is used by wxTextInputStream.

wxConvAuto implicitly assumed that the chunk of data passed to it for
translation was big enough to allow it to at least detect the BOM from it.
However, this isn't necessarily the case, and it never is with
wxTextInputStream, which reads the bytes one by one.

Fix this by waiting until we have enough data to be able to detect the BOM.
This still doesn't fix the problem with streams without a BOM, and the
corresponding unit test still fails -- it will need to be fixed at the level
of wxTextInputStream itself later, but correctly handling the cases where a
BOM is present is already better than before.

See #11570.

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@63064 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin
2010-01-04 12:22:49 +00:00
parent 55e5154d2c
commit 4cb0e8d05c
5 changed files with 213 additions and 59 deletions

View File

@@ -26,6 +26,7 @@
#if wxUSE_WCHAR_T
#ifndef WX_PRECOMP
#include "wx/wx.h"
#endif //WX_PRECOMP
#include "wx/convauto.h"
@@ -52,55 +53,86 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
/* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{
if ( srcLen < 2 )
{
// minimal BOM is 2 bytes so bail out immediately and simplify the code
// below which wouldn't need to check for length for UTF-16 cases
return BOM_None;
}
// examine the buffer for BOM presence
//
// see http://www.unicode.org/faq/utf_bom.html#BOM
switch ( *src++ )
// quoting from http://www.unicode.org/faq/utf_bom.html#BOM:
//
// Bytes Encoding Form
//
// 00 00 FE FF UTF-32, big-endian
// FF FE 00 00 UTF-32, little-endian
// FE FF UTF-16, big-endian
// FF FE UTF-16, little-endian
// EF BB BF UTF-8
//
// as some BOMs are prefixes of other ones we may need to read more bytes
// to disambiguate them
switch ( srcLen )
{
case '\0':
// could only be big endian UTF-32 (00 00 FE FF)
if ( srcLen >= 4 &&
src[0] == '\0' &&
src[1] == '\xfe' &&
src[2] == '\xff' )
case 0:
return BOM_Unknown;
case 1:
if ( src[0] == '\x00' || src[0] == '\xFF' ||
src[0] == '\xFE' || src[0] == '\xEF')
{
return BOM_UTF32BE;
// this could be a BOM but we don't know yet
return BOM_Unknown;
}
break;
case '\xfe':
// could only be big endian UTF-16 (FE FF)
if ( *src++ == '\xff' )
case 2:
case 3:
if ( src[0] == '\xEF' && src[1] == '\xBB' )
{
if ( srcLen == 3 )
return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
return BOM_Unknown;
}
if ( src[0] == '\xFE' && src[1] == '\xFF' )
return BOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
{
// if the next byte is 0, it could be an UTF-32LE BOM but if it
// isn't we can be sure it's UTF-16LE
if ( srcLen == 3 && src[2] != '\x00' )
return BOM_UTF16LE;
return BOM_Unknown;
}
if ( src[0] == '\x00' && src[1] == '\x00' )
{
// this could only be UTF-32BE
if ( srcLen == 3 && src[2] == '\xFE' )
return BOM_Unknown;
}
break;
case '\xff':
// could be either little endian UTF-16 or UTF-32, both start
// with FF FE
if ( *src++ == '\xfe' )
{
return srcLen >= 4 && src[0] == '\0' && src[1] == '\0'
? BOM_UTF32LE
: BOM_UTF16LE;
}
break;
case '\xef':
// is this UTF-8 BOM (EF BB BF)?
if ( srcLen >= 3 && src[0] == '\xbb' && src[1] == '\xbf' )
{
default:
// we have at least 4 characters so we may finally decide whether
// we have a BOM or not
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
return BOM_UTF8;
}
break;
if ( src[0] == '\x00' && src[1] == '\x00' &&
src[2] == '\xFE' && src[3] == '\xFF' )
return BOM_UTF32BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
src[2] == '\x00' && src[3] == '\x00' )
return BOM_UTF32LE;
if ( src[0] == '\xFE' && src[1] == '\xFF' )
return BOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' )
return BOM_UTF16LE;
}
return BOM_None;
@@ -112,6 +144,14 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
switch ( bomType )
{
case BOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
break;
case BOM_None:
// use the default
break;
case BOM_UTF32BE:
m_conv = new wxMBConvUTF32BE;
m_ownsConv = true;
@@ -137,12 +177,16 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
break;
default:
wxFAIL_MSG( wxT("unexpected BOM type") );
// fall through: still need to create something
wxFAIL_MSG( "unknown BOM type" );
}
case BOM_None:
InitWithUTF8();
m_consumedBOM = true; // as there is nothing to consume
if ( !m_conv )
{
// we end up here if there is no BOM or we didn't recognize it somehow
// (this shouldn't happen but still don't crash if it does), so use the
// default encoding
InitWithUTF8();
m_consumedBOM = true; // as there is nothing to consume
}
}
@@ -151,6 +195,14 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
int ofs;
switch ( m_bomType )
{
case BOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" );
return;
case BOM_None:
ofs = 0;
break;
case BOM_UTF32BE:
case BOM_UTF32LE:
ofs = 4;
@@ -166,11 +218,8 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
break;
default:
wxFAIL_MSG( wxT("unexpected BOM type") );
// fall through: still need to create something
case BOM_None:
ofs = 0;
wxFAIL_MSG( "unknown BOM type" );
return;
}
*src += ofs;
@@ -178,11 +227,16 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
*len -= ofs;
}
void wxConvAuto::InitFromInput(const char **src, size_t *len)
// Try to detect the BOM at the start of the input and initialize the
// converter accordingly.
//
// The result of DetectBOM() is stored in m_bomType. If it is BOM_Unknown,
// nothing else is done and false is returned. Otherwise InitFromBOM() is
// called for the detected type, SkipBOM() is given the chance to advance *src
// and decrease *len, and true is returned.
bool wxConvAuto::InitFromInput(const char **src, size_t *len)
{
m_bomType = DetectBOM(*src, *len);
// BOM_Unknown is treated by the caller (ToWChar()) as "not enough data yet
// to decide whether there is a BOM", so don't initialize anything
if ( m_bomType == BOM_Unknown )
return false;
InitFromBOM(m_bomType);
SkipBOM(src, len);
return true;
}
size_t
@@ -195,16 +249,20 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
// dst as typically we're first called with NULL dst to calculate the
// needed buffer size
wxConvAuto *self = const_cast<wxConvAuto *>(this);
if ( !m_conv )
{
self->InitFromInput(&src, &srcLen);
if ( dst )
self->m_consumedBOM = true;
if ( !self->InitFromInput(&src, &srcLen) )
{
// there is not enough data to determine whether we have a BOM or
// not, so fail for now -- the caller is supposed to call us again
// with more data
return wxCONV_FAILED;
}
}
if ( !m_consumedBOM && dst )
else if ( !m_consumedBOM && dst )
{
self->m_consumedBOM = true;
SkipBOM(&src, &srcLen);
}
@@ -228,6 +286,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
}
}
if (rc != wxCONV_FAILED && dst && !m_consumedBOM)
self->m_consumedBOM = true;
return rc;
}
@@ -245,4 +305,3 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
}
#endif // wxUSE_WCHAR_T