Make BOM-detection code in wxConvAuto public.

Export GetBOM() and DetectBOM() functions.

Also rename BOMType enum elements to use "wx" prefix now that they're public.

Closes #13599.

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@69571 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin
2011-10-27 22:48:54 +00:00
parent 76ff3d06f5
commit 038809c2f6
3 changed files with 151 additions and 52 deletions

View File

@@ -18,6 +18,18 @@
// wxConvAuto: uses BOM to automatically detect input encoding // wxConvAuto: uses BOM to automatically detect input encoding
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// All currently recognized BOM values.
//
// wxConvAuto::DetectBOM() returns one of these values and
// wxConvAuto::GetBOM() reports the one stored in the conversion object.
enum wxBOM
{
// not enough input has been examined yet to decide whether a BOM is present
wxBOM_Unknown = -1,
// the input doesn't start with any of the recognized BOMs
wxBOM_None,
// UTF-32 big endian BOM: bytes 00 00 FE FF
wxBOM_UTF32BE,
// UTF-32 little endian BOM: bytes FF FE 00 00
wxBOM_UTF32LE,
// UTF-16 big endian BOM: bytes FE FF
wxBOM_UTF16BE,
// UTF-16 little endian BOM: bytes FF FE
wxBOM_UTF16LE,
// UTF-8 BOM: bytes EF BB BF
wxBOM_UTF8
};
class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
{ {
public: public:
@@ -69,29 +81,24 @@ public:
virtual wxMBConv *Clone() const { return new wxConvAuto(*this); } virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
private:
// all currently recognized BOM values
enum BOMType
{
BOM_Unknown = -1,
BOM_None,
BOM_UTF32BE,
BOM_UTF32LE,
BOM_UTF16BE,
BOM_UTF16LE,
BOM_UTF8
};
// return the BOM type of this buffer // return the BOM type of this buffer
static BOMType DetectBOM(const char *src, size_t srcLen); static wxBOM DetectBOM(const char *src, size_t srcLen);
// return the BOM type stored in this object: it is wxBOM_Unknown right
// after construction and is updated once the input has been examined
wxBOM GetBOM() const
{
return m_bomType;
}
private:
// common part of all ctors // common part of all ctors
void Init() void Init()
{ {
// no need to initialize m_bomType and m_consumedBOM here, this will be // We don't initialize m_encDefault here as different ctors do it
// done when m_conv is created // differently.
m_conv = NULL; m_conv = NULL;
m_bomType = wxBOM_Unknown;
m_ownsConv = false; m_ownsConv = false;
m_consumedBOM = false;
} }
// initialize m_conv with the UTF-8 conversion // initialize m_conv with the UTF-8 conversion
@@ -102,7 +109,7 @@ private:
} }
// create the correct conversion object for the given BOM type // create the correct conversion object for the given BOM type
void InitFromBOM(BOMType bomType); void InitFromBOM(wxBOM bomType);
// create the correct conversion object for the BOM present in the // create the correct conversion object for the BOM present in the
// beginning of the buffer // beginning of the buffer
@@ -128,7 +135,7 @@ private:
wxFontEncoding m_encDefault; wxFontEncoding m_encDefault;
// our BOM type // our BOM type
BOMType m_bomType; wxBOM m_bomType;
// true if we allocated m_conv ourselves, false if we just use an existing // true if we allocated m_conv ourselves, false if we just use an existing
// global conversion // global conversion

View File

@@ -6,6 +6,74 @@
// Licence: wxWindows licence // Licence: wxWindows licence
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
/**
Constants representing various BOM types.
BOM is an abbreviation for "Byte Order Mark", a special Unicode character
which may be inserted into the beginning of a text stream to indicate its
encoding.
@since 2.9.3
*/
enum wxBOM
{
/**
Unknown BOM.
This is returned if the presence of a BOM couldn't be determined and
normally happens because not enough bytes of input have been analysed.
*/
wxBOM_Unknown = -1,
/**
No BOM.
The stream doesn't contain a BOM character at all.
*/
wxBOM_None,
/**
UTF-32 big endian BOM.
The stream is encoded in the big endian variant of UTF-32.
*/
wxBOM_UTF32BE,
/**
UTF-32 little endian BOM.
The stream is encoded in the little endian variant of UTF-32.
*/
wxBOM_UTF32LE,
/**
UTF-16 big endian BOM.
The stream is encoded in the big endian variant of UTF-16.
*/
wxBOM_UTF16BE,
/**
UTF-16 little endian BOM.
The stream is encoded in the little endian variant of UTF-16.
*/
wxBOM_UTF16LE,
/**
UTF-8 BOM.
The stream is encoded in UTF-8.
Notice that, contrary to popular belief, it's perfectly possible and,
in fact, common under Microsoft Windows systems, to have a BOM in a
UTF-8 stream: while it's not used to indicate the endianness of the UTF-8
stream (as it's byte-oriented), the BOM can still be useful just as an
unambiguous indicator of UTF-8 being used.
*/
wxBOM_UTF8
};
/** /**
@class wxConvAuto @class wxConvAuto
@@ -66,6 +134,19 @@ public:
*/ */
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT); wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
/**
Return the detected BOM type.
The BOM type is detected after sufficiently many initial bytes have
passed through this conversion object so it will always return
wxBOM_Unknown immediately after the object creation but may return a
different value later.
@since 2.9.3
*/
wxBOM GetBOM() const;
/** /**
Disable the use of the fall back encoding: if the input doesn't have a Disable the use of the fall back encoding: if the input doesn't have a
BOM and is not valid UTF-8, the conversion will fail. BOM and is not valid UTF-8, the conversion will fail.
@@ -92,5 +173,16 @@ public:
@c wxFONTENCODING_DEFAULT can't be used here. @c wxFONTENCODING_DEFAULT can't be used here.
*/ */
static void SetFallbackEncoding(wxFontEncoding enc); static void SetFallbackEncoding(wxFontEncoding enc);
};
/**
Return the BOM type of this buffer.
This is a helper function which is normally only used internally by
wxConvAuto but provided for convenience of the code that wants to
detect the encoding of a stream by checking it for BOM presence on its
own.
@since 2.9.3
*/
static wxBOM DetectBOM(const char *src, size_t srcLen);
};

View File

@@ -45,7 +45,7 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
} }
/* static */ /* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{ {
// examine the buffer for BOM presence // examine the buffer for BOM presence
// //
@@ -65,14 +65,14 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
switch ( srcLen ) switch ( srcLen )
{ {
case 0: case 0:
return BOM_Unknown; return wxBOM_Unknown;
case 1: case 1:
if ( src[0] == '\x00' || src[0] == '\xFF' || if ( src[0] == '\x00' || src[0] == '\xFF' ||
src[0] == '\xFE' || src[0] == '\xEF') src[0] == '\xFE' || src[0] == '\xEF')
{ {
// this could be a BOM but we don't know yet // this could be a BOM but we don't know yet
return BOM_Unknown; return wxBOM_Unknown;
} }
break; break;
@@ -81,22 +81,22 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
if ( src[0] == '\xEF' && src[1] == '\xBB' ) if ( src[0] == '\xEF' && src[1] == '\xBB' )
{ {
if ( srcLen == 3 ) if ( srcLen == 3 )
return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
return BOM_Unknown; return wxBOM_Unknown;
} }
if ( src[0] == '\xFE' && src[1] == '\xFF' ) if ( src[0] == '\xFE' && src[1] == '\xFF' )
return BOM_UTF16BE; return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' ) if ( src[0] == '\xFF' && src[1] == '\xFE' )
{ {
// if the next byte is 0, it could be an UTF-32LE BOM but if it // if the next byte is 0, it could be an UTF-32LE BOM but if it
// isn't we can be sure it's UTF-16LE // isn't we can be sure it's UTF-16LE
if ( srcLen == 3 && src[2] != '\x00' ) if ( srcLen == 3 && src[2] != '\x00' )
return BOM_UTF16LE; return wxBOM_UTF16LE;
return BOM_Unknown; return wxBOM_Unknown;
} }
if ( src[0] == '\x00' && src[1] == '\x00' ) if ( src[0] == '\x00' && src[1] == '\x00' )
@@ -104,9 +104,9 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
// this could only be UTF-32BE, check that the data we have so // this could only be UTF-32BE, check that the data we have so
// far allows for it // far allows for it
if ( srcLen == 3 && src[2] != '\xFE' ) if ( srcLen == 3 && src[2] != '\xFE' )
return BOM_None; return wxBOM_None;
return BOM_Unknown; return wxBOM_Unknown;
} }
break; break;
@@ -114,61 +114,61 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
// we have at least 4 characters so we may finally decide whether // we have at least 4 characters so we may finally decide whether
// we have a BOM or not // we have a BOM or not
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
return BOM_UTF8; return wxBOM_UTF8;
if ( src[0] == '\x00' && src[1] == '\x00' && if ( src[0] == '\x00' && src[1] == '\x00' &&
src[2] == '\xFE' && src[3] == '\xFF' ) src[2] == '\xFE' && src[3] == '\xFF' )
return BOM_UTF32BE; return wxBOM_UTF32BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' && if ( src[0] == '\xFF' && src[1] == '\xFE' &&
src[2] == '\x00' && src[3] == '\x00' ) src[2] == '\x00' && src[3] == '\x00' )
return BOM_UTF32LE; return wxBOM_UTF32LE;
if ( src[0] == '\xFE' && src[1] == '\xFF' ) if ( src[0] == '\xFE' && src[1] == '\xFF' )
return BOM_UTF16BE; return wxBOM_UTF16BE;
if ( src[0] == '\xFF' && src[1] == '\xFE' ) if ( src[0] == '\xFF' && src[1] == '\xFE' )
return BOM_UTF16LE; return wxBOM_UTF16LE;
} }
return BOM_None; return wxBOM_None;
} }
void wxConvAuto::InitFromBOM(BOMType bomType) void wxConvAuto::InitFromBOM(wxBOM bomType)
{ {
m_consumedBOM = false; m_consumedBOM = false;
switch ( bomType ) switch ( bomType )
{ {
case BOM_Unknown: case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" ); wxFAIL_MSG( "shouldn't be called for this BOM type" );
break; break;
case BOM_None: case wxBOM_None:
// use the default // use the default
break; break;
case BOM_UTF32BE: case wxBOM_UTF32BE:
m_conv = new wxMBConvUTF32BE; m_conv = new wxMBConvUTF32BE;
m_ownsConv = true; m_ownsConv = true;
break; break;
case BOM_UTF32LE: case wxBOM_UTF32LE:
m_conv = new wxMBConvUTF32LE; m_conv = new wxMBConvUTF32LE;
m_ownsConv = true; m_ownsConv = true;
break; break;
case BOM_UTF16BE: case wxBOM_UTF16BE:
m_conv = new wxMBConvUTF16BE; m_conv = new wxMBConvUTF16BE;
m_ownsConv = true; m_ownsConv = true;
break; break;
case BOM_UTF16LE: case wxBOM_UTF16LE:
m_conv = new wxMBConvUTF16LE; m_conv = new wxMBConvUTF16LE;
m_ownsConv = true; m_ownsConv = true;
break; break;
case BOM_UTF8: case wxBOM_UTF8:
InitWithUTF8(); InitWithUTF8();
break; break;
@@ -191,25 +191,25 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
int ofs; int ofs;
switch ( m_bomType ) switch ( m_bomType )
{ {
case BOM_Unknown: case wxBOM_Unknown:
wxFAIL_MSG( "shouldn't be called for this BOM type" ); wxFAIL_MSG( "shouldn't be called for this BOM type" );
return; return;
case BOM_None: case wxBOM_None:
ofs = 0; ofs = 0;
break; break;
case BOM_UTF32BE: case wxBOM_UTF32BE:
case BOM_UTF32LE: case wxBOM_UTF32LE:
ofs = 4; ofs = 4;
break; break;
case BOM_UTF16BE: case wxBOM_UTF16BE:
case BOM_UTF16LE: case wxBOM_UTF16LE:
ofs = 2; ofs = 2;
break; break;
case BOM_UTF8: case wxBOM_UTF8:
ofs = 3; ofs = 3;
break; break;
@@ -226,7 +226,7 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
bool wxConvAuto::InitFromInput(const char *src, size_t len) bool wxConvAuto::InitFromInput(const char *src, size_t len)
{ {
m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len); m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
if ( m_bomType == BOM_Unknown ) if ( m_bomType == wxBOM_Unknown )
return false; return false;
InitFromBOM(m_bomType); InitFromBOM(m_bomType);
@@ -275,7 +275,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
// try to convert using the auto-detected encoding // try to convert using the auto-detected encoding
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
{ {
// if the conversion failed but we didn't really detect anything and // if the conversion failed but we didn't really detect anything and
// simply tried UTF-8 by default, retry it using the fall-back // simply tried UTF-8 by default, retry it using the fall-back