Make BOM-detection code in wxConvAuto public.
Export GetBOM() and DetectBOM() functions. Also rename BOMType enum elements to use "wx" prefix now that they're public. Closes #13599. git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@69571 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -18,6 +18,18 @@
|
|||||||
// wxConvAuto: uses BOM to automatically detect input encoding
|
// wxConvAuto: uses BOM to automatically detect input encoding
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// All currently recognized BOM values.
|
||||||
|
enum wxBOM
|
||||||
|
{
|
||||||
|
wxBOM_Unknown = -1,
|
||||||
|
wxBOM_None,
|
||||||
|
wxBOM_UTF32BE,
|
||||||
|
wxBOM_UTF32LE,
|
||||||
|
wxBOM_UTF16BE,
|
||||||
|
wxBOM_UTF16LE,
|
||||||
|
wxBOM_UTF8
|
||||||
|
};
|
||||||
|
|
||||||
class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
|
class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@@ -69,29 +81,24 @@ public:
|
|||||||
|
|
||||||
virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
|
virtual wxMBConv *Clone() const { return new wxConvAuto(*this); }
|
||||||
|
|
||||||
private:
|
|
||||||
// all currently recognized BOM values
|
|
||||||
enum BOMType
|
|
||||||
{
|
|
||||||
BOM_Unknown = -1,
|
|
||||||
BOM_None,
|
|
||||||
BOM_UTF32BE,
|
|
||||||
BOM_UTF32LE,
|
|
||||||
BOM_UTF16BE,
|
|
||||||
BOM_UTF16LE,
|
|
||||||
BOM_UTF8
|
|
||||||
};
|
|
||||||
|
|
||||||
// return the BOM type of this buffer
|
// return the BOM type of this buffer
|
||||||
static BOMType DetectBOM(const char *src, size_t srcLen);
|
static wxBOM DetectBOM(const char *src, size_t srcLen);
|
||||||
|
|
||||||
|
wxBOM GetBOM() const
|
||||||
|
{
|
||||||
|
return m_bomType;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
// common part of all ctors
|
// common part of all ctors
|
||||||
void Init()
|
void Init()
|
||||||
{
|
{
|
||||||
// no need to initialize m_bomType and m_consumedBOM here, this will be
|
// We don't initialize m_encDefault here as different ctors do it
|
||||||
// done when m_conv is created
|
// differently.
|
||||||
m_conv = NULL;
|
m_conv = NULL;
|
||||||
|
m_bomType = wxBOM_Unknown;
|
||||||
m_ownsConv = false;
|
m_ownsConv = false;
|
||||||
|
m_consumedBOM = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize m_conv with the UTF-8 conversion
|
// initialize m_conv with the UTF-8 conversion
|
||||||
@@ -102,7 +109,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// create the correct conversion object for the given BOM type
|
// create the correct conversion object for the given BOM type
|
||||||
void InitFromBOM(BOMType bomType);
|
void InitFromBOM(wxBOM bomType);
|
||||||
|
|
||||||
// create the correct conversion object for the BOM present in the
|
// create the correct conversion object for the BOM present in the
|
||||||
// beginning of the buffer
|
// beginning of the buffer
|
||||||
@@ -128,7 +135,7 @@ private:
|
|||||||
wxFontEncoding m_encDefault;
|
wxFontEncoding m_encDefault;
|
||||||
|
|
||||||
// our BOM type
|
// our BOM type
|
||||||
BOMType m_bomType;
|
wxBOM m_bomType;
|
||||||
|
|
||||||
// true if we allocated m_conv ourselves, false if we just use an existing
|
// true if we allocated m_conv ourselves, false if we just use an existing
|
||||||
// global conversion
|
// global conversion
|
||||||
|
@@ -6,6 +6,74 @@
|
|||||||
// Licence: wxWindows licence
|
// Licence: wxWindows licence
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
Constants representing various BOM types.
|
||||||
|
|
||||||
|
BOM is an abbreviation for "Byte Order Mark", a special Unicode character
|
||||||
|
which may be inserted into the beginning of a text stream to indicate its
|
||||||
|
encoding.
|
||||||
|
|
||||||
|
@since 2.9.3
|
||||||
|
*/
|
||||||
|
enum wxBOM
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
Unknown BOM.
|
||||||
|
|
||||||
|
This is returned if BOM presence couldn't be determined and normally
|
||||||
|
happens because not enough bytes of input have been analysed.
|
||||||
|
*/
|
||||||
|
wxBOM_Unknown = -1,
|
||||||
|
|
||||||
|
/**
|
||||||
|
No BOM.
|
||||||
|
|
||||||
|
The stream doesn't contain a BOM character at all.
|
||||||
|
*/
|
||||||
|
wxBOM_None,
|
||||||
|
|
||||||
|
/**
|
||||||
|
UTF-32 big endian BOM.
|
||||||
|
|
||||||
|
The stream is encoded in the big-endian variant of UTF-32.
|
||||||
|
*/
|
||||||
|
wxBOM_UTF32BE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
UTF-32 little endian BOM.
|
||||||
|
|
||||||
|
The stream is encoded in the little-endian variant of UTF-32.
|
||||||
|
*/
|
||||||
|
wxBOM_UTF32LE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
UTF-16 big endian BOM.
|
||||||
|
|
||||||
|
The stream is encoded in the big-endian variant of UTF-16.
|
||||||
|
*/
|
||||||
|
wxBOM_UTF16BE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
UTF-16 little endian BOM.
|
||||||
|
|
||||||
|
The stream is encoded in the little-endian variant of UTF-16.
|
||||||
|
*/
|
||||||
|
wxBOM_UTF16LE,
|
||||||
|
|
||||||
|
/**
|
||||||
|
UTF-8 BOM.
|
||||||
|
|
||||||
|
The stream is encoded in UTF-8.
|
||||||
|
|
||||||
|
Notice that contrary to a popular belief, it's perfectly possible and,
|
||||||
|
in fact, common under Microsoft Windows systems, to have a BOM in a
|
||||||
|
UTF-8 stream: while it's not used to indicate the endianness of UTF-8
|
||||||
|
stream (as it's byte-oriented), the BOM can still be useful just as an
|
||||||
|
unambiguous indicator of UTF-8 being used.
|
||||||
|
*/
|
||||||
|
wxBOM_UTF8
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@class wxConvAuto
|
@class wxConvAuto
|
||||||
|
|
||||||
@@ -66,6 +134,19 @@ public:
|
|||||||
*/
|
*/
|
||||||
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
|
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
Return the detected BOM type.
|
||||||
|
|
||||||
|
The BOM type is detected after sufficiently many initial bytes have
|
||||||
|
passed through this conversion object so it will always return
|
||||||
|
wxBOM_Unknown immediately after the object creation but may return a
|
||||||
|
different value later.
|
||||||
|
|
||||||
|
@since 2.9.3
|
||||||
|
*/
|
||||||
|
wxBOM GetBOM() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
Disable the use of the fall back encoding: if the input doesn't have a
|
Disable the use of the fall back encoding: if the input doesn't have a
|
||||||
BOM and is not valid UTF-8, the conversion will fail.
|
BOM and is not valid UTF-8, the conversion will fail.
|
||||||
@@ -92,5 +173,16 @@ public:
|
|||||||
@c wxFONTENCODING_DEFAULT can't be used here.
|
@c wxFONTENCODING_DEFAULT can't be used here.
|
||||||
*/
|
*/
|
||||||
static void SetFallbackEncoding(wxFontEncoding enc);
|
static void SetFallbackEncoding(wxFontEncoding enc);
|
||||||
};
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
Return the BOM type of this buffer.
|
||||||
|
|
||||||
|
This is a helper function which is normally only used internally by
|
||||||
|
wxConvAuto but provided for convenience of the code that wants to
|
||||||
|
detect the encoding of a stream by checking it for BOM presence on its
|
||||||
|
own.
|
||||||
|
|
||||||
|
@since 2.9.3
|
||||||
|
*/
|
||||||
|
static wxBOM DetectBOM(const char *src, size_t srcLen);
|
||||||
|
};
|
||||||
|
@@ -45,7 +45,7 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
||||||
{
|
{
|
||||||
// examine the buffer for BOM presence
|
// examine the buffer for BOM presence
|
||||||
//
|
//
|
||||||
@@ -65,14 +65,14 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
|||||||
switch ( srcLen )
|
switch ( srcLen )
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
return BOM_Unknown;
|
return wxBOM_Unknown;
|
||||||
|
|
||||||
case 1:
|
case 1:
|
||||||
if ( src[0] == '\x00' || src[0] == '\xFF' ||
|
if ( src[0] == '\x00' || src[0] == '\xFF' ||
|
||||||
src[0] == '\xFE' || src[0] == '\xEF')
|
src[0] == '\xFE' || src[0] == '\xEF')
|
||||||
{
|
{
|
||||||
// this could be a BOM but we don't know yet
|
// this could be a BOM but we don't know yet
|
||||||
return BOM_Unknown;
|
return wxBOM_Unknown;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -81,22 +81,22 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
|||||||
if ( src[0] == '\xEF' && src[1] == '\xBB' )
|
if ( src[0] == '\xEF' && src[1] == '\xBB' )
|
||||||
{
|
{
|
||||||
if ( srcLen == 3 )
|
if ( srcLen == 3 )
|
||||||
return src[2] == '\xBF' ? BOM_UTF8 : BOM_None;
|
return src[2] == '\xBF' ? wxBOM_UTF8 : wxBOM_None;
|
||||||
|
|
||||||
return BOM_Unknown;
|
return wxBOM_Unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
||||||
return BOM_UTF16BE;
|
return wxBOM_UTF16BE;
|
||||||
|
|
||||||
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
||||||
{
|
{
|
||||||
// if the next byte is 0, it could be a UTF-32LE BOM but if it
|
// if the next byte is 0, it could be a UTF-32LE BOM but if it
|
||||||
// isn't we can be sure it's UTF-16LE
|
// isn't we can be sure it's UTF-16LE
|
||||||
if ( srcLen == 3 && src[2] != '\x00' )
|
if ( srcLen == 3 && src[2] != '\x00' )
|
||||||
return BOM_UTF16LE;
|
return wxBOM_UTF16LE;
|
||||||
|
|
||||||
return BOM_Unknown;
|
return wxBOM_Unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( src[0] == '\x00' && src[1] == '\x00' )
|
if ( src[0] == '\x00' && src[1] == '\x00' )
|
||||||
@@ -104,9 +104,9 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
|||||||
// this could only be UTF-32BE, check that the data we have so
|
// this could only be UTF-32BE, check that the data we have so
|
||||||
// far allows for it
|
// far allows for it
|
||||||
if ( srcLen == 3 && src[2] != '\xFE' )
|
if ( srcLen == 3 && src[2] != '\xFE' )
|
||||||
return BOM_None;
|
return wxBOM_None;
|
||||||
|
|
||||||
return BOM_Unknown;
|
return wxBOM_Unknown;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -114,61 +114,61 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
|||||||
// we have at least 4 characters so we may finally decide whether
|
// we have at least 4 characters so we may finally decide whether
|
||||||
// we have a BOM or not
|
// we have a BOM or not
|
||||||
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
|
if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' )
|
||||||
return BOM_UTF8;
|
return wxBOM_UTF8;
|
||||||
|
|
||||||
if ( src[0] == '\x00' && src[1] == '\x00' &&
|
if ( src[0] == '\x00' && src[1] == '\x00' &&
|
||||||
src[2] == '\xFE' && src[3] == '\xFF' )
|
src[2] == '\xFE' && src[3] == '\xFF' )
|
||||||
return BOM_UTF32BE;
|
return wxBOM_UTF32BE;
|
||||||
|
|
||||||
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
|
if ( src[0] == '\xFF' && src[1] == '\xFE' &&
|
||||||
src[2] == '\x00' && src[3] == '\x00' )
|
src[2] == '\x00' && src[3] == '\x00' )
|
||||||
return BOM_UTF32LE;
|
return wxBOM_UTF32LE;
|
||||||
|
|
||||||
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
if ( src[0] == '\xFE' && src[1] == '\xFF' )
|
||||||
return BOM_UTF16BE;
|
return wxBOM_UTF16BE;
|
||||||
|
|
||||||
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
if ( src[0] == '\xFF' && src[1] == '\xFE' )
|
||||||
return BOM_UTF16LE;
|
return wxBOM_UTF16LE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return BOM_None;
|
return wxBOM_None;
|
||||||
}
|
}
|
||||||
|
|
||||||
void wxConvAuto::InitFromBOM(BOMType bomType)
|
void wxConvAuto::InitFromBOM(wxBOM bomType)
|
||||||
{
|
{
|
||||||
m_consumedBOM = false;
|
m_consumedBOM = false;
|
||||||
|
|
||||||
switch ( bomType )
|
switch ( bomType )
|
||||||
{
|
{
|
||||||
case BOM_Unknown:
|
case wxBOM_Unknown:
|
||||||
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_None:
|
case wxBOM_None:
|
||||||
// use the default
|
// use the default
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF32BE:
|
case wxBOM_UTF32BE:
|
||||||
m_conv = new wxMBConvUTF32BE;
|
m_conv = new wxMBConvUTF32BE;
|
||||||
m_ownsConv = true;
|
m_ownsConv = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF32LE:
|
case wxBOM_UTF32LE:
|
||||||
m_conv = new wxMBConvUTF32LE;
|
m_conv = new wxMBConvUTF32LE;
|
||||||
m_ownsConv = true;
|
m_ownsConv = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF16BE:
|
case wxBOM_UTF16BE:
|
||||||
m_conv = new wxMBConvUTF16BE;
|
m_conv = new wxMBConvUTF16BE;
|
||||||
m_ownsConv = true;
|
m_ownsConv = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF16LE:
|
case wxBOM_UTF16LE:
|
||||||
m_conv = new wxMBConvUTF16LE;
|
m_conv = new wxMBConvUTF16LE;
|
||||||
m_ownsConv = true;
|
m_ownsConv = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF8:
|
case wxBOM_UTF8:
|
||||||
InitWithUTF8();
|
InitWithUTF8();
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -191,25 +191,25 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
|
|||||||
int ofs;
|
int ofs;
|
||||||
switch ( m_bomType )
|
switch ( m_bomType )
|
||||||
{
|
{
|
||||||
case BOM_Unknown:
|
case wxBOM_Unknown:
|
||||||
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
wxFAIL_MSG( "shouldn't be called for this BOM type" );
|
||||||
return;
|
return;
|
||||||
|
|
||||||
case BOM_None:
|
case wxBOM_None:
|
||||||
ofs = 0;
|
ofs = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF32BE:
|
case wxBOM_UTF32BE:
|
||||||
case BOM_UTF32LE:
|
case wxBOM_UTF32LE:
|
||||||
ofs = 4;
|
ofs = 4;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF16BE:
|
case wxBOM_UTF16BE:
|
||||||
case BOM_UTF16LE:
|
case wxBOM_UTF16LE:
|
||||||
ofs = 2;
|
ofs = 2;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF8:
|
case wxBOM_UTF8:
|
||||||
ofs = 3;
|
ofs = 3;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -226,7 +226,7 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const
|
|||||||
bool wxConvAuto::InitFromInput(const char *src, size_t len)
|
bool wxConvAuto::InitFromInput(const char *src, size_t len)
|
||||||
{
|
{
|
||||||
m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
|
m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len);
|
||||||
if ( m_bomType == BOM_Unknown )
|
if ( m_bomType == wxBOM_Unknown )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
InitFromBOM(m_bomType);
|
InitFromBOM(m_bomType);
|
||||||
@@ -275,7 +275,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
|||||||
|
|
||||||
// try to convert using the auto-detected encoding
|
// try to convert using the auto-detected encoding
|
||||||
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||||
if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
|
if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
|
||||||
{
|
{
|
||||||
// if the conversion failed but we didn't really detect anything and
|
// if the conversion failed but we didn't really detect anything and
|
||||||
// simply tried UTF-8 by default, retry it using the fall-back
|
// simply tried UTF-8 by default, retry it using the fall-back
|
||||||
|
Reference in New Issue
Block a user