use fallback encoding in wxConvAuto when input is not in UTF-8

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@48463 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin
2007-08-30 17:54:28 +00:00
parent b61af83717
commit 01a9232b5e
4 changed files with 194 additions and 10 deletions

View File

@@ -70,6 +70,7 @@
\input cshelp.tex \input cshelp.tex
\input control.tex \input control.tex
\input ctrlsub.tex \input ctrlsub.tex
\input convauto.tex
\input countstr.tex \input countstr.tex
\input critsect.tex \input critsect.tex
\input crtslock.tex \input crtslock.tex

115
docs/latex/wx/convauto.tex Normal file
View File

@@ -0,0 +1,115 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Name: convauto.tex
%% Purpose: wxConvAuto documentation
%% Author: Vadim Zeitlin
%% Created: 2007-08-26
%% RCS-ID: $Id:$
%% Copyright: (c) 2007 Vadim Zeitlin <vadim@wxwidgets.org>
%% License: wxWindows license
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{\class{wxConvAuto}}\label{wxconvauto}
This class implements a Unicode to/from multibyte converter capable of
automatically recognizing the encoding of the multibyte text on input. The
logic used is very simple: the class uses the BOM (byte order mark) if it's
present and tries to interpret the input as UTF-8 otherwise. If this fails, the
input is interpreted as being in the default multibyte encoding which can be
specified in the constructor of a wxConvAuto instance and, in turn, defaults to
the value of \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding} if
not explicitly given.
For the conversion from Unicode to multibyte, the same encoding as was
previously used for multibyte to Unicode conversion is reused. If there had
been no previous multibyte to Unicode conversion, UTF-8 is used by default.
Notice that once the multibyte encoding is automatically detected, it doesn't
change any more, i.e. it is entirely determined by the first use of a wxConvAuto
object in the multibyte-to-Unicode direction. However, creating a copy of a
wxConvAuto object, either via the usual copy constructor or assignment
operator, or using the \helpref{Clone}{wxmbconvclone} method, resets the
automatically detected encoding so that the new copy will try to detect the
encoding of the input on first use.
This class is used by default in wxWidgets classes and functions reading text
from files such as \helpref{wxFile}{wxfile}, \helpref{wxFFile}{wxffile},
\helpref{wxTextFile}{wxtextfile}, \helpref{wxFileConfig}{wxfileconfig} and
various stream classes so the encoding set with its
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method will
affect how these classes treat input files. In particular, use this method
to change the fall-back multibyte encoding used to interpret the contents of
the files whose contents aren't valid UTF-8 or to disallow the use of any
fall-back encoding completely.
\wxheading{Derived from}
\helpref{wxMBConv}{mbconv}
\wxheading{Include files}
<wx/convauto.h>
\wxheading{Library}
\helpref{wxBase}{librarieslist}
\wxheading{See also}
\helpref{wxMBConv classes overview}{mbconvclasses}
\latexignore{\rtfignore{\wxheading{Members}}}
\membersection{wxConvAuto::wxConvAuto}\label{wxconvautowxconvauto}
\func{}{wxConvAuto}{\param{wxFontEncoding }{enc = wxFONTENCODING\_DEFAULT}}
Constructs a new wxConvAuto instance. The object will try to detect the encoding
of the multibyte text given to its \helpref{ToWChar}{wxmbconvtowchar} method
automatically but if the automatic detection of Unicode encodings fails, the
fall-back encoding \arg{enc} will be used to interpret it as multibyte text.
The default value of this parameter, \texttt{wxFONTENCODING\_DEFAULT}, means
that the global default value which can be set using the
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method should be
used. As with that method, passing \texttt{wxFONTENCODING\_MAX} inhibits using
this encoding completely so the input multibyte text will always be interpreted
as UTF-8 in the absence of a BOM and the conversion will fail if the input
doesn't form a valid UTF-8 sequence. Another special value is
\texttt{wxFONTENCODING\_SYSTEM} which means to use the encoding currently used
on the user system, i.e. the encoding returned by
\helpref{wxLocale::GetSystemEncoding}{wxlocalegetsystemencoding}. Any other
encoding will be used as is, e.g. passing \texttt{wxFONTENCODING\_ISO8859\_1}
ensures that non-UTF-8 input will be treated as latin1.
\membersection{wxConvAuto::DisableFallbackEncoding}\label{wxconvautodisablefallbackencoding}
\func{static void}{DisableFallbackEncoding}{\void}
Disables the use of the fall-back encoding: if the input doesn't have a BOM and
is not valid UTF-8, the conversion will fail.
\membersection{wxConvAuto::GetFallbackEncoding}\label{wxconvautogetdefaultmbencoding}
\func{static wxFontEncoding}{GetFallbackEncoding}{\void}
Returns the encoding used by default by wxConvAuto if no other encoding is
explicitly specified in constructor. By default, returns
\texttt{wxFONTENCODING\_ISO8859\_1} but can be changed using
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method.
\membersection{wxConvAuto::SetFallbackEncoding}\label{wxconvautosetdefaultmbencoding}
\func{static void}{SetFallbackEncoding}{\param{wxFontEncoding }{enc}}
Changes the encoding used by default by wxConvAuto if no other encoding is
explicitly specified in constructor. The default value, which can be retrieved
using \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding}, is
\texttt{wxFONTENCODING\_ISO8859\_1}.
Special values of \texttt{wxFONTENCODING\_SYSTEM} or
\texttt{wxFONTENCODING\_MAX} can be used for \arg{enc} parameter to use the
encoding of the current user locale as fall back or not use any encoding for
fall back at all, respectively (just as with the similar constructor
parameter). However \texttt{wxFONTENCODING\_DEFAULT} value cannot be used here.

View File

@@ -12,6 +12,7 @@
#define _WX_CONVAUTO_H_ #define _WX_CONVAUTO_H_
#include "wx/strconv.h" #include "wx/strconv.h"
#include "wx/fontenc.h"
#if wxUSE_WCHAR_T #if wxUSE_WCHAR_T
@@ -23,13 +24,39 @@ class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
{ {
public: public:
// default ctor, the real conversion will be created on demand // default ctor, the real conversion will be created on demand
wxConvAuto() { m_conv = NULL; /* the rest will be initialized later */ } wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT)
{
m_conv = NULL; // the rest will be initialized later
m_encDefault = enc;
}
// copy ctor doesn't initialize anything neither as conversion can only be // copy ctor doesn't initialize anything neither as conversion can only be
// deduced on first use // deduced on first use
wxConvAuto(const wxConvAuto& WXUNUSED(other)) : wxMBConv() { m_conv = NULL; } wxConvAuto(const wxConvAuto& other) : wxMBConv()
{
m_conv = NULL;
m_encDefault = other.m_encDefault;
}
// destroy the conversion object, but only if we created (and hence own) it
//
// both constructors set m_conv to NULL but leave m_ownsConv unset, so m_conv
// must be tested first: otherwise destroying an object which was never used
// for any conversion would read an indeterminate flag
virtual ~wxConvAuto()
{
    if ( m_conv && m_ownsConv )
        delete m_conv;
}
// get/set the fall-back encoding used when the input text doesn't have BOM
// and isn't UTF-8
//
// special values are wxFONTENCODING_MAX meaning not to use any fall back
// at all (but just fail to convert in this case) and wxFONTENCODING_SYSTEM
// meaning to use the encoding of the system locale
//
// these functions are static: they access the class-wide default stored in
// ms_defaultMBEncoding, not the per-object m_encDefault

// return the current class-wide fall-back encoding
static wxFontEncoding GetFallbackEncoding() { return ms_defaultMBEncoding; }
// change the class-wide fall-back encoding (defined out of line)
static void SetFallbackEncoding(wxFontEncoding enc);
// convenience wrapper: same as SetFallbackEncoding(wxFONTENCODING_MAX), i.e.
// don't use any fall-back encoding at all
static void DisableFallbackEncoding()
{
SetFallbackEncoding(wxFONTENCODING_MAX);
}
virtual ~wxConvAuto() { if ( m_conv && m_ownsConv ) delete m_conv; }
// override the base class virtual function(s) to use our m_conv // override the base class virtual function(s) to use our m_conv
virtual size_t ToWChar(wchar_t *dst, size_t dstLen, virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
@@ -57,8 +84,8 @@ private:
// return the BOM type of this buffer // return the BOM type of this buffer
static BOMType DetectBOM(const char *src, size_t srcLen); static BOMType DetectBOM(const char *src, size_t srcLen);
// initialize m_conv with the conversion to use by default (UTF-8) // initialize m_conv with the UTF-8 conversion
void InitWithDefault() void InitWithUTF8()
{ {
m_conv = &wxConvUTF8; m_conv = &wxConvUTF8;
m_ownsConv = false; m_ownsConv = false;
@@ -76,10 +103,17 @@ private:
void SkipBOM(const char **src, size_t *len) const; void SkipBOM(const char **src, size_t *len) const;
// fall-back multibyte encoding to use, may be wxFONTENCODING_SYSTEM or
// wxFONTENCODING_MAX but not wxFONTENCODING_DEFAULT
static wxFontEncoding ms_defaultMBEncoding;
// conversion object which we really use, NULL until the first call to // conversion object which we really use, NULL until the first call to
// either ToWChar() or FromWChar() // either ToWChar() or FromWChar()
wxMBConv *m_conv; wxMBConv *m_conv;
// the multibyte encoding to use by default if input isn't Unicode
wxFontEncoding m_encDefault;
// our BOM type // our BOM type
BOMType m_bomType; BOMType m_bomType;

View File

@@ -30,10 +30,25 @@
#include "wx/convauto.h" #include "wx/convauto.h"
// initial value of the class-wide fall-back encoding
//
// we use latin1 by default as it seems the least bad choice: the files whose
// encoding we need to detect don't always come from the user system (they are
// often received from other machines) and so using wxFONTENCODING_SYSTEM
// doesn't seem to be a good idea and there is no other reasonable alternative
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
// ============================================================================ // ============================================================================
// implementation // implementation
// ============================================================================ // ============================================================================
// Set the class-wide fall-back encoding stored in ms_defaultMBEncoding.
//
// enc must not be wxFONTENCODING_DEFAULT: that value is checked for with an
// assertion below.
/* static */
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
{
wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
_T("wxFONTENCODING_DEFAULT doesn't make sense here") );
// NOTE(review): the value is stored unconditionally, i.e. also after a failed
// assertion -- presumably intentional, confirm
ms_defaultMBEncoding = enc;
}
/* static */ /* static */
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
{ {
@@ -118,8 +133,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
break; break;
case BOM_UTF8: case BOM_UTF8:
m_conv = &wxConvUTF8; InitWithUTF8();
m_ownsConv = false;
break; break;
default: default:
@@ -127,7 +141,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
// fall through: still need to create something // fall through: still need to create something
case BOM_None: case BOM_None:
InitWithDefault(); InitWithUTF8();
m_consumedBOM = true; // as there is nothing to consume m_consumedBOM = true; // as there is nothing to consume
} }
} }
@@ -194,7 +208,27 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
SkipBOM(&src, &srcLen); SkipBOM(&src, &srcLen);
} }
return m_conv->ToWChar(dst, dstLen, src, srcLen); // try to convert using the auto-detected encoding
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
{
// if the conversion failed but we didn't really detect anything and
// simply tried UTF-8 by default, retry it using the fall-back
if ( m_encDefault != wxFONTENCODING_MAX )
{
if ( m_ownsConv )
delete m_conv;
self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
? GetFallbackEncoding()
: m_encDefault);
self->m_ownsConv = true;
rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
}
}
return rc;
} }
size_t size_t
@@ -204,7 +238,7 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
if ( !m_conv ) if ( !m_conv )
{ {
// default to UTF-8 for the multibyte output // default to UTF-8 for the multibyte output
wx_const_cast(wxConvAuto *, this)->InitWithDefault(); wx_const_cast(wxConvAuto *, this)->InitWithUTF8();
} }
return m_conv->FromWChar(dst, dstLen, src, srcLen); return m_conv->FromWChar(dst, dstLen, src, srcLen);