use fallback encoding in wxConvAuto when input is not in UTF-8
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@48463 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -70,6 +70,7 @@
|
||||
\input cshelp.tex
|
||||
\input control.tex
|
||||
\input ctrlsub.tex
|
||||
\input convauto.tex
|
||||
\input countstr.tex
|
||||
\input critsect.tex
|
||||
\input crtslock.tex
|
||||
|
115
docs/latex/wx/convauto.tex
Normal file
115
docs/latex/wx/convauto.tex
Normal file
@@ -0,0 +1,115 @@
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%% Name: convauto.tex
|
||||
%% Purpose: wxConvAuto documentation
|
||||
%% Author: Vadim Zeitlin
|
||||
%% Created: 2007-08-26
|
||||
%% RCS-ID: $Id:$
|
||||
%% Copyright: (c) 2007 Vadim Zeitlin <vadim@wxwidgets.org>
|
||||
%% License: wxWindows license
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
\section{\class{wxConvAuto}}\label{wxconvauto}
|
||||
|
||||
This class implements a Unicode to/from multibyte converter capable of
|
||||
automatically recognizing the encoding of the multibyte text on input. The
|
||||
logic used is very simple: the class uses the BOM (byte order mark) if it's
|
||||
present and tries to interpret the input as UTF-8 otherwise. If this fails, the
|
||||
input is interpreted as being in the default multibyte encoding which can be
|
||||
specified in the constructor of a wxConvAuto instance and, in turn, defaults to
|
||||
the value of \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding} if
|
||||
not explicitly given.
|
||||
|
||||
For the conversion from Unicode to multibyte, the same encoding as was
|
||||
previously used for multibyte to Unicode conversion is reused. If there had
|
||||
been no previous multibyte to Unicode conversion, UTF-8 is used by default.
|
||||
Notice that once the multibyte encoding is automatically detected, it doesn't
|
||||
change any more, i.e. it is entirely determined by the first use of wxConvAuto
|
||||
object in the multibyte-to-Unicode direction. However creating a copy of
|
||||
wxConvAuto object, either via the usual copy constructor or assignment
|
||||
operator, or using \helpref{Clone}{wxmbconvclone} method, resets the
|
||||
automatically detected encoding so that the new copy will try to detect the
|
||||
encoding of the input on first use.
|
||||
|
||||
This class is used by default in wxWidgets classes and functions reading text
|
||||
from files such as \helpref{wxFile}{wxfile}, \helpref{wxFFile}{wxffile},
|
||||
\helpref{wxTextFile}{wxtextfile}, \helpref{wxFileConfig}{wxfileconfig} and
|
||||
various stream classes so the encoding set with its
|
||||
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method will
|
||||
affect how these classes treat input files. In particular, use this method
|
||||
to change the fall-back multibyte encoding used to interpret the contents of
|
||||
the files whose contents aren't valid UTF-8 or to disallow it completely.
|
||||
|
||||
\wxheading{Derived from}
|
||||
|
||||
\helpref{wxMBConv}{mbconv}
|
||||
|
||||
\wxheading{Include files}
|
||||
|
||||
<wx/convauto.h>
|
||||
|
||||
\wxheading{Library}
|
||||
|
||||
\helpref{wxBase}{librarieslist}
|
||||
|
||||
\wxheading{See also}
|
||||
|
||||
\helpref{wxMBConv classes overview}{mbconvclasses}
|
||||
|
||||
|
||||
\latexignore{\rtfignore{\wxheading{Members}}}
|
||||
|
||||
\membersection{wxConvAuto::wxConvAuto}\label{wxconvautowxconvauto}
|
||||
|
||||
\func{}{wxConvAuto}{\param{wxFontEncoding }{enc = wxFONTENCODING\_DEFAULT}}
|
||||
|
||||
Constructs a new wxConvAuto instance. The object will try to detect the encoding
|
||||
of the multibyte text given to its \helpref{ToWChar}{wxmbconvtowchar} method
|
||||
automatically but if the automatic detection of Unicode encodings fails, the
|
||||
fall-back encoding \arg{enc} will be used to interpret it as multibyte text.
|
||||
The default value of this parameter, \texttt{wxFONTENCODING\_DEFAULT} means
|
||||
that the global default value which can be set using
|
||||
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method should be
|
||||
used. As with that method, passing \texttt{wxFONTENCODING\_MAX} inhibits using
|
||||
this encoding completely so the input multibyte text will always be interpreted
|
||||
as UTF-8 in the absence of BOM and the conversion will fail if the input
|
||||
doesn't form a valid UTF-8 sequence. Another special value is
|
||||
\texttt{wxFONTENCODING\_SYSTEM} which means to use the encoding currently used
|
||||
on the user system, i.e. the encoding returned by
|
||||
\helpref{wxLocale::GetSystemEncoding}{wxlocalegetsystemencoding}. Any other
|
||||
encoding will be used as is, e.g. passing \texttt{wxFONTENCODING\_ISO8859\_1}
|
||||
ensures that non-UTF-8 input will be treated as latin1.
|
||||
|
||||
|
||||
\membersection{wxConvAuto::DisableFallbackEncoding}\label{wxconvautodisablefallbackencoding}
|
||||
|
||||
\func{static void}{DisableFallbackEncoding}{\void}
|
||||
|
||||
Disables the use of the fall-back encoding: if the input doesn't have a BOM and
|
||||
is not valid UTF-8, the conversion will fail.
|
||||
|
||||
|
||||
\membersection{wxConvAuto::GetFallbackEncoding}\label{wxconvautogetdefaultmbencoding}
|
||||
|
||||
\func{static wxFontEncoding}{GetFallbackEncoding}{\void}
|
||||
|
||||
Returns the encoding used by default by wxConvAuto if no other encoding is
|
||||
explicitly specified in constructor. By default, returns
|
||||
\texttt{wxFONTENCODING\_ISO8859\_1} but can be changed using
|
||||
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method.
|
||||
|
||||
|
||||
\membersection{wxConvAuto::SetFallbackEncoding}\label{wxconvautosetdefaultmbencoding}
|
||||
|
||||
\func{static void}{SetFallbackEncoding}{\param{wxFontEncoding }{enc}}
|
||||
|
||||
Changes the encoding used by default by wxConvAuto if no other encoding is
|
||||
explicitly specified in constructor. The default value, which can be retrieved
|
||||
using \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding}, is
|
||||
\texttt{wxFONTENCODING\_ISO8859\_1}.
|
||||
|
||||
Special values of \texttt{wxFONTENCODING\_SYSTEM} or
|
||||
\texttt{wxFONTENCODING\_MAX} can be used for \arg{enc} parameter to use the
|
||||
encoding of the current user locale as fall back or not use any encoding for
|
||||
fall back at all, respectively (just as with the similar constructor
|
||||
parameter). However \texttt{wxFONTENCODING\_DEFAULT} value cannot be used here.
|
||||
|
@@ -12,6 +12,7 @@
|
||||
#define _WX_CONVAUTO_H_
|
||||
|
||||
#include "wx/strconv.h"
|
||||
#include "wx/fontenc.h"
|
||||
|
||||
#if wxUSE_WCHAR_T
|
||||
|
||||
@@ -23,13 +24,39 @@ class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
|
||||
{
|
||||
public:
|
||||
// default ctor, the real conversion will be created on demand
|
||||
wxConvAuto() { m_conv = NULL; /* the rest will be initialized later */ }
|
||||
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT)
|
||||
{
|
||||
m_conv = NULL; // the rest will be initialized later
|
||||
m_encDefault = enc;
|
||||
}
|
||||
|
||||
// copy ctor doesn't initialize the conversion either as it can only be
|
||||
// deduced on first use
|
||||
wxConvAuto(const wxConvAuto& WXUNUSED(other)) : wxMBConv() { m_conv = NULL; }
|
||||
wxConvAuto(const wxConvAuto& other) : wxMBConv()
|
||||
{
|
||||
m_conv = NULL;
|
||||
m_encDefault = other.m_encDefault;
|
||||
}
|
||||
|
||||
// Release the conversion object, but only if one was actually created and
// we own it. The constructors set m_conv to NULL and leave m_ownsConv
// unset, so m_ownsConv must not be read while m_conv is still NULL: the
// m_conv test has to come first.
virtual ~wxConvAuto()
{
    if ( m_conv && m_ownsConv )
        delete m_conv;
}
|
||||
|
||||
// get/set the fall-back encoding used when the input text doesn't have BOM
|
||||
// and isn't UTF-8
|
||||
//
|
||||
// special values are wxFONTENCODING_MAX meaning not to use any fall back
|
||||
// at all (but just fail to convert in this case) and wxFONTENCODING_SYSTEM
|
||||
// meaning to use the encoding of the system locale
|
||||
static wxFontEncoding GetFallbackEncoding() { return ms_defaultMBEncoding; }
|
||||
static void SetFallbackEncoding(wxFontEncoding enc);
|
||||
static void DisableFallbackEncoding()
|
||||
{
|
||||
SetFallbackEncoding(wxFONTENCODING_MAX);
|
||||
}
|
||||
|
||||
virtual ~wxConvAuto() { if ( m_conv && m_ownsConv ) delete m_conv; }
|
||||
|
||||
// override the base class virtual function(s) to use our m_conv
|
||||
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
||||
@@ -57,8 +84,8 @@ private:
|
||||
// return the BOM type of this buffer
|
||||
static BOMType DetectBOM(const char *src, size_t srcLen);
|
||||
|
||||
// initialize m_conv with the conversion to use by default (UTF-8)
|
||||
void InitWithDefault()
|
||||
// initialize m_conv with the UTF-8 conversion
|
||||
void InitWithUTF8()
|
||||
{
|
||||
m_conv = &wxConvUTF8;
|
||||
m_ownsConv = false;
|
||||
@@ -76,10 +103,17 @@ private:
|
||||
void SkipBOM(const char **src, size_t *len) const;
|
||||
|
||||
|
||||
// fall-back multibyte encoding to use, may be wxFONTENCODING_SYSTEM or
|
||||
// wxFONTENCODING_MAX but not wxFONTENCODING_DEFAULT
|
||||
static wxFontEncoding ms_defaultMBEncoding;
|
||||
|
||||
// conversion object which we really use, NULL until the first call to
|
||||
// either ToWChar() or FromWChar()
|
||||
wxMBConv *m_conv;
|
||||
|
||||
// the multibyte encoding to use by default if input isn't Unicode
|
||||
wxFontEncoding m_encDefault;
|
||||
|
||||
// our BOM type
|
||||
BOMType m_bomType;
|
||||
|
||||
|
@@ -30,10 +30,25 @@
|
||||
|
||||
#include "wx/convauto.h"
|
||||
|
||||
// we use latin1 by default as it seems the least bad choice: the files we need
|
||||
// to detect the encoding of don't always come from the user system (they are often
|
||||
// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
|
||||
// seem to be a good idea and there is no other reasonable alternative
|
||||
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
|
||||
|
||||
// ============================================================================
|
||||
// implementation
|
||||
// ============================================================================
|
||||
|
||||
/* static */
|
||||
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
|
||||
{
|
||||
wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
|
||||
_T("wxFONTENCODING_DEFAULT doesn't make sense here") );
|
||||
|
||||
ms_defaultMBEncoding = enc;
|
||||
}
|
||||
|
||||
/* static */
|
||||
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
||||
{
|
||||
@@ -118,8 +133,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
||||
break;
|
||||
|
||||
case BOM_UTF8:
|
||||
m_conv = &wxConvUTF8;
|
||||
m_ownsConv = false;
|
||||
InitWithUTF8();
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -127,7 +141,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
||||
// fall through: still need to create something
|
||||
|
||||
case BOM_None:
|
||||
InitWithDefault();
|
||||
InitWithUTF8();
|
||||
m_consumedBOM = true; // as there is nothing to consume
|
||||
}
|
||||
}
|
||||
@@ -194,7 +208,27 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
SkipBOM(&src, &srcLen);
|
||||
}
|
||||
|
||||
return m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||
// try to convert using the auto-detected encoding
|
||||
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||
if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
|
||||
{
|
||||
// if the conversion failed but we didn't really detect anything and
|
||||
// simply tried UTF-8 by default, retry it using the fall-back
|
||||
if ( m_encDefault != wxFONTENCODING_MAX )
|
||||
{
|
||||
if ( m_ownsConv )
|
||||
delete m_conv;
|
||||
|
||||
self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
|
||||
? GetFallbackEncoding()
|
||||
: m_encDefault);
|
||||
self->m_ownsConv = true;
|
||||
|
||||
rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
size_t
|
||||
@@ -204,7 +238,7 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
|
||||
if ( !m_conv )
|
||||
{
|
||||
// default to UTF-8 for the multibyte output
|
||||
wx_const_cast(wxConvAuto *, this)->InitWithDefault();
|
||||
wx_const_cast(wxConvAuto *, this)->InitWithUTF8();
|
||||
}
|
||||
|
||||
return m_conv->FromWChar(dst, dstLen, src, srcLen);
|
||||
|
Reference in New Issue
Block a user