use fallback encoding in wxConvAuto when input is not in UTF-8
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@48463 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -70,6 +70,7 @@
|
|||||||
\input cshelp.tex
|
\input cshelp.tex
|
||||||
\input control.tex
|
\input control.tex
|
||||||
\input ctrlsub.tex
|
\input ctrlsub.tex
|
||||||
|
\input convauto.tex
|
||||||
\input countstr.tex
|
\input countstr.tex
|
||||||
\input critsect.tex
|
\input critsect.tex
|
||||||
\input crtslock.tex
|
\input crtslock.tex
|
||||||
|
115
docs/latex/wx/convauto.tex
Normal file
115
docs/latex/wx/convauto.tex
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
%% Name: convauto.tex
|
||||||
|
%% Purpose: wxConvAuto documentation
|
||||||
|
%% Author: Vadim Zeitlin
|
||||||
|
%% Created: 2007-08-26
|
||||||
|
%% RCS-ID: $Id:$
|
||||||
|
%% Copyright: (c) 2007 Vadim Zeitlin <vadim@wxwidgets.org>
|
||||||
|
%% License: wxWindows license
|
||||||
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
|
\section{\class{wxConvAuto}}\label{wxconvauto}
|
||||||
|
|
||||||
|
This class implements a Unicode to/from multibyte converter capable of
|
||||||
|
automatically recognizing the encoding of the multibyte text on input. The
|
||||||
|
logic used is very simple: the class uses the BOM (byte order mark) if it's
|
||||||
|
present and tries to interpret the input as UTF-8 otherwise. If this fails, the
|
||||||
|
input is interpreted as being in the default multibyte encoding which can be
|
||||||
|
specified in the constructor of a wxConvAuto instance and, in turn, defaults to
|
||||||
|
the value of \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding} if
|
||||||
|
not explicitly given.
|
||||||
|
|
||||||
|
For the conversion from Unicode to multibyte, the same encoding as was
|
||||||
|
previously used for multibyte to Unicode conversion is reused. If there had
|
||||||
|
been no previous multibyte to Unicode conversion, UTF-8 is used by default.
|
||||||
|
Notice that once the multibyte encoding is automatically detected, it doesn't
|
||||||
|
change any more, i.e. it is entirely determined by the first use of wxConvAuto
|
||||||
|
object in the multibyte-to-Unicode direction. However, creating a copy of a
|
||||||
|
wxConvAuto object, either via the usual copy constructor or assignment
|
||||||
|
operator, or using the \helpref{Clone}{wxmbconvclone} method, resets the
|
||||||
|
automatically detected encoding so that the new copy will try to detect the
|
||||||
|
encoding of the input on first use.
|
||||||
|
|
||||||
|
This class is used by default in wxWidgets classes and functions reading text
|
||||||
|
from files such as \helpref{wxFile}{wxfile}, \helpref{wxFFile}{wxffile},
|
||||||
|
\helpref{wxTextFile}{wxtextfile}, \helpref{wxFileConfig}{wxfileconfig} and
|
||||||
|
various stream classes so the encoding set with its
|
||||||
|
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method will
|
||||||
|
affect how these classes treat input files. In particular, use this method
|
||||||
|
to change the fall-back multibyte encoding used to interpret the contents of
|
||||||
|
the files whose contents aren't valid UTF-8 or to disallow it completely.
|
||||||
|
|
||||||
|
\wxheading{Derived from}
|
||||||
|
|
||||||
|
\helpref{wxMBConv}{mbconv}
|
||||||
|
|
||||||
|
\wxheading{Include files}
|
||||||
|
|
||||||
|
<wx/convauto.h>
|
||||||
|
|
||||||
|
\wxheading{Library}
|
||||||
|
|
||||||
|
\helpref{wxBase}{librarieslist}
|
||||||
|
|
||||||
|
\wxheading{See also}
|
||||||
|
|
||||||
|
\helpref{wxMBConv classes overview}{mbconvclasses}
|
||||||
|
|
||||||
|
|
||||||
|
\latexignore{\rtfignore{\wxheading{Members}}}
|
||||||
|
|
||||||
|
\membersection{wxConvAuto::wxConvAuto}\label{wxconvautowxconvauto}
|
||||||
|
|
||||||
|
\func{}{wxConvAuto}{\param{wxFontEncoding }{enc = wxFONTENCODING\_DEFAULT}}
|
||||||
|
|
||||||
|
Constructs a new wxConvAuto instance. The object will try to detect the encoding
|
||||||
|
of the multibyte text given to its \helpref{ToWChar}{wxmbconvtowchar} method
|
||||||
|
automatically but if the automatic detection of Unicode encodings fails, the
|
||||||
|
fall-back encoding \arg{enc} will be used to interpret it as multibyte text.
|
||||||
|
The default value of this parameter, \texttt{wxFONTENCODING\_DEFAULT}, means
|
||||||
|
that the global default value which can be set using
|
||||||
|
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method should be
|
||||||
|
used. As with that method, passing \texttt{wxFONTENCODING\_MAX} inhibits using
|
||||||
|
this encoding completely so the input multibyte text will always be interpreted
|
||||||
|
as UTF-8 in the absence of BOM and the conversion will fail if the input
|
||||||
|
doesn't form a valid UTF-8 sequence. Another special value is
|
||||||
|
\texttt{wxFONTENCODING\_SYSTEM} which means to use the encoding currently used
|
||||||
|
on the user system, i.e. the encoding returned by
|
||||||
|
\helpref{wxLocale::GetSystemEncoding}{wxlocalegetsystemencoding}. Any other
|
||||||
|
encoding will be used as is, e.g. passing \texttt{wxFONTENCODING\_ISO8859\_1}
|
||||||
|
ensures that non-UTF-8 input will be treated as latin1.
|
||||||
|
|
||||||
|
|
||||||
|
\membersection{wxConvAuto::DisableFallbackEncoding}\label{wxconvautodisablefallbackencoding}
|
||||||
|
|
||||||
|
\func{static void}{DisableFallbackEncoding}{\void}
|
||||||
|
|
||||||
|
Disable the use of the fall-back encoding: if the input doesn't have a BOM and
|
||||||
|
is not valid UTF-8, the conversion will fail.
|
||||||
|
|
||||||
|
|
||||||
|
\membersection{wxConvAuto::GetFallbackEncoding}\label{wxconvautogetdefaultmbencoding}
|
||||||
|
|
||||||
|
\func{static wxFontEncoding}{GetFallbackEncoding}{\void}
|
||||||
|
|
||||||
|
Returns the encoding used by default by wxConvAuto if no other encoding is
|
||||||
|
explicitly specified in the constructor. By default, returns
|
||||||
|
\texttt{wxFONTENCODING\_ISO8859\_1} but can be changed using
|
||||||
|
\helpref{SetFallbackEncoding}{wxconvautosetdefaultmbencoding} method.
|
||||||
|
|
||||||
|
|
||||||
|
\membersection{wxConvAuto::SetFallbackEncoding}\label{wxconvautosetdefaultmbencoding}
|
||||||
|
|
||||||
|
\func{static void}{SetFallbackEncoding}{\param{wxFontEncoding }{enc}}
|
||||||
|
|
||||||
|
Changes the encoding used by default by wxConvAuto if no other encoding is
|
||||||
|
explicitly specified in the constructor. The default value, which can be retrieved
|
||||||
|
using \helpref{GetFallbackEncoding}{wxconvautogetdefaultmbencoding}, is
|
||||||
|
\texttt{wxFONTENCODING\_ISO8859\_1}.
|
||||||
|
|
||||||
|
Special values of \texttt{wxFONTENCODING\_SYSTEM} or
|
||||||
|
\texttt{wxFONTENCODING\_MAX} can be used for \arg{enc} parameter to use the
|
||||||
|
encoding of the current user locale as fall back or not use any encoding for
|
||||||
|
fall back at all, respectively (just as with the similar constructor
|
||||||
|
parameter). However, the \texttt{wxFONTENCODING\_DEFAULT} value cannot be used here.
|
||||||
|
|
@@ -12,6 +12,7 @@
|
|||||||
#define _WX_CONVAUTO_H_
|
#define _WX_CONVAUTO_H_
|
||||||
|
|
||||||
#include "wx/strconv.h"
|
#include "wx/strconv.h"
|
||||||
|
#include "wx/fontenc.h"
|
||||||
|
|
||||||
#if wxUSE_WCHAR_T
|
#if wxUSE_WCHAR_T
|
||||||
|
|
||||||
@@ -23,13 +24,39 @@ class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// default ctor, the real conversion will be created on demand
|
// default ctor, the real conversion will be created on demand
|
||||||
wxConvAuto() { m_conv = NULL; /* the rest will be initialized later */ }
|
wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT)
|
||||||
|
{
|
||||||
|
m_conv = NULL; // the rest will be initialized later
|
||||||
|
m_encDefault = enc;
|
||||||
|
}
|
||||||
|
|
||||||
// copy ctor doesn't initialize anything either as conversion can only be
|
// copy ctor doesn't initialize anything either as conversion can only be
|
||||||
// deduced on first use
|
// deduced on first use
|
||||||
wxConvAuto(const wxConvAuto& WXUNUSED(other)) : wxMBConv() { m_conv = NULL; }
|
wxConvAuto(const wxConvAuto& other) : wxMBConv()
|
||||||
|
{
|
||||||
|
m_conv = NULL;
|
||||||
|
m_encDefault = other.m_encDefault;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual ~wxConvAuto()
|
||||||
|
{
|
||||||
|
if ( m_ownsConv )
|
||||||
|
delete m_conv;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get/set the fall-back encoding used when the input text doesn't have BOM
|
||||||
|
// and isn't UTF-8
|
||||||
|
//
|
||||||
|
// special values are wxFONTENCODING_MAX meaning not to use any fall back
|
||||||
|
// at all (but just fail to convert in this case) and wxFONTENCODING_SYSTEM
|
||||||
|
// meaning to use the encoding of the system locale
|
||||||
|
static wxFontEncoding GetFallbackEncoding() { return ms_defaultMBEncoding; }
|
||||||
|
static void SetFallbackEncoding(wxFontEncoding enc);
|
||||||
|
static void DisableFallbackEncoding()
|
||||||
|
{
|
||||||
|
SetFallbackEncoding(wxFONTENCODING_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
virtual ~wxConvAuto() { if ( m_conv && m_ownsConv ) delete m_conv; }
|
|
||||||
|
|
||||||
// override the base class virtual function(s) to use our m_conv
|
// override the base class virtual function(s) to use our m_conv
|
||||||
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
||||||
@@ -57,8 +84,8 @@ private:
|
|||||||
// return the BOM type of this buffer
|
// return the BOM type of this buffer
|
||||||
static BOMType DetectBOM(const char *src, size_t srcLen);
|
static BOMType DetectBOM(const char *src, size_t srcLen);
|
||||||
|
|
||||||
// initialize m_conv with the conversion to use by default (UTF-8)
|
// initialize m_conv with the UTF-8 conversion
|
||||||
void InitWithDefault()
|
void InitWithUTF8()
|
||||||
{
|
{
|
||||||
m_conv = &wxConvUTF8;
|
m_conv = &wxConvUTF8;
|
||||||
m_ownsConv = false;
|
m_ownsConv = false;
|
||||||
@@ -76,10 +103,17 @@ private:
|
|||||||
void SkipBOM(const char **src, size_t *len) const;
|
void SkipBOM(const char **src, size_t *len) const;
|
||||||
|
|
||||||
|
|
||||||
|
// fall-back multibyte encoding to use, may be wxFONTENCODING_SYSTEM or
|
||||||
|
// wxFONTENCODING_MAX but not wxFONTENCODING_DEFAULT
|
||||||
|
static wxFontEncoding ms_defaultMBEncoding;
|
||||||
|
|
||||||
// conversion object which we really use, NULL until the first call to
|
// conversion object which we really use, NULL until the first call to
|
||||||
// either ToWChar() or FromWChar()
|
// either ToWChar() or FromWChar()
|
||||||
wxMBConv *m_conv;
|
wxMBConv *m_conv;
|
||||||
|
|
||||||
|
// the multibyte encoding to use by default if input isn't Unicode
|
||||||
|
wxFontEncoding m_encDefault;
|
||||||
|
|
||||||
// our BOM type
|
// our BOM type
|
||||||
BOMType m_bomType;
|
BOMType m_bomType;
|
||||||
|
|
||||||
|
@@ -30,10 +30,25 @@
|
|||||||
|
|
||||||
#include "wx/convauto.h"
|
#include "wx/convauto.h"
|
||||||
|
|
||||||
|
// we use latin1 by default as it seems the least bad choice: the files we need
|
||||||
|
// to detect the encoding of don't always come from the user system (they are often
|
||||||
|
// received from other machines) and so using wxFONTENCODING_SYSTEM doesn't
|
||||||
|
// seem to be a good idea and there is no other reasonable alternative
|
||||||
|
wxFontEncoding wxConvAuto::ms_defaultMBEncoding = wxFONTENCODING_ISO8859_1;
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// implementation
|
// implementation
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
|
/* static */
|
||||||
|
void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc)
|
||||||
|
{
|
||||||
|
wxASSERT_MSG( enc != wxFONTENCODING_DEFAULT,
|
||||||
|
_T("wxFONTENCODING_DEFAULT doesn't make sense here") );
|
||||||
|
|
||||||
|
ms_defaultMBEncoding = enc;
|
||||||
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen)
|
||||||
{
|
{
|
||||||
@@ -118,8 +133,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case BOM_UTF8:
|
case BOM_UTF8:
|
||||||
m_conv = &wxConvUTF8;
|
InitWithUTF8();
|
||||||
m_ownsConv = false;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
@@ -127,7 +141,7 @@ void wxConvAuto::InitFromBOM(BOMType bomType)
|
|||||||
// fall through: still need to create something
|
// fall through: still need to create something
|
||||||
|
|
||||||
case BOM_None:
|
case BOM_None:
|
||||||
InitWithDefault();
|
InitWithUTF8();
|
||||||
m_consumedBOM = true; // as there is nothing to consume
|
m_consumedBOM = true; // as there is nothing to consume
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -194,7 +208,27 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
|||||||
SkipBOM(&src, &srcLen);
|
SkipBOM(&src, &srcLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
return m_conv->ToWChar(dst, dstLen, src, srcLen);
|
// try to convert using the auto-detected encoding
|
||||||
|
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||||
|
if ( rc == wxCONV_FAILED && m_bomType == BOM_None )
|
||||||
|
{
|
||||||
|
// if the conversion failed but we didn't really detect anything and
|
||||||
|
// simply tried UTF-8 by default, retry it using the fall-back
|
||||||
|
if ( m_encDefault != wxFONTENCODING_MAX )
|
||||||
|
{
|
||||||
|
if ( m_ownsConv )
|
||||||
|
delete m_conv;
|
||||||
|
|
||||||
|
self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
|
||||||
|
? GetFallbackEncoding()
|
||||||
|
: m_encDefault);
|
||||||
|
self->m_ownsConv = true;
|
||||||
|
|
||||||
|
rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
size_t
|
||||||
@@ -204,7 +238,7 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
|
|||||||
if ( !m_conv )
|
if ( !m_conv )
|
||||||
{
|
{
|
||||||
// default to UTF-8 for the multibyte output
|
// default to UTF-8 for the multibyte output
|
||||||
wx_const_cast(wxConvAuto *, this)->InitWithDefault();
|
wx_const_cast(wxConvAuto *, this)->InitWithUTF8();
|
||||||
}
|
}
|
||||||
|
|
||||||
return m_conv->FromWChar(dst, dstLen, src, srcLen);
|
return m_conv->FromWChar(dst, dstLen, src, srcLen);
|
||||||
|
Reference in New Issue
Block a user