Files
wxWidgets/src/common/strconv.cpp
Vadim Zeitlin 46ea3cb8c0 Refactor: merge decode_utf16() into wxDecodeSurrogate()
No real changes, but just get rid of two functions doing the same thing
but using (semantically) different API, this was just too confusing.

Change all the code to use wxDecodeSurrogate() that encapsulates
decoding the surrogate and advancing the input pointer as needed and so
is less error-prone.

More generally, change the code to use end pointers instead of
decrementing the length to check for the end condition: this is more
clear, simpler and probably even more efficient.
2017-11-09 23:49:53 +01:00

3319 lines
96 KiB
C++

/////////////////////////////////////////////////////////////////////////////
// Name: src/common/strconv.cpp
// Purpose: Unicode conversion classes
// Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
// Ryan Norton, Fredrik Roubert (UTF7)
// Modified by:
// Created: 29/01/98
// Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
// (c) 2000-2003 Vadim Zeitlin
// (c) 2004 Ryan Norton, Fredrik Roubert
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
// For compilers that support precompilation, includes "wx.h".
#include "wx/wxprec.h"
#ifdef __BORLANDC__
#pragma hdrstop
#endif //__BORLANDC__
#ifndef WX_PRECOMP
#include "wx/intl.h"
#include "wx/log.h"
#include "wx/utils.h"
#include "wx/hashmap.h"
#endif
#include "wx/strconv.h"
#include <errno.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#if defined(__WIN32__)
#include "wx/msw/private.h"
#include "wx/msw/missing.h"
#define wxHAVE_WIN32_MB2WC
#endif
#ifdef HAVE_ICONV
#include <iconv.h>
#include "wx/thread.h"
#endif
#include "wx/encconv.h"
#include "wx/fontmap.h"
#ifdef __DARWIN__
#include "wx/osx/core/private/strconv_cf.h"
#endif //def __DARWIN__
#define TRACE_STRCONV wxT("strconv")
// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
// be 4 bytes
#if SIZEOF_WCHAR_T == 2
#define WC_UTF16
#endif
// ============================================================================
// implementation
// ============================================================================
// helper function of cMB2WC(): check if n bytes at this location are all NUL
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (which is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const stop = p + n; p != stop; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
// ----------------------------------------------------------------------------
// UTF-16 en/decoding to/from UCS-4 with surrogates handling
// ----------------------------------------------------------------------------
// Encodes the UCS-4 value "input" as UTF-16.
//
// Returns the number of 16 bit units making up the encoding: 1 for a BMP
// character, 2 for a supplementary one (encoded as a surrogate pair), or
// wxCONV_FAILED if the value is neither. If "output" is not NULL, the units
// are also stored there, so it must have room for up to 2 of them.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
{
    if ( wxUniChar::IsBMP(input) )
    {
        if ( output )
            output[0] = (wxUint16) input;

        return 1;
    }

    if ( !wxUniChar::IsSupplementary(input) )
        return wxCONV_FAILED;

    if ( output )
    {
        output[0] = wxUniChar::HighSurrogate(input);
        output[1] = wxUniChar::LowSurrogate(input);
    }

    return 2;
}
// Returns the next UTF-32 character from the wchar_t buffer terminated by the
// "end" pointer (the caller must ensure that on input "*pSrc < end") and
// advances the pointer to the character after this one.
//
// "end" may be NULL, in which case the end of input check below never
// triggers.
//
// If an invalid or incomplete character is found, i.e. a surrogate which is
// not part of a well-formed high/low pair, *pSrc is set to NULL and 0 is
// returned: the caller must check for this.
static wxUint32 wxDecodeSurrogate(const wxChar16 **pSrc, const wxChar16* end)
{
    const wxChar16*& src = *pSrc;

    // Is this a BMP character?
    const wxUint16 u = *src++;
    if ( (u < 0xd800) || (u > 0xdfff) )
    {
        // Yes, just return it.
        return u;
    }

    // No, it is a surrogate, but only a high one (0xd800..0xdbff) may start a
    // pair: accepting a low surrogate here would make the computation below
    // yield a value beyond the last valid code point U+10FFFF.
    if ( u >= 0xdc00 )
    {
        src = NULL;
        return 0;
    }

    // We have the first half of a surrogate, check if we also have the second
    // half (notice that this check does nothing if end == NULL, as it is
    // allowed to be, and this is correct).
    if ( src == end )
    {
        // No, we don't because this is the end of input.
        src = NULL;
        return 0;
    }

    const wxUint16 u2 = *src++;
    if ( (u2 < 0xdc00) || (u2 > 0xdfff) )
    {
        // No, it's not in the low surrogate range.
        src = NULL;
        return 0;
    }

    // Yes, decode it and return the corresponding Unicode character: 0xd7c0
    // is 0xd800 - (0x10000 >> 10), i.e. subtracting it both removes the high
    // surrogate base and adds the 0x10000 offset of supplementary planes.
    return ((u - 0xd7c0) << 10) + (u2 - 0xdc00);
}
// ----------------------------------------------------------------------------
// wxMBConv
// ----------------------------------------------------------------------------
// Default implementation of ToWChar() expressed in terms of the legacy
// MB2WC() function.
//
// dst/dstLen describe the output buffer; if dst is NULL nothing is written and
// only the number of wide characters which would be produced is computed.
// src/srcLen describe the input; srcLen may be wxNO_LEN, in which case the
// input is assumed to be NUL-terminated and a single chunk is converted.
//
// Returns the accumulated count of wide characters or wxCONV_FAILED if
// GetMBNulLen() or MB2WC() fails or if dst is not NULL and the result does not
// fit into dstLen characters.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
// although new conversion classes are supposed to implement this function
// directly, the existing ones only implement the old MB2WC() and so, to
// avoid having to rewrite all conversion classes at once, we provide a
// default (but not efficient) implementation of this one in terms of the
// old function by copying the input to ensure that it's NUL-terminated and
// then using MB2WC() to convert it
//
// moreover, some conversion classes simply can't implement ToWChar()
// directly, the primary example is wxConvLibc: mbstowcs() only handles
// NUL-terminated strings
// the number of chars [which would be] written to dst [if it were not NULL]
size_t dstWritten = 0;
// the number of NULs terminating this string
size_t nulLen = 0; // not really needed, but just to avoid warnings
// if we were not given the input size we just have to assume that the
// string is properly terminated as we have no way of knowing how long it
// is anyhow, but if we do have the size check whether there are enough
// NULs at the end
wxCharBuffer bufTmp;
const char *srcEnd;
if ( srcLen != wxNO_LEN )
{
// we need to know how to find the end of this string
nulLen = GetMBNulLen();
if ( nulLen == wxCONV_FAILED )
return wxCONV_FAILED;
// if there are enough NULs we can avoid the copy
if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
{
// make a copy in order to properly NUL-terminate the string
bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
char * const p = bufTmp.data();
memcpy(p, src, srcLen);
// append nulLen NUL bytes after the copied data
for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
*s = '\0';
src = bufTmp;
}
srcEnd = src + srcLen;
}
else // quit after the first loop iteration
{
srcEnd = NULL;
}
// the idea of this code is straightforward: it converts a NUL-terminated
// chunk of the string during each iteration and updates the output buffer
// with the result
//
// all the complications come from the fact that this function, for
// historical reasons, must behave in 2 subtly different ways when it's
// called with a fixed number of characters and when it's called for the
// entire NUL-terminated string: in the former case (srcEnd != NULL) we
// must count all characters we convert, NUL or not; but in the latter we
// do not count the trailing NUL -- but still count all the NULs inside the
// string
//
// so for the (simple) former case we just always count the trailing NUL,
// but for the latter we need to wait until we see if there is going to be
// another loop iteration and only count it then
//
// NOTE(review): the code below adds 1 to dstWritten when srcEnd is NULL,
// i.e. for the NUL-terminated input, so the description above may be out
// of date -- confirm against the documented contract of ToWChar()
for ( ;; )
{
// try to convert the current chunk
size_t lenChunk = MB2WC(NULL, src, 0);
if ( lenChunk == wxCONV_FAILED )
return wxCONV_FAILED;
dstWritten += lenChunk;
if ( !srcEnd )
dstWritten++;
if ( dst )
{
if ( dstWritten > dstLen )
return wxCONV_FAILED;
// +1 is for trailing NUL
if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
return wxCONV_FAILED;
dst += lenChunk;
if ( !srcEnd )
dst++;
}
if ( !srcEnd )
{
// we convert just one chunk in this case as this is the entire
// string anyhow (and we don't count the trailing NUL in this case)
break;
}
// advance the input pointer past the end of this chunk: notice that we
// will always stop before srcEnd because we know that the chunk is
// always properly NUL-terminated
while ( NotAllNULs(src, nulLen) )
{
// notice that we must skip over multiple bytes here as we suppose
// that if NUL takes 2 or 4 bytes, then all the other characters do
// too and so if advanced by a single byte we might erroneously
// detect sequences of NUL bytes in the middle of the input
src += nulLen;
}
// if the buffer ends before this NUL, we shouldn't count it in our
// output so skip the code below
if ( src == srcEnd )
break;
// do count this terminator as it's inside the buffer we convert
dstWritten++;
if ( dst )
dst++;
src += nulLen; // skip the terminator itself
if ( src >= srcEnd )
break;
}
return dstWritten;
}
// Default implementation of FromWChar() expressed in terms of the legacy
// WC2MB() function.
//
// dst/dstLen describe the output buffer; if dst is NULL nothing is written and
// only the number of bytes which would be produced is computed. src/srcLen
// describe the input; srcLen may be wxNO_LEN, in which case the input is
// assumed to be NUL-terminated.
//
// The input is converted one NUL-terminated chunk at a time. Returns the
// accumulated number of bytes, including GetMBNulLen() bytes for each
// terminator which is part of the input, or wxCONV_FAILED if WC2MB() fails or
// if dst is not NULL and the result does not fit into dstLen bytes.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
// the number of chars [which would be] written to dst [if it were not NULL]
size_t dstWritten = 0;
// if we don't know its length we have no choice but to assume that it is
// NUL-terminated (notice that it can still be NUL-terminated even if
// explicit length is given but it doesn't change our return value)
const bool isNulTerminated = srcLen == wxNO_LEN;
// make a copy of the input string unless it is already properly
// NUL-terminated
wxWCharBuffer bufTmp;
if ( isNulTerminated )
{
// include the terminating NUL in the range to convert
srcLen = wxWcslen(src) + 1;
}
else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
{
// make a copy in order to properly NUL-terminate the string
bufTmp = wxWCharBuffer(srcLen);
memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
src = bufTmp;
}
const size_t lenNul = GetMBNulLen();
for ( const wchar_t * const srcEnd = src + srcLen;
src < srcEnd;
src++ /* skip L'\0' too */ )
{
// try to convert the current chunk
size_t lenChunk = WC2MB(NULL, src, 0);
if ( lenChunk == wxCONV_FAILED )
return wxCONV_FAILED;
dstWritten += lenChunk;
// position of the NUL ending the current chunk
const wchar_t * const
chunkEnd = isNulTerminated ? srcEnd - 1 : src + wxWcslen(src);
// our return value accounts for the trailing NUL(s), unlike that of
// WC2MB(), however don't do it for the last NUL we artificially added
// ourselves above
if ( chunkEnd < srcEnd )
dstWritten += lenNul;
if ( dst )
{
if ( dstWritten > dstLen )
return wxCONV_FAILED;
// if we know that there is enough space in the destination buffer
// (because we accounted for lenNul in dstWritten above), we can
// convert directly in place -- but otherwise we need another
// temporary buffer to ensure that we don't overwrite the output
wxCharBuffer dstBuf;
char *dstTmp;
if ( chunkEnd == srcEnd )
{
dstBuf = wxCharBuffer(lenChunk + lenNul - 1);
dstTmp = dstBuf.data();
}
else
{
dstTmp = dst;
}
if ( WC2MB(dstTmp, src, lenChunk + lenNul) == wxCONV_FAILED )
return wxCONV_FAILED;
if ( dstTmp != dst )
{
// copy everything up to but excluding the terminating NUL(s)
// into the real output buffer
memcpy(dst, dstTmp, lenChunk);
// micro-optimization: if dstTmp != dst it means that chunkEnd
// == srcEnd and so we're done, no need to update anything below
break;
}
dst += lenChunk;
if ( chunkEnd < srcEnd )
dst += lenNul;
}
// continue after this chunk: the loop increment skips its terminator
src = chunkEnd;
}
return dstWritten;
}
// Legacy conversion function implemented using ToWChar(): converts the
// NUL-terminated string inBuff and returns the result of ToWChar() less one,
// as ToWChar() counts the trailing NUL in its return value while this method
// does not. wxCONV_FAILED is passed through unchanged.
size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
{
    const size_t lenWithNul = ToWChar(outBuff, outLen, inBuff);
    if ( lenWithNul == wxCONV_FAILED )
        return wxCONV_FAILED;

    return lenWithNul - 1;
}
// Legacy conversion function implemented using FromWChar(): converts the
// NUL-terminated string inBuff and returns the result of FromWChar() less the
// length of the terminating NUL as given by GetMBNulLen(). wxCONV_FAILED is
// passed through unchanged.
size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
{
    const size_t lenWithNul = FromWChar(outBuff, outLen, inBuff);
    if ( lenWithNul == wxCONV_FAILED )
        return wxCONV_FAILED;

    return lenWithNul - GetMBNulLen();
}
// Converts inLen bytes of inBuff (or the entire NUL-terminated string if
// inLen is wxNO_LEN) to a wide character buffer.
//
// If outLen is not NULL, it is filled with the length of the result: this is
// the value returned by ToWChar(), less one if inLen is wxNO_LEN. On failure
// an empty buffer is returned and *outLen is set to 0.
wxWCharBuffer
wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
{
    // first call only computes the required size
    const size_t lenNeeded = ToWChar(NULL, 0, inBuff, inLen);
    if ( lenNeeded != wxCONV_FAILED )
    {
        // the buffer is constructed with the length lenNeeded so that the
        // result can always be NUL-terminated, even if the input isn't (as
        // otherwise the caller has no way to know its length)
        wxWCharBuffer result(lenNeeded);
        if ( ToWChar(result.data(), lenNeeded, inBuff, inLen) != wxCONV_FAILED )
        {
            if ( outLen )
            {
                // for NUL-terminated input the reported length excludes the
                // trailing NUL, but when converting an explicit number of
                // characters the full output length is reported, even if it
                // happens to be NUL-terminated
                *outLen = inLen == wxNO_LEN ? lenNeeded - 1 : lenNeeded;
            }

            return result;
        }
    }

    // either of the ToWChar() calls failed
    if ( outLen )
        *outLen = 0;

    return wxWCharBuffer();
}
// Converts inLen wide characters of inBuff (or the entire NUL-terminated
// string if inLen is wxNO_LEN) to a multibyte buffer.
//
// If outLen is not NULL, it is filled with the length of the result: this is
// the value returned by the second FromWChar() call, less GetMBNulLen() if
// inLen is wxNO_LEN. On failure an empty buffer is returned and *outLen is set
// to 0.
wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
{
    // first call only computes the (possibly overestimated) size
    const size_t lenEstimated = FromWChar(NULL, 0, inBuff, inLen);
    if ( lenEstimated != wxCONV_FAILED )
    {
        const size_t nulLen = GetMBNulLen();

        // as in cMB2WC(), make sure the buffer is always NUL-terminated,
        // even if the input is not
        wxCharBuffer result(lenEstimated + nulLen - 1);
        memset(result.data() + lenEstimated, 0, nulLen);

        // The value returned now may differ from the estimate above as the
        // first call could have overestimated the space needed, while this
        // one gives the exact length.
        const size_t lenActual =
            FromWChar(result.data(), lenEstimated, inBuff, inLen);
        if ( lenActual != wxCONV_FAILED )
        {
            if ( outLen )
            {
                // if both input and output are NUL-terminated, the
                // terminator is not supposed to be counted
                *outLen = inLen == wxNO_LEN ? lenActual - nulLen : lenActual;
            }

            return result;
        }
    }

    // either of the FromWChar() calls failed
    if ( outLen )
        *outLen = 0;

    return wxCharBuffer();
}
// Converts srcLen bytes of buf (or the entire NUL-terminated string if srcLen
// is wxNO_LEN) to a wide character buffer, returning an empty buffer if there
// is nothing to convert or if the conversion fails.
wxWCharBuffer wxMBConv::DoConvertMB2WC(const char* buf, size_t srcLen) const
{
    // Converting NULL pointer should work, i.e. return an empty buffer
    // instead of crashing, so both the length and the pointer must be checked:
    // the length is wxNO_LEN if it's a raw pointer which doesn't come from
    // wxScopedCharBuffer.
    if ( !srcLen || !buf )
        return wxWCharBuffer();

    const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
    if ( dstLen == wxCONV_FAILED )
        return wxWCharBuffer();

    wxWCharBuffer wbuf(dstLen);
    wbuf.data()[dstLen] = L'\0';

    if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) == wxCONV_FAILED )
        return wxWCharBuffer();

    // For the NUL-terminated input the trailing NUL shouldn't be included in
    // the length of the returned buffer.
    if ( srcLen == wxNO_LEN )
        wbuf.shrink(dstLen - 1);

    return wbuf;
}
// Converts srcLen wide characters of wbuf (or the entire NUL-terminated string
// if srcLen is wxNO_LEN) to a multibyte buffer, returning an empty buffer if
// there is nothing to convert or if the conversion fails.
wxCharBuffer wxMBConv::DoConvertWC2MB(const wchar_t* wbuf, size_t srcLen) const
{
    // nothing to do for a NULL pointer or zero length
    if ( !srcLen || !wbuf )
        return wxCharBuffer();

    const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
    if ( dstLen == wxCONV_FAILED )
        return wxCharBuffer();

    wxCharBuffer buf(dstLen);
    buf.data()[dstLen] = '\0';

    if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) == wxCONV_FAILED )
        return wxCharBuffer();

    // For the NUL-terminated input the trailing NUL, whose length is variable
    // here, shouldn't be included in the length of the returned buffer.
    if ( srcLen == wxNO_LEN )
        buf.shrink(dstLen - GetMBNulLen());

    return buf;
}
// ----------------------------------------------------------------------------
// wxMBConvLibc
// ----------------------------------------------------------------------------
// Simply forwards all the arguments to wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    const size_t rc = wxMB2WC(buf, psz, n);

    return rc;
}
// Simply forwards all the arguments to wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    const size_t rc = wxWC2MB(buf, psz, n);

    return rc;
}
// ----------------------------------------------------------------------------
// wxConvBrokenFileNames
// ----------------------------------------------------------------------------
#ifdef __UNIX__
// Creates the underlying conversion object for the given charset: a UTF-8
// converter mapping invalid sequences to the PUA if the charset name is
// "UTF-8" or "UTF8" (compared case-insensitively) and a wxCSConv for the
// charset otherwise.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
{
    const bool isUTF8 = wxStricmp(charset, wxT("UTF-8")) == 0 ||
                        wxStricmp(charset, wxT("UTF8")) == 0;

    if ( !isUTF8 )
    {
        m_conv = new wxCSConv(charset);
        return;
    }

    m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
}
#endif // __UNIX__
// ----------------------------------------------------------------------------
// UTF-7
// ----------------------------------------------------------------------------
// Implementation (C) 2004 Fredrik Roubert
//
// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
//
// BASE64 decoding table
//
// Indexed by the byte value (0..255): gives the 6 bit value of the bytes
// which are valid base64 digits ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') and
// 0xff for all the other ones, which the UTF-7 decoder treats as the end of
// the encoded part.
static const unsigned char utf7unb64[] =
{
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
// Decodes UTF-7 input into wide characters.
//
// dst/dstLen describe the output buffer; if dst is NULL nothing is written and
// only the number of wide characters which would be produced is counted.
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string, including its
// terminator, is converted using a fresh local decoder state. Otherwise srcLen
// bytes are converted starting from the shift state stored in m_stateDecoder,
// which is updated so that the input may be decoded in several calls.
//
// Returns the number of wide characters decoded or wxCONV_FAILED if the input
// is invalid or if no characters at all were produced; in the latter case,
// and on some of the error paths, the decoder state is restored to its value
// on entry.
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
{
    DecoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // convert the entire string, up to and including the trailing NUL
        srcLen = strlen(src) + 1;

        // when working on the entire strings we don't update nor use the shift
        // state from the previous call
        statePtr = &stateOrig;
    }
    else // when working with partial strings we do use the shift state
    {
        statePtr = const_cast<DecoderState *>(&m_stateDecoder);

        // also save the old state to be able to rollback to it on error
        stateOrig = m_stateDecoder;
    }

    // but to simplify the code below we use this variable in both cases
    DecoderState& state = *statePtr;

    // number of characters [which would have been] written to dst [if it were
    // not NULL]
    size_t len = 0;

    const char * const srcEnd = src + srcLen;

    while ( (src < srcEnd) && (!dst || (len < dstLen)) )
    {
        const unsigned char cc = *src++;

        if ( state.IsShifted() )
        {
            const unsigned char dc = utf7unb64[cc];
            if ( dc == 0xff )
            {
                // end of encoded part, check that nothing was left: there can
                // be up to 4 bits of 0 padding but nothing else (we also need
                // to check isLSB as we count bits modulo 8 while a valid UTF-7
                // encoded sequence must contain an integral number of UTF-16
                // characters)
                if ( state.isLSB || state.bit > 4 ||
                        (state.accum & ((1 << state.bit) - 1)) )
                {
                    if ( !len )
                        state = stateOrig;

                    return wxCONV_FAILED;
                }

                state.ToDirect();

                // re-parse this character normally below unless it's '-' which
                // is consumed by the decoder
                if ( cc == '-' )
                    continue;
            }
            else // valid encoded character
            {
                // mini base64 decoder: each character is 6 bits
                state.bit += 6;
                state.accum <<= 6;
                state.accum += dc;

                if ( state.bit >= 8 )
                {
                    // got the full byte, consume it
                    state.bit -= 8;
                    unsigned char b = (state.accum >> state.bit) & 0x00ff;

                    if ( state.isLSB )
                    {
                        // we've got the full word, output it
                        if ( dst )
                            *dst++ = (state.msb << 8) | b;
                        len++;
                        state.isLSB = false;
                    }
                    else // MSB
                    {
                        // just store it while we wait for LSB
                        state.msb = b;
                        state.isLSB = true;
                    }
                }
            }
        }

        if ( state.IsDirect() )
        {
            // start of an encoded segment?
            if ( cc == '+' )
            {
                // Can't end with a plus sign.
                if ( src == srcEnd )
                    return wxCONV_FAILED;

                if ( *src == '-' )
                {
                    // just the encoded plus sign, don't switch to shifted mode
                    if ( dst )
                        *dst++ = '+';
                    len++;
                    src++;
                }
                // the byte must be converted to unsigned char before being
                // used as index: plain char may be signed and converting a
                // negative value directly to unsigned would result in a huge
                // index well outside of the 256 element table
                else if ( utf7unb64[static_cast<unsigned char>(*src)] == 0xff )
                {
                    // empty encoded chunks are not allowed
                    if ( !len )
                        state = stateOrig;

                    return wxCONV_FAILED;
                }
                else // base-64 encoded chunk follows
                {
                    state.ToShifted();
                }
            }
            else // not '+'
            {
                // only printable 7 bit ASCII characters (with the exception of
                // NUL, TAB, CR and LF) can be used directly
                if ( cc >= 0x7f || (cc < ' ' &&
                      !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
                    return wxCONV_FAILED;

                if ( dst )
                    *dst++ = cc;
                len++;
            }
        }
    }

    if ( !len )
    {
        // as we didn't read any characters we should be called with the same
        // data (followed by some more new data) again later so don't save our
        // state
        state = stateOrig;

        return wxCONV_FAILED;
    }

    return len;
}
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63): gives the character used to represent it
// in the encoded parts of UTF-7 output.
static const unsigned char utf7enb64[] =
{
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
'w', 'x', 'y', 'z', '0', '1', '2', '3',
'4', '5', '6', '7', '8', '9', '+', '/'
};
//
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Indexed by the 7 bit ASCII code (0..127), each element is one of the
// character classes 0 to 3.
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the character can be output directly, i.e. without base64
// encoding, in UTF-7: this is the case only for the 7 bit characters whose
// class in the table above is 0.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so convert the value to an
    // unsigned type before the range check: otherwise a negative value would
    // pass the "< 0x80" test and be used as a negative index into the table
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
// Encodes wide characters as UTF-7.
//
// dst/dstLen describe the output buffer; if dst is NULL nothing is written and
// only the number of bytes which would be produced is counted (and the encoder
// state is restored to its original value before returning).
//
// If srcLen is wxNO_LEN, the entire NUL-terminated string, including its
// terminator, is converted using a fresh local encoder state. Otherwise srcLen
// characters are converted starting from the state stored in m_stateEncoder.
//
// Returns the number of bytes produced or, if wchar_t is not UTF-16 and a
// character above 0xffff is encountered, wxCONV_FAILED.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
EncoderState stateOrig,
*statePtr;
if ( srcLen == wxNO_LEN )
{
// we don't apply the stored state when operating on entire strings at
// once
statePtr = &stateOrig;
srcLen = wxWcslen(src) + 1;
}
else // do use the mode we left the output in previously
{
stateOrig = m_stateEncoder;
statePtr = const_cast<EncoderState *>(&m_stateEncoder);
}
EncoderState& state = *statePtr;
// number of bytes [which would have been] written to dst
size_t len = 0;
const wchar_t * const srcEnd = src + srcLen;
while ( src < srcEnd && (!dst || len < dstLen) )
{
wchar_t cc = *src++;
if ( wxIsUTF7Direct(cc) )
{
if ( state.IsShifted() )
{
// pad with zeros the last encoded block if necessary
if ( state.bit )
{
if ( dst )
*dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
len++;
}
state.ToDirect();
// '-' explicitly terminates the encoded part
if ( dst )
*dst++ = '-';
len++;
}
if ( dst )
*dst++ = (char)cc;
len++;
}
else if ( cc == '+' && state.IsDirect() )
{
// a literal plus sign is written as "+-"
if ( dst )
{
*dst++ = '+';
*dst++ = '-';
}
len += 2;
}
#ifndef WC_UTF16
else if (((wxUint32)cc) > 0xffff)
{
// no surrogate pair generation (yet?)
return wxCONV_FAILED;
}
#endif
else
{
if ( state.IsDirect() )
{
// '+' starts the encoded part
state.ToShifted();
if ( dst )
*dst++ = '+';
len++;
}
// BASE64 encode string
for ( ;; )
{
// feed the character into the accumulator, most significant byte
// first, and output every complete group of 6 bits
for ( unsigned lsb = 0; lsb < 2; lsb++ )
{
state.accum <<= 8;
state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
for (state.bit += 8; state.bit >= 6; )
{
state.bit -= 6;
if ( dst )
*dst++ = utf7enb64[(state.accum >> state.bit) % 64];
len++;
}
}
// continue encoding until the end of input or the next character
// which can be output directly (it is left for the outer loop)
if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
break;
src++;
}
}
}
// we need to restore the original encoder state if we were called just to
// calculate the amount of space needed as we will presumably be called
// again to really convert the data now
if ( !dst )
state = stateOrig;
return len;
}
// ----------------------------------------------------------------------------
// UTF-8
// ----------------------------------------------------------------------------
// utf8_max[i] is the largest value encoded using i + 1 bytes by the lenient
// UTF-8 conversion code in this file; the last element is an upper bound for
// any 32 bit value
static const wxUint32 utf8_max[]=
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// boundaries of the private use area we use to (temporarily) remap
// characters invalid in a UTF-8 encoded string: there is one value in this
// range for each of the 256 possible byte values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
// this table gives the length of the UTF-8 encoding from its first character:
// Indexed by the value of the first byte of a sequence; 0 means that the byte
// can't start a valid sequence.
const unsigned char tableUtf8Lengths[256] = {
// single-byte sequences (ASCII):
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
// these are invalid:
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
0, 0, // C0,C1
// two-byte sequences:
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
// three-byte sequences:
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
// four-byte sequences:
4, 4, 4, 4, 4, // F0..F4
// these are invalid again (5- or 6-byte
// sequences and sequences for code points
// above U+10FFFF, as restricted by RFC 3629):
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
};
// Decodes UTF-8 input into wide characters, failing on any invalid sequence.
//
// dst/dstLen describe the output buffer; if dstLen is 0 nothing is written and
// only the number of wide characters which would be produced is counted.
// If srcLen is wxNO_LEN, the entire NUL-terminated string, including its
// terminator, is converted, otherwise exactly srcLen bytes are.
//
// Returns the number of wide characters or wxCONV_FAILED if the input contains
// an invalid lead or continuation byte, a sequence truncated by the end of
// input or encoding a value above U+10FFFF, or if the output buffer is too
// small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( (srcLen == wxNO_LEN ? !*p : !srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // reserve the space for the next output character
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            // Char. number range   |        UTF-8 octet sequence
            //    (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            len--; // it's more convenient to work with 0-based length here

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

        // the lead byte 0xF4 can be followed by bytes resulting in a value
        // beyond the last Unicode code point, such sequences are invalid (and
        // couldn't be represented in UTF-16 anyhow)
        if ( code > 0x10FFFF )
            break;

#ifdef WC_UTF16
        // a supplementary character is stored as 2 UTF-16 units but the space
        // for only one of them was reserved above, so check that there is
        // enough room for the second one before writing anything
        if ( out && code > 0xFFFF && !dstLen-- )
            break;

        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if an error occurs or if the output buffer is too small
    return wxCONV_FAILED;
}
// Encodes wide characters as UTF-8.
//
// dst/dstLen describe the output buffer; if dstLen is 0 nothing is written and
// only the number of bytes which would be produced is counted.
// If srcLen is wxNO_LEN, the input is NUL-terminated and the terminating NUL
// is output and counted too, otherwise exactly srcLen characters are
// converted.
//
// Returns the number of bytes or wxCONV_FAILED if the input contains an
// invalid surrogate (when wchar_t is UTF-16) or a value above U+10FFFF, or if
// the output buffer is too small.
size_t
wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
                              const wchar_t *src, size_t srcLen) const
{
    char *out = dstLen ? dst : NULL;
    size_t written = 0;

    const wchar_t* const end = srcLen == wxNO_LEN ? NULL : src + srcLen;
    for ( const wchar_t *wp = src; ; )
    {
        if ( end ? wp == end : !*wp )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = '\0';
                }

                written++;
            }

            return written;
        }

        wxUint32 code;
#ifdef WC_UTF16
        code = wxDecodeSurrogate(&wp, end);
        if ( !wp )
            return wxCONV_FAILED;
#else // wchar_t is UTF-32
        code = *wp++ & 0x7fffffff;
#endif

        unsigned len;
        if ( code <= 0x7F )
        {
            len = 1;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[0] = (char)code;
            }
        }
        else if ( code <= 0x07FF )
        {
            len = 2;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                // NB: this line takes 6 least significant bits, encodes them as
                // 10xxxxxx and discards them so that the next byte can be encoded:
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xC0 | code;
            }
        }
        // notice that the upper bound must be inclusive here: U+FFFF is the
        // last character encoded using 3 bytes and using the 4 byte form
        // below for it would produce an invalid overlong sequence
        else if ( code <= 0xFFFF )
        {
            len = 3;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xE0 | code;
            }
        }
        else if ( code <= 0x10FFFF )
        {
            len = 4;
            if ( out )
            {
                if ( dstLen < len )
                    break;

                out[3] = 0x80 | (code & 0x3F);  code >>= 6;
                out[2] = 0x80 | (code & 0x3F);  code >>= 6;
                out[1] = 0x80 | (code & 0x3F);  code >>= 6;
                out[0] = 0xF0 | code;
            }
        }
        else
        {
            wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
            break;
        }

        if ( out )
        {
            out += len;
            dstLen -= len;
        }

        written += len;
    }

    // we only get here if an error occurs during encoding
    return wxCONV_FAILED;
}
// Decodes UTF-8 input into wide characters, optionally mapping the bytes of
// invalid sequences to something else instead of failing.
//
// If m_options is MAP_INVALID_UTF8_NOT, this just forwards to the strict
// base class version. Otherwise each byte of an invalid sequence is either
// mapped to a character in the wxUnicodePUA range (MAP_INVALID_UTF8_TO_PUA)
// or written as a backslash followed by 3 octal digits
// (MAP_INVALID_UTF8_TO_OCTAL), in which case a backslash in the input is
// doubled in the output.
//
// buf/n describe the output buffer; if buf is NULL nothing is written and only
// the length is computed. If srcLen is wxNO_LEN the input is NUL-terminated
// and the terminating NUL is counted in the return value.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
const char *psz, size_t srcLen) const
{
if ( m_options == MAP_INVALID_UTF8_NOT )
return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
size_t len = 0;
// The length can be either given explicitly or computed implicitly for the
// NUL-terminated strings.
const bool isNulTerminated = srcLen == wxNO_LEN;
while ((isNulTerminated ? *psz : srcLen--) && ((!buf) || (len < n)))
{
// remember the start of this sequence to be able to remap its bytes if
// it turns out to be invalid
const char *opsz = psz;
bool invalid = false;
unsigned char cc = *psz++, fc = cc;
// count the number of leading 1 bits in the first byte
unsigned cnt;
for (cnt = 0; fc & 0x80; cnt++)
fc <<= 1;
if (!cnt)
{
// plain ASCII char
if (buf)
*buf++ = cc;
len++;
// escape the escape character for octal escapes
if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
&& cc == '\\' && (!buf || len < n))
{
if (buf)
*buf++ = cc;
len++;
}
}
else
{
// cnt is now the number of continuation bytes expected
cnt--;
if (!cnt)
{
// invalid UTF-8 sequence
invalid = true;
}
else
{
unsigned ocnt = cnt - 1;
wxUint32 res = cc & (0x3f >> cnt);
while (cnt--)
{
if (!isNulTerminated && !srcLen)
{
// invalid UTF-8 sequence ending before the end of code
// point.
invalid = true;
break;
}
cc = *psz;
if ((cc & 0xC0) != 0x80)
{
// invalid UTF-8 sequence
invalid = true;
break;
}
psz++;
if (!isNulTerminated)
srcLen--;
res = (res << 6) | (cc & 0x3f);
}
// a value which could have been encoded using fewer bytes is
// rejected as well
if (invalid || res <= utf8_max[ocnt])
{
// illegal UTF-8 encoding
invalid = true;
}
else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
res >= wxUnicodePUA && res < wxUnicodePUAEnd)
{
// if one of our PUA characters turns up externally
// it must also be treated as an illegal sequence
// (a bit like you have to escape an escape character)
invalid = true;
}
else
{
#ifdef WC_UTF16
// cast is ok because wchar_t == wxUint16 if WC_UTF16
size_t pa = encode_utf16(res, (wxUint16 *)buf);
if (pa == wxCONV_FAILED)
{
invalid = true;
}
else
{
if (buf)
buf += pa;
len += pa;
}
#else // !WC_UTF16
if (buf)
*buf++ = (wchar_t)res;
len++;
#endif // WC_UTF16/!WC_UTF16
}
}
if (invalid)
{
if (m_options & MAP_INVALID_UTF8_TO_PUA)
{
// map each byte of the invalid sequence to a PUA character
while (opsz < psz && (!buf || len < n))
{
#ifdef WC_UTF16
// cast is ok because wchar_t == wxUint16 if WC_UTF16
size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
wxASSERT(pa != wxCONV_FAILED);
if (buf)
buf += pa;
opsz++;
len += pa;
#else
if (buf)
*buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
opsz++;
len++;
#endif
}
}
else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
{
// output each byte of the invalid sequence as a backslash
// followed by its value as 3 octal digits
while (opsz < psz && (!buf || len < n))
{
// NOTE(review): if there is not enough space, nothing is
// written here but len is still increased by 4 below --
// confirm that this is intended
if ( buf && len + 3 < n )
{
unsigned char on = *opsz;
*buf++ = L'\\';
*buf++ = (wchar_t)( L'0' + on / 0100 );
*buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
*buf++ = (wchar_t)( L'0' + on % 010 );
}
opsz++;
len += 4;
}
}
else // MAP_INVALID_UTF8_NOT
{
return wxCONV_FAILED;
}
}
}
}
if ( isNulTerminated )
{
// Add the trailing NUL in this case if we have a large enough buffer.
if ( buf && (len < n) )
*buf = 0;
// And count it in any case.
len++;
}
return len;
}
// Returns true if the given character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
// Encodes wide characters as UTF-8, undoing the mapping of invalid bytes
// performed by ToWChar() of this class.
//
// If m_options is MAP_INVALID_UTF8_NOT, this just forwards to the strict
// base class version. Otherwise the characters in the wxUnicodePUA range
// (MAP_INVALID_UTF8_TO_PUA) or backslash escapes (MAP_INVALID_UTF8_TO_OCTAL)
// are converted back to the single bytes they stand for and all the other
// characters are encoded normally.
//
// buf/n describe the output buffer; if buf is NULL nothing is written and only
// the length is computed. If srcLen is wxNO_LEN the input is NUL-terminated
// and the terminating NUL is counted in the return value.
//
// Returns the length of the output or wxCONV_FAILED if the input contains an
// invalid surrogate (when wchar_t is UTF-16).
size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
                               const wchar_t *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // The length can be either given explicitly or computed implicitly for the
    // NUL-terminated strings.
    const wchar_t* const end = srcLen == wxNO_LEN ? NULL : psz + srcLen;

    while ((end ? psz < end : *psz) && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        cc = wxDecodeSurrogate(&psz, end);
        if ( !psz )
            return wxCONV_FAILED;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        // The escape sequences handled below require looking at the
        // characters following the current one, but this can only be done if
        // they are still part of the input when its length was explicitly
        // specified. For the NUL-terminated input no check is needed as the
        // tests stop at the terminating NUL at the latest.
        const bool hasNext = !end || psz < end;
        const bool hasNext3 = !end || end - psz >= 3;

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // this character stands for a single invalid byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && hasNext && psz[0] == L'\\' )
        {
            // escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' && hasNext3 &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte with the given value
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // lead byte: cnt + 1 high bits set followed by the most
                    // significant bits of the value
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // continuation bytes carry 6 bits of the value each
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if ( !end )
    {
        // Add the trailing NUL in this case if we have a large enough buffer.
        if ( buf && (len < n) )
            *buf = 0;

        // And count it in any case.
        len++;
    }

    return len;
}
// ============================================================================
// UTF-16
// ============================================================================
// The code below is written in terms of "straight" (same byte order as this
// machine) and "swap" (opposite byte order) converters: map these names to
// the BE/LE classes depending on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF16straight wxMBConvUTF16BE
#define wxMBConvUTF16swap wxMBConvUTF16LE
#else
#define wxMBConvUTF16swap wxMBConvUTF16BE
#define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
/* static */
size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
{
if ( srcLen == wxNO_LEN )
{
// count the number of bytes in input, including the trailing NULs
const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
for ( srcLen = 1; *inBuff++; srcLen++ )
;
srcLen *= BYTES_PER_CHAR;
}
else // we already have the length
{
// we can only convert an entire number of UTF-16 characters
if ( srcLen % BYTES_PER_CHAR )
return wxCONV_FAILED;
}
return srcLen;
}
// case when in-memory representation is UTF-16 too
#ifdef WC_UTF16
// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------
// Native byte order UTF-16 to UTF-16 wchar_t: this is just a copy.
//
// Returns the number of wide characters in the input or wxCONV_FAILED if the
// input length is invalid or dst is too small.
size_t
wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    // get the validated input size in bytes so that the data can be copied
    // with a single memcpy()
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    // only the required length was asked for
    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    memcpy(dst, src, numBytes);

    return numChars;
}
// UTF-16 wchar_t to native byte order UTF-16: this is just a copy.
//
// Returns the size of the output in bytes or wxCONV_FAILED if dst is too
// small.
size_t
wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    // for NUL-terminated input the terminator is converted too
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    if ( !dst )
        return numBytes;

    if ( numBytes > dstLen )
        return wxCONV_FAILED;

    memcpy(dst, src, numBytes);

    return numBytes;
}
// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
srcLen /= BYTES_PER_CHAR;
if ( dst )
{
if ( dstLen < srcLen )
return wxCONV_FAILED;
const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
for ( size_t n = 0; n < srcLen; n++, inBuff++ )
{
*dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
}
}
return srcLen;
}
size_t
wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
srcLen *= BYTES_PER_CHAR;
if ( dst )
{
if ( dstLen < srcLen )
return wxCONV_FAILED;
wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
{
*outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
}
}
return srcLen;
}
#else // !WC_UTF16: wchar_t is UTF-32
// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
const size_t inLen = srcLen / BYTES_PER_CHAR;
size_t outLen = 0;
const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
{
const wxUint32 ch = wxDecodeSurrogate(&inBuff, inEnd);
if ( !inBuff )
return wxCONV_FAILED;
outLen++;
if ( dst )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*dst++ = ch;
}
}
return outLen;
}
size_t
wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
size_t outLen = 0;
wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
for ( size_t n = 0; n < srcLen; n++ )
{
wxUint16 cc[2] = { 0 };
const size_t numChars = encode_utf16(*src++, cc);
if ( numChars == wxCONV_FAILED )
return wxCONV_FAILED;
outLen += numChars * BYTES_PER_CHAR;
if ( outBuff )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*outBuff++ = cc[0];
if ( numChars == 2 )
{
// second character of a surrogate
*outBuff++ = cc[1];
}
}
}
return outLen;
}
// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
const size_t inLen = srcLen / BYTES_PER_CHAR;
size_t outLen = 0;
const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
{
wxUint16 tmp[2];
const wxUint16* tmpEnd = tmp;
tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
tmpEnd++;
if ( inBuff + 1 < inEnd )
{
// Normal case, we have a next character to decode.
tmp[1] = wxUINT16_SWAP_ALWAYS(inBuff[1]);
tmpEnd++;
}
const wxUint16* p = tmp;
const wxUint32 ch = wxDecodeSurrogate(&p, tmpEnd);
if ( !p )
return wxCONV_FAILED;
// Move the real pointer by the same amount as "p" was updated by.
inBuff += p - tmp;
outLen++;
if ( dst )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*dst++ = ch;
}
}
return outLen;
}
size_t
wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
size_t outLen = 0;
wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
{
wxUint16 cc[2] = { 0 };
const size_t numChars = encode_utf16(*src, cc);
if ( numChars == wxCONV_FAILED )
return wxCONV_FAILED;
outLen += numChars * BYTES_PER_CHAR;
if ( outBuff )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
if ( numChars == 2 )
{
// second character of a surrogate
*outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
}
}
}
return outLen;
}
#endif // WC_UTF16/!WC_UTF16
// ============================================================================
// UTF-32
// ============================================================================
// As for UTF-16 above, "straight" is the converter using the byte order of
// this machine and "swap" the one using the opposite byte order, which one is
// BE and which one is LE depends on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight wxMBConvUTF32BE
#define wxMBConvUTF32swap wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap wxMBConvUTF32BE
#define wxMBConvUTF32straight wxMBConvUTF32LE
#endif
// Definitions of the global UTF-32 converter objects.
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
/* static */
size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
{
if ( srcLen == wxNO_LEN )
{
// count the number of bytes in input, including the trailing NULs
const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
for ( srcLen = 1; *inBuff++; srcLen++ )
;
srcLen *= BYTES_PER_CHAR;
}
else // we already have the length
{
// we can only convert an entire number of UTF-32 characters
if ( srcLen % BYTES_PER_CHAR )
return wxCONV_FAILED;
}
return srcLen;
}
// case when in-memory representation is UTF-16
#ifdef WC_UTF16
// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
const size_t inLen = srcLen / BYTES_PER_CHAR;
size_t outLen = 0;
for ( size_t n = 0; n < inLen; n++ )
{
wxUint16 cc[2] = { 0 };
const size_t numChars = encode_utf16(*inBuff++, cc);
if ( numChars == wxCONV_FAILED )
return wxCONV_FAILED;
outLen += numChars;
if ( dst )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*dst++ = cc[0];
if ( numChars == 2 )
{
// second character of a surrogate
*dst++ = cc[1];
}
}
}
return outLen;
}
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
size_t outLen = 0;
for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
{
const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
if ( !src )
return wxCONV_FAILED;
outLen += BYTES_PER_CHAR;
if ( outBuff )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*outBuff++ = ch;
}
}
return outLen;
}
// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
const size_t inLen = srcLen / BYTES_PER_CHAR;
size_t outLen = 0;
for ( size_t n = 0; n < inLen; n++, inBuff++ )
{
wxUint16 cc[2] = { 0 };
const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
if ( numChars == wxCONV_FAILED )
return wxCONV_FAILED;
outLen += numChars;
if ( dst )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*dst++ = cc[0];
if ( numChars == 2 )
{
// second character of a surrogate
*dst++ = cc[1];
}
}
}
return outLen;
}
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
size_t outLen = 0;
for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
{
const wxUint32 ch = wxDecodeSurrogate(&src, srcEnd);
if ( !src )
return wxCONV_FAILED;
outLen += BYTES_PER_CHAR;
if ( outBuff )
{
if ( outLen > dstLen )
return wxCONV_FAILED;
*outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
}
}
return outLen;
}
#else // !WC_UTF16: wchar_t is UTF-32
// ----------------------------------------------------------------------------
// conversions without endianness change
// ----------------------------------------------------------------------------
// Native byte order UTF-32 to UTF-32 wchar_t: this is just a copy.
//
// Returns the number of wide characters in the input or wxCONV_FAILED if the
// input length is invalid or dst is too small.
size_t
wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
                               const char *src, size_t srcLen) const
{
    // get the validated input size in bytes so that the data can be copied
    // with a single memcpy() instead of a hand-written loop
    const size_t numBytes = GetLength(src, srcLen);
    if ( numBytes == wxNO_LEN )
        return wxCONV_FAILED;

    const size_t numChars = numBytes / BYTES_PER_CHAR;

    if ( !dst )
        return numChars;

    if ( numChars > dstLen )
        return wxCONV_FAILED;

    memcpy(dst, src, numBytes);

    return numChars;
}
// UTF-32 wchar_t to native byte order UTF-32: this is just a copy.
//
// Returns the size of the output in bytes or wxCONV_FAILED if dst is too
// small.
size_t
wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
    // for NUL-terminated input the terminator is converted too
    const size_t numChars = srcLen == wxNO_LEN ? wxWcslen(src) + 1 : srcLen;
    const size_t numBytes = numChars * BYTES_PER_CHAR;

    if ( !dst )
        return numBytes;

    if ( numBytes > dstLen )
        return wxCONV_FAILED;

    memcpy(dst, src, numBytes);

    return numBytes;
}
// ----------------------------------------------------------------------------
// endian-reversing conversions
// ----------------------------------------------------------------------------
size_t
wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
srcLen = GetLength(src, srcLen);
if ( srcLen == wxNO_LEN )
return wxCONV_FAILED;
srcLen /= BYTES_PER_CHAR;
if ( dst )
{
if ( dstLen < srcLen )
return wxCONV_FAILED;
const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
for ( size_t n = 0; n < srcLen; n++, inBuff++ )
{
*dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
}
}
return srcLen;
}
size_t
wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
const wchar_t *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
srcLen = wxWcslen(src) + 1;
srcLen *= BYTES_PER_CHAR;
if ( dst )
{
if ( dstLen < srcLen )
return wxCONV_FAILED;
wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
{
*outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
}
}
return srcLen;
}
#endif // WC_UTF16/!WC_UTF16
// ============================================================================
// The classes doing conversion using the iconv_xxx() functions
// ============================================================================
#ifdef HAVE_ICONV
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
// E2BIG if output buffer is _exactly_ as big as needed. Such case is
// (unless there's yet another bug in glibc) the only case when iconv()
// returns with (size_t)-1 (which means error) and says there are 0 bytes
// left in the input buffer -- when _real_ error occurs,
// bytes-left-in-input buffer is non-zero. Hence, this alternative test for
// iconv() failure.
// [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call returning cres and
// leaving bufLeft bytes in its input buffer really failed. For glibc 2.0 and
// 2.1 an E2BIG error with no input left is not considered to be a failure.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
(errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#endif
// Cast the input buffer argument to the type used by iconv() declaration.
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
// The value used for an iconv_t handle which is not open.
#define ICONV_T_INVALID ((iconv_t)-1)
// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the encoding
// corresponding to wchar_t, both depend on the size of wchar_t.
#if SIZEOF_WCHAR_T == 4
#define WC_BSWAP wxUINT32_SWAP_ALWAYS
#define WC_ENC wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
#define WC_BSWAP wxUINT16_SWAP_ALWAYS
#define WC_ENC wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
// does this ever happen?
#error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
// ----------------------------------------------------------------------------
// wxMBConv_iconv: encapsulates an iconv character set
// ----------------------------------------------------------------------------
// Conversion class using iconv(): it keeps one iconv handle for each
// direction of the conversion between the given charset and wchar_t.
class wxMBConv_iconv : public wxMBConv
{
public:
    // open the conversions for the charset with the given (iconv) name, use
    // IsOk() to check if this succeeded
    wxMBConv_iconv(const char *name);
    virtual ~wxMBConv_iconv();

    // base class virtual methods implemented by this class
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const wxOVERRIDE;
    virtual size_t GetMBNulLen() const wxOVERRIDE;

    virtual bool IsUTF8() const wxOVERRIDE;

    // create a new object for the same charset, reusing the already
    // determined NUL length, if any
    virtual wxMBConv *Clone() const wxOVERRIDE
    {
        wxMBConv_iconv *copy = new wxMBConv_iconv(m_name);
        copy->m_minMBCharWidth = m_minMBCharWidth;
        return copy;
    }

    // the object is usable only if both conversion directions are available
    bool IsOk() const
        { return !(m2w == ICONV_T_INVALID || w2m == ICONV_T_INVALID); }

protected:
    // iconv handles for multibyte to wide char conversion and for the
    // conversion in the other direction
    iconv_t m2w,
            w2m;

#if wxUSE_THREADS
    // protects the m2w and w2m handles
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- it remains
    // empty if none is available on this machine
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

    // name of the encoding handled by this conversion
    const char *m_name;

    // cached result of GetMBNulLen(), 0 initially, meaning "unknown"
    size_t m_minMBCharWidth;
};
// Make the constructor available for unit testing: return a new iconv-based
// converter for the given charset name or NULL if it couldn't be created.
WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
{
    wxMBConv_iconv* const conv = new wxMBConv_iconv( name );
    if ( conv->IsOk() )
        return conv;

    // unusable converter: don't leak it
    delete conv;
    return 0;
}
// Definitions of the static members: the charset name is initially empty and
// is filled in by the first wxMBConv_iconv constructor finding a usable wide
// char charset.
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
// Open the iconv conversions between the charset with the given name and
// wchar_t.
//
// The first time this is called, the name of the iconv charset corresponding
// to wchar_t is determined by trying all the candidate names and stored in
// ms_wcCharsetName (and ms_wcNeedsSwap) for reuse by all the other objects.
//
// On failure one or both of m2w and w2m are left set to ICONV_T_INVALID, so
// IsOk() must be checked before using the object.
wxMBConv_iconv::wxMBConv_iconv(const char *name)
              : m_name(wxStrdup(name))
{
    m_minMBCharWidth = 0;

    // Both handles start as invalid: m2w would otherwise remain uninitialized
    // if the list of candidate names below turned out to be empty.
    m2w = ICONV_T_INVALID;
    w2m = ICONV_T_INVALID;

    // check for charset that represents wchar_t:
    if ( ms_wcCharsetName.empty() )
    {
        wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));

#if wxUSE_FONTMAP
        const wxChar *const *names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
#else // !wxUSE_FONTMAP
        static const wxChar *const names_static[] =
        {
#if SIZEOF_WCHAR_T == 4
            wxT("UCS-4"),
#elif SIZEOF_WCHAR_T == 2
            wxT("UCS-2"),
#endif
            NULL
        };
        const wxChar *const *names = names_static;
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP

        for ( ; *names && ms_wcCharsetName.empty(); ++names )
        {
            const wxString nameCS(*names);

            // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
            wxString nameXE(nameCS);

#ifdef WORDS_BIGENDIAN
            nameXE += wxT("BE");
#else // little endian
            nameXE += wxT("LE");
#endif

            wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
                       nameXE.c_str());

            m2w = iconv_open(nameXE.ToAscii(), name);
            if ( m2w == ICONV_T_INVALID )
            {
                // try charset w/o bytesex info (e.g. "UCS4")
                wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
                           nameCS.c_str());
                m2w = iconv_open(nameCS.ToAscii(), name);

                // and check for bytesex ourselves:
                if ( m2w != ICONV_T_INVALID )
                {
                    char    buf[2], *bufPtr;
                    wchar_t wbuf[2];
                    size_t  insz, outsz;
                    size_t  res;

                    buf[0] = 'A';
                    buf[1] = 0;
                    wbuf[0] = 0;
                    insz = 2;
                    outsz = SIZEOF_WCHAR_T * 2;
                    char* wbufPtr = (char*)wbuf;
                    bufPtr = buf;

                    res = iconv(
                        m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
                        &wbufPtr, &outsz);

                    if (ICONV_FAILED(res, insz))
                    {
                        wxLogLastError(wxT("iconv"));
                        wxLogError(_("Conversion to charset '%s' doesn't work."),
                                   nameCS.c_str());

                        // This handle is useless, close it now: it would be
                        // leaked when m2w is overwritten while trying the
                        // next candidate name.
                        iconv_close(m2w);
                        m2w = ICONV_T_INVALID;
                    }
                    else // ok, can convert to this encoding, remember it
                    {
                        ms_wcCharsetName = nameCS;
                        ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
                    }
                }
            }
            else // use charset not requiring byte swapping
            {
                ms_wcCharsetName = nameXE;
            }
        }

        wxLogTrace(TRACE_STRCONV,
                   wxT("iconv wchar_t charset is \"%s\"%s"),
                   ms_wcCharsetName.empty() ? wxString("<none>")
                                            : ms_wcCharsetName,
                   ms_wcNeedsSwap ? wxT(" (needs swap)")
                                  : wxT(""));
    }
    else // we already have ms_wcCharsetName
    {
        m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
    }

    if ( ms_wcCharsetName.empty() )
    {
        w2m = ICONV_T_INVALID;
    }
    else
    {
        w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
        if ( w2m == ICONV_T_INVALID )
        {
            wxLogTrace(TRACE_STRCONV,
                       wxT("\"%s\" -> \"%s\" works but not the converse!?"),
                       ms_wcCharsetName.c_str(), name);
        }
    }
}
// Close the iconv handles which had been successfully opened and free the
// copy of the charset name made in the constructor.
wxMBConv_iconv::~wxMBConv_iconv()
{
    if ( m2w != ICONV_T_INVALID )
        iconv_close(m2w);

    if ( w2m != ICONV_T_INVALID )
        iconv_close(w2m);

    free(const_cast<char *>(m_name));
}
// Convert the multibyte string src to wide characters using iconv().
//
// dst    - output buffer or NULL to only compute the required length
// dstLen - size of dst in wide characters
// src    - the multibyte source string
// srcLen - length of src in bytes or wxNO_LEN if it is NUL-terminated (the
//          terminating NUL(s) are then converted too)
//
// Returns the number of wide characters produced or wxCONV_FAILED on error.
size_t
wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
const char *src, size_t srcLen) const
{
if ( srcLen == wxNO_LEN )
{
// find the string length: notice that must be done differently for
// NUL-terminated strings and UTF-16/32 which are terminated with 2/4
// consecutive NULs
const size_t nulLen = GetMBNulLen();
switch ( nulLen )
{
default:
return wxCONV_FAILED;
case 1:
srcLen = strlen(src); // arguably more optimized than our version
break;
case 2:
case 4:
// for UTF-16/32 not only we need to have 2/4 consecutive NULs
// but they also have to start at character boundary and not
// span two adjacent characters
const char *p;
for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
;
srcLen = p - src;
break;
}
// when we're determining the length of the string ourselves we count
// the terminating NUL(s) as part of it and always NUL-terminate the
// output
srcLen += nulLen;
}
// we express length in the number of (wide) characters but iconv always
// counts buffer sizes in bytes
dstLen *= SIZEOF_WCHAR_T;
#if wxUSE_THREADS
// NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
// Unfortunately there are a couple of global wxCSConv objects such as
// wxConvLocal that are used all over wx code, so we have to make sure
// the handle is used by at most one thread at the time. Otherwise
// only a few wx classes would be safe to use from non-main threads
// as MB<->WC conversion would fail "randomly".
wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
#endif // wxUSE_THREADS
size_t res, cres;
const char *pszPtr = src;
if ( dst )
{
char* bufPtr = (char*)dst;
// have destination buffer, convert there
size_t dstLenOrig = dstLen;
cres = iconv(m2w,
ICONV_CHAR_CAST(&pszPtr), &srcLen,
&bufPtr, &dstLen);
// convert the number of bytes converted as returned by iconv to the
// number of (wide) characters converted that we need
res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
if (ms_wcNeedsSwap)
{
// convert to native endianness
for ( unsigned i = 0; i < res; i++ )
dst[i] = WC_BSWAP(dst[i]);
}
}
else // no destination buffer
{
// convert using temp buffer to calculate the size of the buffer needed
// NOTE(review): tbuf has room for 256 characters but only 8 of them are
// used for each iconv() call below -- confirm if this is intentional.
wchar_t tbuf[256];
res = 0;
do
{
char* bufPtr = (char*)tbuf;
dstLen = 8 * SIZEOF_WCHAR_T;
cres = iconv(m2w,
ICONV_CHAR_CAST(&pszPtr), &srcLen,
&bufPtr, &dstLen );
// add the number of characters written by this call
res += 8 - (dstLen / SIZEOF_WCHAR_T);
}
// continue for as long as iconv() runs out of output space
while ((cres == (size_t)-1) && (errno == E2BIG));
}
if (ICONV_FAILED(cres, srcLen))
{
//VS: it is ok if iconv fails, hence trace only
wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
return wxCONV_FAILED;
}
return res;
}
// Convert the wide character string src to multibyte using iconv().
//
// dst    - output buffer or NULL to only compute the required length
// dstLen - size of dst in bytes
// src    - the wide character source string
// srcLen - length of src in wide characters or wxNO_LEN if it is
//          NUL-terminated (the terminating NUL is then converted too)
//
// Returns the number of bytes produced or wxCONV_FAILED if the conversion
// failed or the temporary buffer needed for byte swapping couldn't be
// allocated.
size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
                                 const wchar_t *src, size_t srcLen) const
{
#if wxUSE_THREADS
    // NB: the iconv handles can't be used from several threads at once, see
    //     the comment in ToWChar()
    wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
#endif

    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
    size_t outbuflen = dstLen;
    size_t res, cres;

    wchar_t *tmpbuf = 0;

    if (ms_wcNeedsSwap)
    {
        // need to copy to temp buffer to switch endianness
        // (doing WC_BSWAP twice on the original buffer won't work, as it
        //  could be in read-only memory, or be accessed in some other thread)
        tmpbuf = (wchar_t *)malloc(inbuflen);

        // malloc(0) may legitimately return NULL, so only a failure to
        // allocate a non-empty buffer is an error
        if ( !tmpbuf && inbuflen != 0 )
            return wxCONV_FAILED;

        for ( size_t i = 0; i < srcLen; i++ )
            tmpbuf[i] = WC_BSWAP(src[i]);

        src = tmpbuf;
    }

    char* inbuf = (char*)src;
    if ( dst )
    {
        // have destination buffer, convert there
        cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);

        res = dstLen - outbuflen;
    }
    else // no destination buffer
    {
        // convert using temp buffer to calculate the size of the buffer needed
        char tbuf[256];
        res = 0;
        do
        {
            dst = tbuf;
            outbuflen = WXSIZEOF(tbuf);

            cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);

            res += WXSIZEOF(tbuf) - outbuflen;
        }
        while ((cres == (size_t)-1) && (errno == E2BIG));
    }

    // tmpbuf is NULL if no swapping was needed and free(NULL) does nothing
    free(tmpbuf);

    if (ICONV_FAILED(cres, inbuflen))
    {
        wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsgStr(wxSysErrorCode()));
        return wxCONV_FAILED;
    }

    return res;
}
// Return the number of bytes used for the terminating NUL in this encoding
// or (size_t)-1 if it couldn't be determined.
//
// The value is computed by converting a wide NUL on first use and is then
// cached in m_minMBCharWidth.
size_t wxMBConv_iconv::GetMBNulLen() const
{
    if ( m_minMBCharWidth == 0 )
    {
        wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);

#if wxUSE_THREADS
        // NB: the iconv handles can't be used from several threads at once
        wxMutexLocker lock(self->m_iconvMutex);
#endif

        const wchar_t *wideNul = L"";
        char nulBytes[8]; // should be enough for NUL in any encoding

        size_t inLeft = sizeof(wchar_t);
        size_t outLeft = WXSIZEOF(nulBytes);
        char *inPtr = (char *)wideNul;
        char *outPtr = nulBytes;

        const size_t rc = iconv(w2m, ICONV_CHAR_CAST(&inPtr), &inLeft,
                                &outPtr, &outLeft);
        if ( rc != (size_t)-1 )
        {
            // the NUL length is the number of bytes written
            self->m_minMBCharWidth = outPtr - nulBytes;
        }
        else // conversion failed
        {
            self->m_minMBCharWidth = (size_t)-1;
        }
    }

    return m_minMBCharWidth;
}
// Return true if the name of this conversion is "UTF-8" or "UTF8", compared
// case-insensitively.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif // HAVE_ICONV
// ============================================================================
// Win32 conversion classes
// ============================================================================
#ifdef wxHAVE_WIN32_MB2WC
// Functions from utils.cpp returning the code page corresponding to the
// given charset name or font encoding.
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
#endif
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
// Conversion class using Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions with the given code page.
class wxMBConv_win32 : public wxMBConv
{
public:
// Default ctor uses CP_ACP code page.
wxMBConv_win32()
{
m_CodePage = CP_ACP;
m_minMBCharWidth = 0;
}
// Copy ctor copies the code page and the cached NUL length.
wxMBConv_win32(const wxMBConv_win32& conv)
: wxMBConv()
{
m_CodePage = conv.m_CodePage;
m_minMBCharWidth = conv.m_minMBCharWidth;
}
#if wxUSE_FONTMAP
// Ctor using the code page returned by wxCharsetToCodepage() for the given
// charset name, use IsOk() to check if it is valid.
wxMBConv_win32(const char* name)
{
m_CodePage = wxCharsetToCodepage(name);
m_minMBCharWidth = 0;
}
#endif // wxUSE_FONTMAP
// Ctor using the code page returned by wxEncodingToCodepage() for the given
// encoding, use IsOk() to check if it is valid.
wxMBConv_win32(wxFontEncoding encoding)
{
m_CodePage = wxEncodingToCodepage(encoding);
m_minMBCharWidth = 0;
}
// Convert the NUL-terminated multibyte string psz to wide characters.
//
// buf is the output buffer of size n or NULL to only compute the length.
// Returns the length of the result, not counting the terminating NUL, or
// wxCONV_FAILED on error.
virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const wxOVERRIDE
{
// note that we have to use MB_ERR_INVALID_CHARS flag as without it
// the behaviour is not compatible with the Unix version (using iconv)
// and breaks the library itself, e.g. wxTextInputStream::NextChar()
// wouldn't work if reading an incomplete MB char didn't result in an
// error
//
// Moreover, MB_ERR_INVALID_CHARS is not supported for UTF-8 under XP
// and for UTF-7 under any Windows version, so we always use our own
// conversions in this case.
if ( m_CodePage == CP_UTF8 )
{
return wxMBConvUTF8().MB2WC(buf, psz, n);
}
if ( m_CodePage == CP_UTF7 )
{
return wxMBConvUTF7().MB2WC(buf, psz, n);
}
const size_t len = ::MultiByteToWideChar
(
m_CodePage, // code page
MB_ERR_INVALID_CHARS, // flags: fall on error
psz, // input string
-1, // its length (NUL-terminated)
buf, // output string
buf ? n : 0 // size of output buffer
);
if ( !len )
return wxCONV_FAILED;
// note that it returns count of written chars for buf != NULL and size
// of the needed buffer for buf == NULL so in either case the length of
// the string (which never includes the terminating NUL) is one less
return len - 1;
}
// Convert the NUL-terminated wide string pwz to multibyte.
//
// buf is the output buffer of size n or NULL to only compute the length.
// Returns the length of the result, not counting the terminating NUL, or
// wxCONV_FAILED if the conversion failed or was lossy.
virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const wxOVERRIDE
{
/*
We need to use WC_NO_BEST_FIT_CHARS to prevent WideCharToMultiByte()
from replacing characters unrepresentable in the target code page
with bad quality approximations such as turning "1/2" symbol
(U+00BD) into "1" for the code pages which don't have the fraction
symbol.
Unfortunately this flag can't be used with CJK encodings nor
UTF-7/8 and so if the code page is one of those, we need to resort
to a round trip to verify that no replacements have been done.
*/
BOOL usedDef wxDUMMY_INITIALIZE(false);
BOOL *pUsedDef;
int flags;
// NOTE(review): the flag is used for all code pages less than 50000 --
// confirm that this excludes all the encodings mentioned above.
if ( m_CodePage < 50000 )
{
// it's our lucky day
flags = WC_NO_BEST_FIT_CHARS;
pUsedDef = &usedDef;
}
else // old system or unsupported encoding
{
flags = 0;
pUsedDef = NULL;
}
const size_t len = ::WideCharToMultiByte
(
m_CodePage, // code page
flags, // either none or no best fit
pwz, // input string
-1, // it is (wide) NUL-terminated
buf, // output buffer
buf ? n : 0, // and its size
NULL, // default "replacement" char
pUsedDef // [out] was it used?
);
if ( !len )
{
// function totally failed
return wxCONV_FAILED;
}
// we did something, check if we really succeeded
if ( flags )
{
// check if the conversion failed, i.e. if any replacements
// were done
if ( usedDef )
return wxCONV_FAILED;
}
else // we must resort to double tripping...
{
// first we need to ensure that we really have the MB data: this is
// not the case if we're called with NULL buffer, in which case we
// need to do the conversion yet again
wxCharBuffer bufDef;
if ( !buf )
{
bufDef = wxCharBuffer(len);
buf = bufDef.data();
if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
buf, len, NULL, NULL) )
return wxCONV_FAILED;
}
// convert the result back to wide characters and compare it with the
// original string
if ( !n )
n = wcslen(pwz);
wxWCharBuffer wcBuf(n);
if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
wcscmp(wcBuf, pwz) != 0 )
{
// we didn't obtain the same thing we started from, hence
// the conversion was lossy and we consider that it failed
return wxCONV_FAILED;
}
}
// as in MB2WC(), the returned length doesn't include the terminating
// NUL, hence "len - 1"
return len - 1;
}
// Return the number of bytes used for the terminating NUL in this code page
// (1, 2 or 4) or (size_t)-1 if it couldn't be determined. The value is
// computed on first use and cached in m_minMBCharWidth.
virtual size_t GetMBNulLen() const wxOVERRIDE
{
if ( m_minMBCharWidth == 0 )
{
int len = ::WideCharToMultiByte
(
m_CodePage, // code page
0, // no flags
L"", // input string
1, // translate just the NUL
NULL, // output buffer
0, // and its size
NULL, // no replacement char
NULL // [out] don't care if it was used
);
wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
switch ( len )
{
default:
wxLogDebug(wxT("Unexpected NUL length %d"), len);
self->m_minMBCharWidth = (size_t)-1;
break;
case 0:
// the conversion function failed
self->m_minMBCharWidth = (size_t)-1;
break;
case 1:
case 2:
case 4:
self->m_minMBCharWidth = len;
break;
}
}
return m_minMBCharWidth;
}
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_win32(*this); }
// The object is usable if the code page is not -1.
bool IsOk() const { return m_CodePage != -1; }
private:
// the code page we're working with
long m_CodePage;
// cached result of GetMBNulLen(), set to 0 initially meaning
// "unknown"
size_t m_minMBCharWidth;
};
#endif // wxHAVE_WIN32_MB2WC
// ============================================================================
// wxEncodingConverter based conversion classes
// ============================================================================
#if wxUSE_FONTMAP
// Conversion class based on wxEncodingConverter: one converter object is
// used for each direction of the conversion between m_enc and Unicode.
class wxMBConv_wxwin : public wxMBConv
{
private:
    // Initialize both converters for m_enc and remember in m_ok whether this
    // succeeded.
    void Init()
    {
        // Refuse to use broken wxEncodingConverter code for Mac-specific
        // encodings. The wxMBConv_cf class does a better job.
        const bool isMacEncoding = m_enc >= wxFONTENCODING_MACMIN &&
                                   m_enc <= wxFONTENCODING_MACMAX;

        m_ok = !isMacEncoding &&
               m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
               w2m.Init(wxFONTENCODING_UNICODE, m_enc);
    }

public:
    // temporarily just use wxEncodingConverter stuff,
    // so that it works while a better implementation is built

    // create the conversion for the charset with the given name or for
    // wxFONTENCODING_SYSTEM if the name is NULL
    wxMBConv_wxwin(const char* name)
    {
        m_enc = name ? wxFontMapperBase::Get()->CharsetToEncoding(name, false)
                     : wxFONTENCODING_SYSTEM;

        Init();
    }

    // create the conversion for the given encoding
    wxMBConv_wxwin(wxFontEncoding enc)
    {
        m_enc = enc;

        Init();
    }

    // Convert psz to wide characters if buf is not NULL. Returns the length
    // of psz or wxCONV_FAILED if the conversion failed.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const wxOVERRIDE
    {
        const size_t srcLen = strlen(psz);

        if ( buf && !m2w.Convert(psz, buf) )
            return wxCONV_FAILED;

        return srcLen;
    }

    // Convert psz to multibyte if buf is not NULL. Returns the length of psz
    // or wxCONV_FAILED if the conversion failed.
    size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const wxOVERRIDE
    {
        const size_t srcLen = wxWcslen(psz);

        if ( buf && !w2m.Convert(psz, buf) )
            return wxCONV_FAILED;

        return srcLen;
    }

    // The terminating NUL takes 2 bytes in UTF-16, 4 in UTF-32 and 1 in all
    // the other encodings.
    virtual size_t GetMBNulLen() const wxOVERRIDE
    {
        if ( m_enc == wxFONTENCODING_UTF16BE ||
             m_enc == wxFONTENCODING_UTF16LE )
            return 2;

        if ( m_enc == wxFONTENCODING_UTF32BE ||
             m_enc == wxFONTENCODING_UTF32LE )
            return 4;

        return 1;
    }

    virtual wxMBConv *Clone() const wxOVERRIDE { return new wxMBConv_wxwin(m_enc); }

    bool IsOk() const { return m_ok; }

public:
    // the encoding being converted and the converters for both directions
    wxFontEncoding m_enc;
    wxEncodingConverter m2w, w2m;

private:
    // were we initialized successfully?
    bool m_ok;

    wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
};
// Make the constructors available for unit testing: return a new
// wxEncodingConverter-based converter for the given charset name or NULL if
// it couldn't be created.
WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
{
    wxMBConv_wxwin* const conv = new wxMBConv_wxwin( name );
    if ( conv->IsOk() )
        return conv;

    // unusable converter: don't leak it
    delete conv;
    return 0;
}
#endif // wxUSE_FONTMAP
// ============================================================================
// wxCSConv implementation
// ============================================================================
// Reset the members to the state in which the object has neither a name nor
// a real conversion object.
void wxCSConv::Init()
{
    m_convReal = NULL;
    m_name = NULL;
}
// Set m_encoding from the given value, replacing the special values
// (wxFONTENCODING_MAX, wxFONTENCODING_SYSTEM and wxFONTENCODING_DEFAULT)
// with the encoding really used in this context.
void wxCSConv::SetEncoding(wxFontEncoding encoding)
{
    if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_SYSTEM )
    {
        if ( m_name )
        {
            // Not having an encoding value is fine when we have a name for
            // it.
            m_encoding = wxFONTENCODING_SYSTEM;
            return;
        }

        // We have neither a name nor an encoding: fall back to the system
        // default encoding in this case (not sure how much sense does this
        // make but this is how the old code used to behave).
#if wxUSE_INTL
        const wxFontEncoding encSystem = wxLocale::GetSystemEncoding();
        m_encoding = encSystem == wxFONTENCODING_SYSTEM
                        ? wxFONTENCODING_ISO8859_1
                        : encSystem;
#else // !wxUSE_INTL
        m_encoding = wxFONTENCODING_ISO8859_1;
#endif // wxUSE_INTL/!wxUSE_INTL
        return;
    }

    // wxFONTENCODING_DEFAULT is same as US-ASCII in this context, anything
    // else is just used as is.
    m_encoding = encoding == wxFONTENCODING_DEFAULT ? wxFONTENCODING_ISO8859_1
                                                    : encoding;
}
// Create a converter for the charset with the given name.
//
// A non-empty name is remembered (as ASCII) so that it can be passed to the
// underlying conversion methods. The encoding is looked up from the name if
// wxUSE_FONTMAP is enabled, otherwise wxFONTENCODING_SYSTEM is used.
wxCSConv::wxCSConv(const wxString& charset)
{
    Init();

    // The name must be set before the encoding, as SetEncoding() checks it.
    if ( !charset.empty() )
        SetName(charset.ToAscii());

#if wxUSE_FONTMAP
    const wxFontEncoding enc = wxFontMapperBase::GetEncodingFromName(charset);
#else
    const wxFontEncoding enc = wxFONTENCODING_SYSTEM;
#endif
    SetEncoding(enc);

    m_convReal = DoCreate();
}
// Create a converter for the given encoding.
//
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid here: they
// trigger an assertion failure and are replaced by wxFONTENCODING_SYSTEM.
wxCSConv::wxCSConv(wxFontEncoding encoding)
{
    const bool isValid = encoding != wxFONTENCODING_MAX &&
                         encoding != wxFONTENCODING_DEFAULT;
    if ( !isValid )
    {
        wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );

        encoding = wxFONTENCODING_SYSTEM;
    }

    Init();

    SetEncoding(encoding);

    m_convReal = DoCreate();
}
// Release the charset name and the real converter owned by this object.
wxCSConv::~wxCSConv()
{
    Clear();
}
// Copy constructor: duplicate the charset name and encoding of the other
// object and create our own real converter for them instead of sharing the
// one owned by conv.
wxCSConv::wxCSConv(const wxCSConv& conv)
        : wxMBConv()
{
    Init();

    // The name must be copied first, as SetEncoding() checks it.
    SetName(conv.m_name);
    SetEncoding(conv.m_encoding);

    m_convReal = DoCreate();
}
// Assignment operator: replace the charset name, encoding and real converter
// of this object with copies of (or, for the converter, a new object created
// from) those of conv.
//
// Returns a reference to this object.
wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
{
    // Self-assignment must be a no-op: Clear() frees m_name and resets it to
    // NULL, so copying the name from ourselves afterwards would silently lose
    // it and SetEncoding() could then select a different encoding.
    if ( this != &conv )
    {
        Clear();

        // The name must be copied first, as SetEncoding() checks it.
        SetName(conv.m_name);
        SetEncoding(conv.m_encoding);

        m_convReal = DoCreate();
    }

    return *this;
}
// Free the charset name and destroy the real converter, leaving both
// pointers reset so that the object can be reinitialized or destroyed.
void wxCSConv::Clear()
{
    // m_name is allocated by wxStrdup() in SetName() and released with free()
    free(m_name);
    m_name = NULL;

    wxDELETE(m_convReal);
}
// Store a copy of the given charset name in m_name.
//
// A NULL charset leaves m_name unchanged. Any previous value of m_name is
// not freed here: within this file the function is only called right after
// Init() or Clear().
void wxCSConv::SetName(const char *charset)
{
    if ( !charset )
        return;

    m_name = wxStrdup(charset);
}
#if wxUSE_FONTMAP
// Cache used by wxCSConv::DoCreate() to remember, for each encoding, the
// charset name with which an iconv-based converter was successfully created;
// an empty string is stored to remember that none of the known names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
wxEncodingNameCache );
static wxEncodingNameCache gs_nameCache;
#endif
// Create the converter object doing the real work for the current values of
// m_name and m_encoding.
//
// Returns a newly allocated converter (the callers in this file store it in
// m_convReal, which is deleted by Clear()) or NULL. NULL is returned both
// for wxFONTENCODING_ISO8859_1, which doesn't need any converter object, and
// when none of the conversion methods tried below can be used.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
wxLogTrace(TRACE_STRCONV,
wxT("creating conversion for %s"),
(m_name ? m_name
: (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP
// check for the special case of ASCII or ISO8859-1 charset: as we have
// special knowledge of it anyhow, we don't need to create a special
// conversion object
if ( m_encoding == wxFONTENCODING_ISO8859_1 )
{
// don't convert at all
return NULL;
}
// we trust OS to do conversion better than we can so try external
// conversion methods first
//
// the full order is:
// 1. OS conversion (iconv() under Unix or Win32 API)
// 2. hard coded conversions for UTF
// 3. wxEncodingConverter as fall back
// step (1)
#ifdef HAVE_ICONV
// without wxUSE_FONTMAP iconv can only be tried if we have a charset name,
// with it the names corresponding to the encoding are tried as well
#if !wxUSE_FONTMAP
if ( m_name )
#endif // !wxUSE_FONTMAP
{
#if wxUSE_FONTMAP
wxFontEncoding encoding(m_encoding);
#endif
if ( m_name )
{
// try the name exactly as it was given to us first
wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
if ( conv->IsOk() )
return conv;
delete conv;
#if wxUSE_FONTMAP
// this didn't work, so map the name to an encoding in order to try the
// other names of this encoding below
encoding =
wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
}
#if wxUSE_FONTMAP
{
// check if we already know which name works for this encoding
const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
if ( it != gs_nameCache.end() )
{
// an empty name is the cached failure stored at the end of this block;
// note that we return immediately in this case, without trying the
// remaining conversion methods
if ( it->second.empty() )
return NULL;
wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
if ( conv->IsOk() )
return conv;
delete conv;
}
const wxChar* const* names = wxFontMapperBase::GetAllEncodingNames(encoding);
// CS : in case this does not return valid names (eg for MacRoman)
// encoding got a 'failure' entry in the cache all the same,
// although it just has to be created using a different method, so
// only store failed iconv creation attempts (or perhaps we
// shouldn't do this at all ?)
if ( names[0] != NULL )
{
// try all the names in turn and remember the first one which works
for ( ; *names; ++names )
{
// FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
// will need changes that will obsolete this
wxString name(*names);
wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
if ( conv->IsOk() )
{
gs_nameCache[encoding] = *names;
return conv;
}
delete conv;
}
gs_nameCache[encoding] = wxT(""); // cache the failure
}
}
#endif // wxUSE_FONTMAP
}
#endif // HAVE_ICONV
#ifdef wxHAVE_WIN32_MB2WC
{
// the charset name, if we have one, takes precedence over the encoding
// value when wxUSE_FONTMAP is enabled
#if wxUSE_FONTMAP
wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
: new wxMBConv_win32(m_encoding);
#else
wxMBConv_win32* conv = new wxMBConv_win32(m_encoding);
#endif
if ( conv->IsOk() )
return conv;
delete conv;
}
#endif // wxHAVE_WIN32_MB2WC
#ifdef __DARWIN__
{
// leave UTF16 and UTF32 to the built-ins of wx
if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
{
#if wxUSE_FONTMAP
wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
: new wxMBConv_cf(m_encoding);
#else
wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif
if ( conv->IsOk() )
return conv;
delete conv;
}
}
#endif // __DARWIN__
// step (2)
wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
// if we only have the name, find the encoding corresponding to it to check
// if it is one of the UTF encodings handled below
if ( enc == wxFONTENCODING_SYSTEM && m_name )
{
// use "false" to suppress interactive dialogs -- we can be called from
// anywhere and popping up a dialog from here is the last thing we want to
// do
enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
}
#endif // wxUSE_FONTMAP
switch ( enc )
{
case wxFONTENCODING_UTF7:
return new wxMBConvUTF7;
case wxFONTENCODING_UTF8:
return new wxMBConvUTF8;
case wxFONTENCODING_UTF16BE:
return new wxMBConvUTF16BE;
case wxFONTENCODING_UTF16LE:
return new wxMBConvUTF16LE;
case wxFONTENCODING_UTF32BE:
return new wxMBConvUTF32BE;
case wxFONTENCODING_UTF32LE:
return new wxMBConvUTF32LE;
default:
// nothing to do but put here to suppress gcc warnings
break;
}
// step (3)
#if wxUSE_FONTMAP
{
wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
: new wxMBConv_wxwin(m_encoding);
if ( conv->IsOk() )
return conv;
delete conv;
}
// nothing worked: just trace this and return NULL below
wxLogTrace(TRACE_STRCONV,
wxT("encoding \"%s\" is not supported by this system"),
(m_name ? wxString(m_name)
: wxFontMapperBase::GetEncodingName(m_encoding)));
#endif // wxUSE_FONTMAP
return NULL;
}
// Return true if this object can be used for conversions.
//
// No real converter is created for wxFONTENCODING_ISO8859_1, as this
// conversion is done by this class itself, so it is always ok. Otherwise a
// non-NULL m_convReal is sufficient: DoCreate() only returns converters
// whose own IsOk() check succeeded.
bool wxCSConv::IsOk() const
{
    return m_encoding == wxFONTENCODING_ISO8859_1 || m_convReal != NULL;
}
// Convert srcLen bytes of src to wide characters stored in dst.
//
// The call is forwarded to the real converter if there is one. Otherwise
// the input is treated as Latin-1 and each byte is mapped directly to the
// wide character with the same (unsigned) value.
//
// If srcLen is wxNO_LEN, src must be NUL-terminated and the trailing NUL is
// converted too. If dst is NULL, nothing is written and only the required
// length is returned.
//
// Returns the number of wide characters (to be) written, or wxCONV_FAILED
// if dstLen is too small for the output.
size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
                         const char *src, size_t srcLen) const
{
    if ( m_convReal )
        return m_convReal->ToWChar(dst, dstLen, src, srcLen);

    // latin-1 (direct): one output character per input byte
    const size_t len = srcLen == wxNO_LEN ? strlen(src) + 1 // with NUL
                                          : srcLen;

    if ( !dst )
        return len;

    if ( dstLen < len )
        return wxCONV_FAILED;

    const char * const end = src + len;
    wchar_t *out = dst;
    for ( const char *in = src; in != end; ++in )
        *out++ = (unsigned char)*in;

    return len;
}
// Convert srcLen wide characters of src to multibyte characters stored in
// dst.
//
// The call is forwarded to the real converter if there is one. Otherwise
// the output is Latin-1 and each wide character is stored as the byte with
// the same value; characters greater than 0xFF can't be represented and make
// the conversion fail.
//
// If srcLen is wxNO_LEN, src must be NUL-terminated and the trailing NUL is
// converted too. If dst is NULL, nothing is written but the input is still
// validated.
//
// Returns the number of bytes (to be) written, or wxCONV_FAILED if dstLen is
// too small or the input contains an unrepresentable character. On failure
// dst may have been partially filled.
size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
                           const wchar_t *src, size_t srcLen) const
{
    if ( m_convReal )
        return m_convReal->FromWChar(dst, dstLen, src, srcLen);

    // latin-1 (direct): one output byte per input character
    if ( srcLen == wxNO_LEN )
        srcLen = wxWcslen(src) + 1;

    if ( dst && dstLen < srcLen )
        return wxCONV_FAILED;

    for ( size_t i = 0; i < srcLen; ++i )
    {
        const wchar_t wc = src[i];

        // the input validity needs to be checked even without output buffer
        if ( wc > 0xFF )
            return wxCONV_FAILED;

        if ( dst )
            dst[i] = (char)wc;
    }

    return srcLen;
}
// Return the size of the terminating NUL in bytes: this is determined by the
// real converter if there is one and is 1 otherwise, as we are ISO-8859-1
// then.
size_t wxCSConv::GetMBNulLen() const
{
    return m_convReal ? m_convReal->GetMBNulLen() : 1;
}
// Return whether this converter uses UTF-8: this is determined by the real
// converter if there is one and is false otherwise, as we are ISO-8859-1
// then.
bool wxCSConv::IsUTF8() const
{
    return m_convReal ? m_convReal->IsUTF8() : false;
}
// ============================================================================
// wxWhateverWorksConv
// ============================================================================
// Convert to wide characters using the first conversion which succeeds:
// UTF-8 is tried first, then wxConvLibc and finally ISO-8859-1.
//
// Returns the result of the first successful conversion or, if all of them
// failed, the result of the last one.
size_t
wxWhateverWorksConv::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
{
    size_t rc = wxConvUTF8.ToWChar(dst, dstLen, src, srcLen);
    if ( rc == wxCONV_FAILED )
    {
        rc = wxConvLibc.ToWChar(dst, dstLen, src, srcLen);
        if ( rc == wxCONV_FAILED )
            rc = wxConvISO8859_1.ToWChar(dst, dstLen, src, srcLen);
    }

    return rc;
}
// Convert from wide characters using wxConvLibc if this works and UTF-8
// otherwise.
//
// Returns the result of the first successful conversion or, if both of them
// failed, the result of the UTF-8 one.
size_t
wxWhateverWorksConv::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    const size_t rcLibc = wxConvLibc.FromWChar(dst, dstLen, src, srcLen);

    return rcLibc != wxCONV_FAILED
            ? rcLibc
            : wxConvUTF8.FromWChar(dst, dstLen, src, srcLen);
}
// ----------------------------------------------------------------------------
// globals
// ----------------------------------------------------------------------------
// NB: The reason why we create converter objects in this convoluted way,
// using a factory function instead of a global variable, is that they
// may be used at static initialization time (some of them are used by
// wxString ctors and there may be a global wxString object). In other
// words, possibly _before_ the converter global object would be
// initialized.
// The names below are used as plain identifiers in the definitions which
// follow, so any macros with these names (presumably defined in the header
// -- TODO confirm) must be undefined first.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvWhateverWorks
#undef wxConvLocal
#undef wxConvISO8859_1
// Define everything needed for the global converter "name":
//  - the name##Ptr global pointer of type klass*, initially NULL
//  - the wxGet_##name##Ptr() accessor returning the address of a static
//    object of type impl_klass local to this function and constructed
//    using ctor_args
//  - a file-static variable initialized by calling the accessor
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
{ \
static impl_klass name##Obj ctor_args; \
return &name##Obj; \
} \
/* this ensures that all global converter objects are created */ \
/* by the time static initialization is done, i.e. before any */ \
/* thread is launched: */ \
static klass* gs_##name##instance = wxGet_##name##Ptr()
// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the object is of
// the same class as the one used in the accessor return type.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
#ifdef __INTELC__
// disable warning "variable 'xxx' was declared but never referenced"
#pragma warning(disable: 177)
#endif // Intel C++
// wxConvLibc is always accessed as wxMBConv but the class of the object
// implementing it depends on the platform
#ifdef __WINDOWS__
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc, (wxFONTENCODING_UTF8));
#else
WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
// passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
// provokes an error message about "not enough macro parameters"; and we
// can't use "()" here as the name##Obj declaration would be parsed as a
// function declaration then, so use a semicolon and live with an extra
// empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
WX_DEFINE_GLOBAL_CONV(wxWhateverWorksConv, wxConvWhateverWorks, ;);
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
// the global converter pointers: wxConvCurrent initially points to the
// wxConvLibc object and wxConvUI to the wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
#ifdef __DARWIN__
// It is important to use this conversion object under Darwin as it ensures
// that Unicode strings are (re)composed correctly even though xnu kernel uses
// decomposed form internally (at least for the file names).
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif
// the converter used for the file names: the special object above under
// Darwin and wxConvWhateverWorks elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
&wxConvMacUTF8DObj;
#else // !__DARWIN__
wxGet_wxConvWhateverWorksPtr();
#endif // __DARWIN__/!__DARWIN__