Merge fixes for handling Unicode characters outside of BMP

Closes https://github.com/wxWidgets/wxWidgets/pull/467
This commit is contained in:
Vadim Zeitlin
2017-06-21 18:53:40 +02:00
10 changed files with 483 additions and 119 deletions

View File

@@ -175,6 +175,16 @@
#define wxUSE_UTF8_LOCALE_ONLY 0
#endif
/*
   Define wxUSE_UNICODE_UTF16 as 1 when wxUSE_UNICODE_WCHAR is on and
   SIZEOF_WCHAR_T is 2, and as 0 in every other configuration.

   SIZEOF_WCHAR_T is required for this test, so refuse to continue without it.
 */
#if !defined(SIZEOF_WCHAR_T)
    #error "SIZEOF_WCHAR_T must be defined before including this file in wx/defs.h"
#endif

#if wxUSE_UNICODE_WCHAR && (SIZEOF_WCHAR_T == 2)
    #define wxUSE_UNICODE_UTF16 1
#else
    #define wxUSE_UNICODE_UTF16 0
#endif
/* define char type used by wxString internal representation: */
#if wxUSE_UNICODE_WCHAR
typedef wchar_t wxStringCharType;

View File

@@ -672,47 +672,6 @@ typedef short int WXTYPE;
/* breaks C++ code) */
#include <stddef.h>
#ifdef __cplusplus
// everybody gets the assert and other debug macros
#include "wx/debug.h"
// Destroy the object ptr points to, if any, and reset ptr to NULL so that the
// pointer can't be used (or deleted) a second time by mistake.
template <typename T>
inline void wxDELETE(T*& ptr)
{
    // sizeof() can't be applied to an incomplete type, so this typedef turns
    // an attempt to delete an object of a forward-declared class into a
    // compilation error
    typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED;

    if ( ptr == NULL )
        return;

    delete ptr;
    ptr = NULL;
}
// Array counterpart of wxDELETE(): destroy the array ptr points to, if any,
// using delete [] and reset ptr to NULL.
template <typename T>
inline void wxDELETEA(T*& ptr)
{
    // sizeof() can't be applied to an incomplete type, so this typedef turns
    // an attempt to delete an array of a forward-declared class into a
    // compilation error
    typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED;

    if ( ptr == NULL )
        return;

    delete [] ptr;
    ptr = NULL;
}
// Minimal std::swap() replacement: exchanges the values of first and second
// using one copy construction and two assignments.
template <typename T>
inline void wxSwap(T& first, T& second)
{
    T saved(first);     // remember the old value of first

    first = second;
    second = saved;
}
#endif /*__cplusplus*/
/* size of statically declared array */
#define WXSIZEOF(array) (sizeof(array)/sizeof(array[0]))
@@ -1227,6 +1186,45 @@ typedef wxUint32 wxDword;
#endif
#ifdef __cplusplus
// everybody gets the assert and other debug macros
#include "wx/debug.h"
// Destroy the object ptr points to, if any, and reset ptr to NULL so that the
// pointer can't be used (or deleted) a second time by mistake.
template <typename T>
inline void wxDELETE(T*& ptr)
{
    // sizeof() can't be applied to an incomplete type, so this typedef turns
    // an attempt to delete an object of a forward-declared class into a
    // compilation error
    typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED;

    if ( ptr == NULL )
        return;

    delete ptr;
    ptr = NULL;
}
// Array counterpart of wxDELETE(): destroy the array ptr points to, if any,
// using delete [] and reset ptr to NULL.
template <typename T>
inline void wxDELETEA(T*& ptr)
{
    // sizeof() can't be applied to an incomplete type, so this typedef turns
    // an attempt to delete an array of a forward-declared class into a
    // compilation error
    typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED;

    if ( ptr == NULL )
        return;

    delete [] ptr;
    ptr = NULL;
}
// Minimal std::swap() replacement: exchanges the values of first and second
// using one copy construction and two assignments.
template <typename T>
inline void wxSwap(T& first, T& second)
{
    T saved(first);     // remember the old value of first

    first = second;
    second = saved;
}
/* And also define a couple of simple functions to cast pointer to/from it. */
inline wxUIntPtr wxPtrToUInt(const void *p)
{

View File

@@ -898,9 +898,6 @@ public:
wxStringIteratorNode m_node;
};
// Converts the iterator i to its offset from the start of the underlying
// implementation string m_impl.
size_t IterToImplPos(wxString::iterator i) const
{
    const wxStringImpl::const_iterator implIter(i.impl());

    return implIter - m_impl.begin();
}
// Returns an iterator of this string wrapping the implementation iterator
// found PosToImpl(n) units after m_impl.begin().
iterator GetIterForNthChar(size_t n)
{
    return iterator(this, m_impl.begin() + PosToImpl(n));
}
const_iterator GetIterForNthChar(size_t n) const
@@ -975,6 +972,9 @@ public:
// Returns the iterator lying n positions after begin().
const_iterator GetIterForNthChar(size_t n) const
{
    return begin() + n;
}
#endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
// Converts the iterator i to its offset from the start of the underlying
// implementation string m_impl.
size_t IterToImplPos(wxString::iterator i) const
{
    const wxStringImpl::const_iterator implIter(i.impl());

    return implIter - m_impl.begin();
}
#undef WX_STR_ITERATOR_TAG
#undef WX_STR_ITERATOR_IMPL
@@ -1820,12 +1820,11 @@ public:
{
wxSTRING_INVALIDATE_CACHE();
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl = wxStringOperations::EncodeChar(ch);
else
#endif // wxUSE_UNICODE_UTF8
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl = (wxStringCharType)ch;
else
m_impl = wxStringOperations::EncodeChar(ch);
return *this;
}
@@ -2410,20 +2409,18 @@ public:
// append n copies of ch
//
// Returns *this to allow chaining.
wxString& append(size_t n, wxUniChar ch)
{
    // Whether ch fits in one code unit of the internal representation is
    // decided by wxStringOperations::IsSingleCodeUnitCharacter() for every
    // build, so no separate wxUSE_UNICODE_UTF8-only test is needed here.
    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
    {
        // exactly n code units are added: the cached length can be adjusted
        wxSTRING_UPDATE_CACHED_LENGTH(n);
        m_impl.append(n, (wxStringCharType)ch);
    }
    else
    {
        // ch needs an encoded sequence: drop the cached length instead of
        // adjusting it and append the encoded form of the n copies
        wxSTRING_INVALIDATE_CACHED_LENGTH();
        m_impl.append(wxStringOperations::EncodeNChars(n, ch));
    }

    return *this;
}
@@ -2556,12 +2553,10 @@ public:
{
wxSTRING_SET_CACHED_LENGTH(n);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl.assign(wxStringOperations::EncodeNChars(n, ch));
else
#endif
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl.assign(n, (wxStringCharType)ch);
else
m_impl.assign(wxStringOperations::EncodeNChars(n, ch));
return *this;
}
@@ -2671,12 +2666,11 @@ public:
{
wxSTRING_UPDATE_CACHED_LENGTH(n);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
else
#endif
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl.insert(PosToImpl(nPos), n, (wxStringCharType)ch);
else
m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch));
return *this;
}
@@ -2684,16 +2678,14 @@ public:
{
wxSTRING_UPDATE_CACHED_LENGTH(1);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
else
{
size_t pos = IterToImplPos(it);
m_impl.insert(pos, wxStringOperations::EncodeChar(ch));
return iterator(this, m_impl.begin() + pos);
}
else
#endif
return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch));
}
void insert(iterator it, const_iterator first, const_iterator last)
@@ -2716,12 +2708,10 @@ public:
{
wxSTRING_UPDATE_CACHED_LENGTH(n);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
else
#endif
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl.insert(it.impl(), n, (wxStringCharType)ch);
else
m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch));
}
// delete characters from nStart to nStart + nLen
@@ -2800,12 +2790,11 @@ public:
size_t from, len;
PosLenToImpl(nStart, nLen, &from, &len);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));
else
#endif
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl.replace(from, len, nCount, (wxStringCharType)ch);
else
m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch));
return *this;
}
@@ -2921,13 +2910,11 @@ public:
{
wxSTRING_INVALIDATE_CACHE();
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);
else
m_impl.replace(first.impl(), last.impl(),
wxStringOperations::EncodeNChars(n, ch));
else
#endif
m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch);
return *this;
}
@@ -2988,15 +2975,12 @@ public:
// find the first occurrence of character ch after nStart
//
// nStart is translated to an implementation offset with PosToImpl() and the
// result of the implementation-level search is translated back with
// PosFromImpl().
size_t find(wxUniChar ch, size_t nStart = 0) const
{
    // IsSingleCodeUnitCharacter() makes the per-encoding decision, so the
    // same code serves all builds without a wxUSE_UNICODE_UTF8 special case.
    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
    {
        // search for the code unit itself
        return PosFromImpl(m_impl.find((wxStringCharType)ch,
                                       PosToImpl(nStart)));
    }
    else
    {
        // search for the encoded sequence representing ch
        return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch),
                                       PosToImpl(nStart)));
    }
}
// Same search for a character given as wxUniCharRef: it is converted to a
// wxUniChar and passed on to the wxUniChar overload of find().
size_t find(wxUniCharRef ch, size_t nStart = 0) const
{
    return find(wxUniChar(ch), nStart);
}
@@ -3033,13 +3017,11 @@ public:
// as find, but from the end
//
// nStart is translated to an implementation offset with PosToImpl() and the
// result of the implementation-level search is translated back with
// PosFromImpl().
size_t rfind(wxUniChar ch, size_t nStart = npos) const
{
    // Same dispatch as in find(): a character fitting in one code unit is
    // searched for directly, any other one through its encoded sequence.
    if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
    {
        return PosFromImpl(m_impl.rfind((wxStringCharType)ch,
                                        PosToImpl(nStart)));
    }
    else
    {
        return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch),
                                        PosToImpl(nStart)));
    }
}
size_t rfind(wxUniCharRef ch, size_t nStart = npos) const
@@ -3301,12 +3283,11 @@ public:
{
wxSTRING_UPDATE_CACHED_LENGTH(1);
#if wxUSE_UNICODE_UTF8
if ( !ch.IsAscii() )
m_impl += wxStringOperations::EncodeChar(ch);
else
#endif
if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) )
m_impl += (wxStringCharType)ch;
else
m_impl += wxStringOperations::EncodeChar(ch);
return *this;
}
// Appends the character referred to by ch: it is converted to a wxUniChar and
// handed to the operator+=() overload accepting that type.
wxString& operator+=(wxUniCharRef ch)
{
    return *this += wxUniChar(ch);
}

View File

@@ -44,9 +44,36 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar
static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
{ return i1 - i2; }
#if wxUSE_UNICODE_UTF16
// encodes the characters as UTF-16:
struct Utf16CharBuffer
{
    // Storage for three wchar_t units, all zero after construction
    // (presumably up to two UTF-16 units plus a terminating NUL).
    wchar_t data[3];

    Utf16CharBuffer() : data() {}

    // Lets the buffer be passed wherever a const wchar_t* is expected.
    operator const wchar_t*() const { return &data[0]; }
};
static Utf16CharBuffer EncodeChar(const wxUniChar& ch);
static wxWCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
// Returns false for supplementary characters and true for all the other ones.
static bool IsSingleCodeUnitCharacter(const wxUniChar& ch)
{
    if ( ch.IsSupplementary() )
        return false;

    return true;
}
#else
// encodes the character to a form used to represent it in internal
// representation
//
// In this (non UTF-16) branch the encoded form is the character itself cast
// to wxChar; it is returned in a small zero-initialized buffer convertible to
// const wxChar*.
//
// NB: only one EncodeChar() can exist here: two functions differing only in
//     their return type can't be overloaded.
struct SingleCharBuffer
{
    // data is value-initialized, so data[1] remains 0 after EncodeChar()
    // fills in data[0]
    SingleCharBuffer() : data() {}

    wxChar data[2];

    operator const wxChar*() const { return data; }
};

static SingleCharBuffer EncodeChar(const wxUniChar& ch)
{
    SingleCharBuffer buf;
    buf.data[0] = (wxChar)ch;
    return buf;
}

static wxWxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);

// in this branch every character is reported as a single code unit
static bool IsSingleCodeUnitCharacter(const wxUniChar&) { return true; }
#endif
static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
{ return *i; }
@@ -134,6 +161,9 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
return dist;
}
// Only the characters for which IsAscii() is true are reported as fitting in
// a single code unit.
static bool IsSingleCodeUnitCharacter(const wxUniChar& ch)
{
    return ch.IsAscii();
}
// encodes the character as UTF-8:
typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
static Utf8CharBuffer EncodeChar(const wxUniChar& ch)

View File

@@ -83,6 +83,38 @@ public:
return true;
}
// Checks whether the given code point lies in the Basic Multilingual Plane,
// i.e. does not exceed 0xFFFF.
static bool IsBMP(wxUint32 value)
{
    return value <= 0xFFFF;
}
// Checks whether the given code point is a supplementary one, i.e. lies in
// the range 0x10000..0x10FFFF (both ends included).
static bool IsSupplementary(wxUint32 value)
{
    return value >= 0x10000 && value <= 0x10FFFF;
}
// Computes the high surrogate of the pair encoding a supplementary character:
// 0xD800 combined with the bits of (value - 0x10000) above the lowest 10 ones.
//
// Asserts if value is not a supplementary character.
static wxUint16 HighSurrogate(wxUint32 value)
{
    wxASSERT_MSG(IsSupplementary(value), "wxUniChar::HighSurrogate() must be called on a supplementary character");

    const wxUint32 offset = value - 0x10000;

    return static_cast<wxUint16>(0xD800 | (offset >> 10));
}
// Computes the low surrogate of the pair encoding a supplementary character:
// 0xDC00 combined with the lowest 10 bits of (value - 0x10000).
//
// Asserts if value is not a supplementary character.
static wxUint16 LowSurrogate(wxUint32 value)
{
    wxASSERT_MSG(IsSupplementary(value), "wxUniChar::LowSurrogate() must be called on a supplementary character");

    const wxUint32 offset = value - 0x10000;

    return static_cast<wxUint16>(0xDC00 | (offset & 0x03FF));
}
// The functions below apply the static functions of the same names to the
// value of this character.

// Is this character in the BMP?
bool IsBMP() const
{
    return IsBMP(m_value);
}

// Is this character a supplementary one?
bool IsSupplementary() const
{
    return IsSupplementary(m_value);
}

// High surrogate code unit of this (supplementary) character.
wxUint16 HighSurrogate() const
{
    return HighSurrogate(m_value);
}

// Low surrogate code unit of this (supplementary) character.
wxUint16 LowSurrogate() const
{
    return LowSurrogate(m_value);
}
// Conversions to char and wchar_t types: all of those are needed to be
// able to pass wxUniChars to various standard narrow and wide character
// functions
@@ -216,6 +248,11 @@ public:
// All of the functions below simply forward to the function of the same name
// of the wxUniChar returned by UniChar().
bool IsAscii() const
{
    return UniChar().IsAscii();
}

bool GetAsChar(char *c) const
{
    return UniChar().GetAsChar(c);
}

bool IsBMP() const
{
    return UniChar().IsBMP();
}

bool IsSupplementary() const
{
    return UniChar().IsSupplementary();
}

wxUint16 HighSurrogate() const
{
    return UniChar().HighSurrogate();
}

wxUint16 LowSurrogate() const
{
    return UniChar().LowSurrogate();
}
// Assignment operators:
#if wxUSE_UNICODE_UTF8
wxUniCharRef& operator=(const wxUniChar& c);