From 61f0e0ce221325f5b9125fd766bb26c9825537f3 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Mon, 24 Apr 2017 15:56:35 +0900 Subject: [PATCH 1/8] Add wxUSE_UNICODE_UTF16 macro to represent if the internal representation of wxString is UTF-16 or not The value of this macro is equal to (wxUSE_UNICODE_WCHAR && SIZEOF_WCHAR_T == 2). --- include/wx/chartype.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/wx/chartype.h b/include/wx/chartype.h index e8ccb35f05..0e81a3344d 100644 --- a/include/wx/chartype.h +++ b/include/wx/chartype.h @@ -175,6 +175,12 @@ #define wxUSE_UTF8_LOCALE_ONLY 0 #endif +#if wxUSE_UNICODE_WCHAR && SIZEOF_WCHAR_T == 2 + #define wxUSE_UNICODE_UTF16 1 +#else + #define wxUSE_UNICODE_UTF16 0 +#endif + /* define char type used by wxString internal representation: */ #if wxUSE_UNICODE_WCHAR typedef wchar_t wxStringCharType; From 90c990cf8366a5b4d6cf3dcb73eeeefcc71f4884 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Mon, 1 May 2017 01:47:28 +0900 Subject: [PATCH 2/8] Add some functions to handle supplementary characters The added functions are: - wxUniChar::IsBMP() - wxUniChar::IsSupplementary() - wxUniChar::HighSurrogate() - wxUniChar::LowSurrogate() --- include/wx/unichar.h | 37 ++++++++++++++++++++ interface/wx/unichar.h | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/include/wx/unichar.h b/include/wx/unichar.h index a966d5909e..bbd75466bf 100644 --- a/include/wx/unichar.h +++ b/include/wx/unichar.h @@ -83,6 +83,38 @@ public: return true; } + // Returns true if the character is a BMP character: + static bool IsBMP(wxUint32 value) { return value < 0x10000; } + + // Returns true if the character is a supplementary character: + static bool IsSupplementary(wxUint32 value) { return 0x10000 <= value && value < 0x110000; } + + // Returns the high surrogate code unit for the supplementary character + static wxUint16 HighSurrogate(wxUint32 value) + { + wxASSERT_MSG(IsSupplementary(value), 
"wxUniChar::HighSurrogate() must be called on a supplementary character"); + return 0xD800 | ((value - 0x10000) >> 10); + } + + // Returns the low surrogate code unit for the supplementary character + static wxUint16 LowSurrogate(wxUint32 value) + { + wxASSERT_MSG(IsSupplementary(value), "wxUniChar::LowSurrogate() must be called on a supplementary character"); + return 0xDC00 | ((value - 0x10000) & 0x03FF); + } + + // Returns true if the character is a BMP character: + bool IsBMP() const { return IsBMP(m_value); } + + // Returns true if the character is a supplementary character: + bool IsSupplementary() const { return IsSupplementary(m_value); } + + // Returns the high surrogate code unit for the supplementary character + wxUint16 HighSurrogate() const { return HighSurrogate(m_value); } + + // Returns the low surrogate code unit for the supplementary character + wxUint16 LowSurrogate() const { return LowSurrogate(m_value); } + // Conversions to char and wchar_t types: all of those are needed to be // able to pass wxUniChars to verious standard narrow and wide character // functions @@ -216,6 +248,11 @@ public: bool IsAscii() const { return UniChar().IsAscii(); } bool GetAsChar(char *c) const { return UniChar().GetAsChar(c); } + bool IsBMP() const { return UniChar().IsBMP(); } + bool IsSupplementary() const { return UniChar().IsSupplementary(); } + wxUint16 HighSurrogate() const { return UniChar().HighSurrogate(); } + wxUint16 LowSurrogate() const { return UniChar().LowSurrogate(); } + // Assignment operators: #if wxUSE_UNICODE_UTF8 wxUniCharRef& operator=(const wxUniChar& c); diff --git a/interface/wx/unichar.h b/interface/wx/unichar.h index 75012879f2..0536c50dd4 100644 --- a/interface/wx/unichar.h +++ b/interface/wx/unichar.h @@ -83,6 +83,82 @@ public: */ bool GetAsChar(char *c) const; + /** + Returns true if the character is a BMP character (i.e.\ if its value is less than 0x10000). 
+ + @since 3.1.1 + */ + bool IsBMP() const; + + /** + Returns true if the character is a BMP character (i.e.\ if its value is less than 0x10000). + + @param value + The Unicode code point of the character. + + @since 3.1.1 + */ + static bool IsBMP(wxUint32 value); + + /** + Returns true if the character is a supplementary character (i.e.\ between 0x10000 and 0x10FFFF). + + @since 3.1.1 + */ + bool IsSupplementary() const; + + /** + Returns true if the character is a supplementary character (i.e.\ between 0x10000 and 0x10FFFF). + + @param value + The Unicode code point of the character. + + @since 3.1.1 + */ + static bool IsSupplementary(wxUint32 value); + + /** + Returns the high surrogate code unit for the supplementary character. + + @pre IsSupplementary() const + + @since 3.1.1 + */ + wxUint16 HighSurrogate() const; + + /** + Returns the high surrogate code unit for the supplementary character. + + @param value + The Unicode code point of the character. + + @pre IsSupplementary(wxUint32 value) + + @since 3.1.1 + */ + static wxUint16 HighSurrogate(wxUint32 value); + + /** + Returns the low surrogate code unit for the supplementary character. + + @pre IsSupplementary() const + + @since 3.1.1 + */ + wxUint16 LowSurrogate() const; + + /** + Returns the low surrogate code unit for the supplementary character. + + @param value + The Unicode code point of the character. 
+ + @pre IsSupplementary(wxUint32 value) + + @since 3.1.1 + */ + static wxUint16 LowSurrogate(wxUint32 value); + //@{ /** Conversions to char and wchar_t types: all of those are needed to be From 8a29c5c09fbcf6fa89be9a1fa807861909b7fdb5 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Mon, 1 May 2017 14:44:45 +0900 Subject: [PATCH 3/8] Use the added wxUniChar functions in the existing code --- src/common/strconv.cpp | 16 ++++++++-------- src/common/ustring.cpp | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 1e40478ee8..83c08ec547 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -80,27 +80,27 @@ static bool NotAllNULs(const char *p, size_t n) static size_t encode_utf16(wxUint32 input, wxUint16 *output) { - if (input <= 0xffff) + if (wxUniChar::IsBMP(input)) { if (output) *output = (wxUint16) input; return 1; } - else if (input >= 0x110000) - { - return wxCONV_FAILED; - } - else + else if (wxUniChar::IsSupplementary(input)) { if (output) { - *output++ = (wxUint16) ((input >> 10) + 0xd7c0); - *output = (wxUint16) ((input & 0x3ff) + 0xdc00); + *output++ = wxUniChar::HighSurrogate(input); + *output = wxUniChar::LowSurrogate(input); } return 2; } + else + { + return wxCONV_FAILED; + } } static size_t decode_utf16(const wxUint16* input, wxUint32& output) diff --git a/src/common/ustring.cpp b/src/common/ustring.cpp index ab70ce5ae2..87d5158234 100644 --- a/src/common/ustring.cpp +++ b/src/common/ustring.cpp @@ -502,7 +502,7 @@ wxScopedU16CharBuffer wxUString::utf16_str() const // TODO: error range checks - if (code < 0x10000) + if (wxUniChar::IsBMP(code)) utf16_length++; else utf16_length += 2; @@ -520,15 +520,15 @@ wxScopedU16CharBuffer wxUString::utf16_str() const // TODO: error range checks - if (code < 0x10000) + if (wxUniChar::IsBMP(code)) { out[0] = code; out++; } else { - out[0] = (code - 0x10000) / 0x400 + 0xd800; - out[1] = (code - 0x10000) % 0x400 + 0xdc00; + 
out[0] = wxUniChar::HighSurrogate(code); + out[1] = wxUniChar::LowSurrogate(code); out += 2; } } From ad4785707247ddb73c824c96ff2b645789d61bcc Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Fri, 21 Apr 2017 04:23:52 +0900 Subject: [PATCH 4/8] Add a test case for constructing wxString with supplementary wxUniChar values See #11827 --- tests/strings/strings.cpp | 170 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/tests/strings/strings.cpp b/tests/strings/strings.cpp index 3562510b51..1cf282b0eb 100644 --- a/tests/strings/strings.cpp +++ b/tests/strings/strings.cpp @@ -63,6 +63,7 @@ private: CPPUNIT_TEST( IndexedAccess ); CPPUNIT_TEST( BeforeAndAfter ); CPPUNIT_TEST( ScopedBuffers ); + CPPUNIT_TEST( SupplementaryUniChar ); CPPUNIT_TEST_SUITE_END(); void String(); @@ -98,6 +99,7 @@ private: void IndexedAccess(); void BeforeAndAfter(); void ScopedBuffers(); + void SupplementaryUniChar(); wxDECLARE_NO_COPY_CLASS(StringTestCase); }; @@ -1142,3 +1144,171 @@ void StringTestCase::ScopedBuffers() buf5.extend(len); CPPUNIT_ASSERT_EQUAL('\0', buf5.data()[len]); } + +void StringTestCase::SupplementaryUniChar() +{ +#if wxUSE_UNICODE + // Test wxString(wxUniChar ch, size_t nRepeat = 1), + // which is implemented upon assign(size_t n, wxUniChar ch). + { + wxString s(wxUniChar(0x12345)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(2, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD808, s[0].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDF45, s[1].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(1, s.length()); + CPPUNIT_ASSERT_EQUAL(0x12345, s[0].GetValue()); +#endif + } + + // Test operator=(wxUniChar ch). + { + wxString s; + s = wxUniChar(0x23456); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(2, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD84D, s[0].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDC56, s[1].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(1, s.length()); + CPPUNIT_ASSERT_EQUAL(0x23456, s[0].GetValue()); +#endif + } + + // Test operator+=(wxUniChar ch). 
+ { + wxString s = "A"; + s += wxUniChar(0x34567); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(3, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD891, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDD67, s[2].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(2, s.length()); + CPPUNIT_ASSERT_EQUAL(0x34567, s[1].GetValue()); +#endif + } + + // Test operator<<(wxUniChar ch), + // which is implemented upon append(size_t n, wxUniChar ch). + { + wxString s = "A"; + s << wxUniChar(0x45678); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(3, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD8D5, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDE78, s[2].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(2, s.length()); + CPPUNIT_ASSERT_EQUAL(0x45678, s[1].GetValue()); +#endif + } + + // Test insert(size_t nPos, size_t n, wxUniChar ch). + { + wxString s = L"\x3042\x208\x3059"; + s.insert(1, 2, wxUniChar(0x12345)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(7, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD808, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDF45, s[2].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xD808, s[3].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDF45, s[4].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(5, s.length()); + CPPUNIT_ASSERT_EQUAL(0x12345, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0x12345, s[2].GetValue()); +#endif + } + + // Test insert(iterator it, wxUniChar ch). + { + wxString s = L"\x3042\x208\x3059"; + s.insert(s.begin() + 1, wxUniChar(0x23456)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(5, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD84D, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDC56, s[2].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(4, s.length()); + CPPUNIT_ASSERT_EQUAL(0x23456, s[1].GetValue()); +#endif + } + + // Test insert(iterator it, size_type n, wxUniChar ch). 
+ { + wxString s = L"\x3042\x208\x3059"; + s.insert(s.begin() + 1, 2, wxUniChar(0x34567)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(7, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD891, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDD67, s[2].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(5, s.length()); + CPPUNIT_ASSERT_EQUAL(0x34567, s[1].GetValue()); +#endif + } + + // Test replace(size_t nStart, size_t nLen, size_t nCount, wxUniChar ch). + { + wxString s = L"\x3042\x208\x3059"; + s.replace(1, 2, 2, wxUniChar(0x45678)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(5, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD8D5, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDE78, s[2].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xD8D5, s[3].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDE78, s[4].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(3, s.length()); + CPPUNIT_ASSERT_EQUAL(0x45678, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0x45678, s[2].GetValue()); +#endif + } + + // Test replace(iterator first, iterator last, size_type n, wxUniChar ch). + { + wxString s = L"\x3042\x208\x3059"; + s.replace(s.begin() + 1, s.end(), 2, wxUniChar(0x34567)); +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(5, s.length()); + CPPUNIT_ASSERT_EQUAL(0xD891, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDD67, s[2].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xD891, s[3].GetValue()); + CPPUNIT_ASSERT_EQUAL(0xDD67, s[4].GetValue()); +#else + CPPUNIT_ASSERT_EQUAL(3, s.length()); + CPPUNIT_ASSERT_EQUAL(0x34567, s[1].GetValue()); + CPPUNIT_ASSERT_EQUAL(0x34567, s[2].GetValue()); +#endif + } + + // Test find(wxUniChar ch, size_t nStart = 0) + // and rfind(wxUniChar ch, size_t nStart = npos). 
+ { + wxString s = L"\x308\x2063"; + s << wxUniChar(0x12345); + s << "x"; + s += wxUniChar(0x12345); + s += "y"; +#if wxUSE_UNICODE_UTF16 + CPPUNIT_ASSERT_EQUAL(8, s.length()); + CPPUNIT_ASSERT_EQUAL(2, s.find(wxUniChar(0x12345))); + CPPUNIT_ASSERT_EQUAL(5, s.find(wxUniChar(0x12345), 3)); + CPPUNIT_ASSERT_EQUAL(5, s.rfind(wxUniChar(0x12345))); + CPPUNIT_ASSERT_EQUAL(2, s.rfind(wxUniChar(0x12345), 4)); +#else + CPPUNIT_ASSERT_EQUAL(6, s.length()); + CPPUNIT_ASSERT_EQUAL(2, s.find(wxUniChar(0x12345))); + CPPUNIT_ASSERT_EQUAL(4, s.find(wxUniChar(0x12345), 3)); + CPPUNIT_ASSERT_EQUAL(4, s.rfind(wxUniChar(0x12345))); + CPPUNIT_ASSERT_EQUAL(2, s.rfind(wxUniChar(0x12345), 3)); +#endif + } + + /* Not tested here: + find_first_of, find_last_of, find_first_not_of, find_last_not_of + */ +#endif +} From 58d940690abbb7f59fd6b3e189328afed7f53821 Mon Sep 17 00:00:00 2001 From: ARATA Mizuki Date: Fri, 21 Apr 2017 04:32:32 +0900 Subject: [PATCH 5/8] Better handling of supplementary wxUniChar values in some of wxString methods On MSW, the Unicode code point is now properly encoded as UTF-16 when assigned or appended to a wxString. 
Closes #11827 --- include/wx/string.h | 109 ++++++++++++++++----------------------- include/wx/stringops.h | 34 +++++++++++- src/common/stringops.cpp | 62 ++++++++++++++++++++++ 3 files changed, 139 insertions(+), 66 deletions(-) diff --git a/include/wx/string.h b/include/wx/string.h index 3441a7f7d6..b80998cc44 100644 --- a/include/wx/string.h +++ b/include/wx/string.h @@ -898,9 +898,6 @@ public: wxStringIteratorNode m_node; }; - size_t IterToImplPos(wxString::iterator i) const - { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); } - iterator GetIterForNthChar(size_t n) { return iterator(this, m_impl.begin() + PosToImpl(n)); } const_iterator GetIterForNthChar(size_t n) const @@ -975,6 +972,9 @@ public: const_iterator GetIterForNthChar(size_t n) const { return begin() + n; } #endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8 + size_t IterToImplPos(wxString::iterator i) const + { return wxStringImpl::const_iterator(i.impl()) - m_impl.begin(); } + #undef WX_STR_ITERATOR_TAG #undef WX_STR_ITERATOR_IMPL @@ -1820,12 +1820,11 @@ public: { wxSTRING_INVALIDATE_CACHE(); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl = wxStringOperations::EncodeChar(ch); - else -#endif // wxUSE_UNICODE_UTF8 + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl = (wxStringCharType)ch; + else + m_impl = wxStringOperations::EncodeChar(ch); + return *this; } @@ -2410,20 +2409,18 @@ public: // append n copies of ch wxString& append(size_t n, wxUniChar ch) { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - { - wxSTRING_INVALIDATE_CACHED_LENGTH(); - - m_impl.append(wxStringOperations::EncodeNChars(n, ch)); - } - else // ASCII -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) { wxSTRING_UPDATE_CACHED_LENGTH(n); m_impl.append(n, (wxStringCharType)ch); } + else + { + wxSTRING_INVALIDATE_CACHED_LENGTH(); + + m_impl.append(wxStringOperations::EncodeNChars(n, ch)); + } return *this; } @@ -2556,12 +2553,10 @@ public: { wxSTRING_SET_CACHED_LENGTH(n); -#if 
wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.assign(wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.assign(n, (wxStringCharType)ch); + else + m_impl.assign(wxStringOperations::EncodeNChars(n, ch)); return *this; } @@ -2671,12 +2666,11 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(n); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.insert(PosToImpl(nPos), n, (wxStringCharType)ch); + else + m_impl.insert(PosToImpl(nPos), wxStringOperations::EncodeNChars(n, ch)); + return *this; } @@ -2684,16 +2678,14 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(1); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch)); + else { size_t pos = IterToImplPos(it); m_impl.insert(pos, wxStringOperations::EncodeChar(ch)); return iterator(this, m_impl.begin() + pos); } - else -#endif - return iterator(this, m_impl.insert(it.impl(), (wxStringCharType)ch)); } void insert(iterator it, const_iterator first, const_iterator last) @@ -2716,12 +2708,10 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(n); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch)); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.insert(it.impl(), n, (wxStringCharType)ch); + else + m_impl.insert(IterToImplPos(it), wxStringOperations::EncodeNChars(n, ch)); } // delete characters from nStart to nStart + nLen @@ -2800,12 +2790,11 @@ public: size_t from, len; PosLenToImpl(nStart, nLen, &from, &len); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch)); - else -#endif + + if ( 
wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl.replace(from, len, nCount, (wxStringCharType)ch); + else + m_impl.replace(from, len, wxStringOperations::EncodeNChars(nCount, ch)); return *this; } @@ -2921,13 +2910,11 @@ public: { wxSTRING_INVALIDATE_CACHE(); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch); + else m_impl.replace(first.impl(), last.impl(), wxStringOperations::EncodeNChars(n, ch)); - else -#endif - m_impl.replace(first.impl(), last.impl(), n, (wxStringCharType)ch); return *this; } @@ -2988,15 +2975,12 @@ public: // find the first occurrence of character ch after nStart size_t find(wxUniChar ch, size_t nStart = 0) const { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch), - PosToImpl(nStart))); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) return PosFromImpl(m_impl.find((wxStringCharType)ch, PosToImpl(nStart))); - + else + return PosFromImpl(m_impl.find(wxStringOperations::EncodeChar(ch), + PosToImpl(nStart))); } size_t find(wxUniCharRef ch, size_t nStart = 0) const { return find(wxUniChar(ch), nStart); } @@ -3033,13 +3017,11 @@ public: // as find, but from the end size_t rfind(wxUniChar ch, size_t nStart = npos) const { -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch), + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) + return PosFromImpl(m_impl.rfind((wxStringCharType)ch, PosToImpl(nStart))); else -#endif - return PosFromImpl(m_impl.rfind((wxStringCharType)ch, + return PosFromImpl(m_impl.rfind(wxStringOperations::EncodeChar(ch), PosToImpl(nStart))); } size_t rfind(wxUniCharRef ch, size_t nStart = npos) const @@ -3301,12 +3283,11 @@ public: { wxSTRING_UPDATE_CACHED_LENGTH(1); -#if wxUSE_UNICODE_UTF8 - if ( !ch.IsAscii() ) - m_impl += 
wxStringOperations::EncodeChar(ch); - else -#endif + if ( wxStringOperations::IsSingleCodeUnitCharacter(ch) ) m_impl += (wxStringCharType)ch; + else + m_impl += wxStringOperations::EncodeChar(ch); + return *this; } wxString& operator+=(wxUniCharRef ch) { return *this += wxUniChar(ch); } diff --git a/include/wx/stringops.h b/include/wx/stringops.h index 21c6121787..fd6695116f 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -44,9 +44,36 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2) { return i1 - i2; } +#if wxUSE_UNICODE_UTF16 + // encodes th characters as UTF-16: + struct Utf16CharBuffer + { + Utf16CharBuffer() : data() {} + wchar_t data[3]; + operator const wchar_t*() const { return data; } + }; + static Utf16CharBuffer EncodeChar(const wxUniChar& ch); + static wxWCharBuffer EncodeNChars(size_t n, const wxUniChar& ch); + static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) + { return !ch.IsSupplementary(); } +#else // encodes the character to a form used to represent it in internal - // representation (returns a string in UTF8 version) - static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; } + // representation + struct SingleCharBuffer + { + SingleCharBuffer() : data() {} + wxChar data[2]; + operator const wxChar*() const { return data; } + }; + static SingleCharBuffer EncodeChar(const wxUniChar& ch) + { + SingleCharBuffer buf; + buf.data[0] = (wxChar)ch; + return buf; + } + static wxWxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch); + static bool IsSingleCodeUnitCharacter(const wxUniChar&) { return true; } +#endif static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i) { return *i; } @@ -134,6 +161,9 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 return dist; } + static bool IsSingleCodeUnitCharacter(const wxUniChar& ch) + { return ch.IsAscii(); } + // encodes the character as UTF-8: typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer; 
static Utf8CharBuffer EncodeChar(const wxUniChar& ch) diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index 36ff4045a4..2d8fcaee3e 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -27,6 +27,68 @@ // implementation // =========================================================================== +#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE + +#if wxUSE_UNICODE_UTF16 + +wxStringOperationsWchar::Utf16CharBuffer wxStringOperationsWchar::EncodeChar(const wxUniChar& ch) +{ + Utf16CharBuffer buf; + if ( ch.IsSupplementary() ) + { + buf.data[0] = (wchar_t)ch.HighSurrogate(); + buf.data[1] = (wchar_t)ch.LowSurrogate(); + } + else + { + // Assume ch is a BMP character + buf.data[0] = (wchar_t)ch; + } + return buf; +} + +wxWCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) +{ + if ( ch.IsSupplementary() ) + { + wxWCharBuffer buf(n * 2); + wchar_t s[2] = { + (wchar_t)ch.HighSurrogate(), + (wchar_t)ch.LowSurrogate(), + }; + wchar_t *ptr = buf.data(); + for (size_t i = 0; i < n; i++, ptr += 2) + { + wmemcpy(ptr, s, 2); + } + return buf; + } + else + { + // Assume ch is a BMP character + wxWCharBuffer buf(n); + wmemset(buf.data(), (wchar_t)ch, n); + return buf; + } +} + +#else + +wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& ch) +{ + wxWxCharBuffer buf(n); +#if wxUSE_UNICODE_WCHAR + wmemset(buf.data(), (wchar_t)ch, n); +#else // ANSI + memset(buf.data(), (unsigned char)ch, n); +#endif + return buf; +} + +#endif // wxUSE_UNICODE_UTF16 + +#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE + #if wxUSE_UNICODE_UTF8 // --------------------------------------------------------------------------- From 24f3ff3b787d17262e8ddf09fb5b1a3f41e4c7df Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Wed, 21 Jun 2017 19:07:13 +0200 Subject: [PATCH 6/8] No changes, just fix a typo in a recently added comment --- include/wx/stringops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/wx/stringops.h b/include/wx/stringops.h index fd6695116f..740603e8f7 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -45,7 +45,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsWchar { return i1 - i2; } #if wxUSE_UNICODE_UTF16 - // encodes th characters as UTF-16: + // encodes the characters as UTF-16: struct Utf16CharBuffer { Utf16CharBuffer() : data() {} From a86d0f8d6564b6346faf2afe2ea2569ffdc062c0 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Wed, 21 Jun 2017 19:07:43 +0200 Subject: [PATCH 7/8] Move wx/debug.h inclusion after SIZEOF_WCHAR_T in wx/defs.h This is required now because wx/debug.h includes wx/chartype.h which uses SIZEOF_WCHAR_T to define wxUSE_UNICODE_UTF16. --- include/wx/defs.h | 80 +++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/include/wx/defs.h b/include/wx/defs.h index f21937cd23..b63bd7491c 100644 --- a/include/wx/defs.h +++ b/include/wx/defs.h @@ -672,47 +672,6 @@ typedef short int WXTYPE; /* breaks C++ code) */ #include <stddef.h> -#ifdef __cplusplus - -// everybody gets the assert and other debug macros -#include "wx/debug.h" - - // delete pointer if it is not NULL and NULL it afterwards - template <typename T> - inline void wxDELETE(T*& ptr) - { - typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED; - - if ( ptr != NULL ) - { - delete ptr; - ptr = NULL; - } - } - - // delete an array and NULL it (see comments above) - template <typename T> - inline void wxDELETEA(T*& ptr) - { - typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED; - - if ( ptr != NULL ) - { - delete [] ptr; - ptr = NULL; - } - } - - // trivial implementation of std::swap() for primitive types - template <typename T> - inline void wxSwap(T& first, T& second) - { - T tmp(first); - first = second; - second = tmp; - } -#endif /*__cplusplus*/ - /* size of statically declared array */ #define WXSIZEOF(array) (sizeof(array)/sizeof(array[0])) @@ -1227,6 +1186,45 @@ typedef wxUint32 wxDword; #endif #ifdef 
__cplusplus + +// everybody gets the assert and other debug macros +#include "wx/debug.h" + + // delete pointer if it is not NULL and NULL it afterwards + template <typename T> + inline void wxDELETE(T*& ptr) + { + typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED; + + if ( ptr != NULL ) + { + delete ptr; + ptr = NULL; + } + } + + // delete an array and NULL it (see comments above) + template <typename T> + inline void wxDELETEA(T*& ptr) + { + typedef char TypeIsCompleteCheck[sizeof(T)] WX_ATTRIBUTE_UNUSED; + + if ( ptr != NULL ) + { + delete [] ptr; + ptr = NULL; + } + } + + // trivial implementation of std::swap() for primitive types + template <typename T> + inline void wxSwap(T& first, T& second) + { + T tmp(first); + first = second; + second = tmp; + } + /* And also define a couple of simple functions to cast pointer to/from it. */ inline wxUIntPtr wxPtrToUInt(const void *p) { From 8311715bdf2d0e3446c6875efec945ff1b87fb6b Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Wed, 21 Jun 2017 19:08:27 +0200 Subject: [PATCH 8/8] Ensure that SIZEOF_WCHAR_T is defined in wx/chartype.h This should always be the case now, but wasn't when not using configure (e.g. in MSVC builds) before, so verify this explicitly to ensure that we don't just silently define wxUSE_UNICODE_UTF16 wrongly, as it happened before the fix in the previous commit. --- include/wx/chartype.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/wx/chartype.h b/include/wx/chartype.h index 0e81a3344d..3ff445f602 100644 --- a/include/wx/chartype.h +++ b/include/wx/chartype.h @@ -175,6 +175,10 @@ #define wxUSE_UTF8_LOCALE_ONLY 0 #endif +#ifndef SIZEOF_WCHAR_T + #error "SIZEOF_WCHAR_T must be defined before including this file in wx/defs.h" +#endif + #if wxUSE_UNICODE_WCHAR && SIZEOF_WCHAR_T == 2 #define wxUSE_UNICODE_UTF16 1 #else