From 2bfe310d47cfd1796b17dd7da815b82747297d0b Mon Sep 17 00:00:00 2001 From: Simon Rozman Date: Thu, 5 Jun 2025 12:06:07 +0200 Subject: [PATCH] unicode: inject invalid character on encode failure ...rather than throw. This mimics Windows behaviour. Signed-off-by: Simon Rozman --- UnitTests/unicode.cpp | 43 +++++++++++++++++++++++---------------- include/stdex/unicode.hpp | 16 +++++++++++---- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/UnitTests/unicode.cpp b/UnitTests/unicode.cpp index 67ecf1244..31b25b695 100644 --- a/UnitTests/unicode.cpp +++ b/UnitTests/unicode.cpp @@ -64,26 +64,33 @@ namespace UnitTests void unicode::charset_encoder() { - stdex::charset_encoder<char, char> win1250_to_utf8(stdex::charset_id::windows1250, stdex::charset_id::utf8); + { + stdex::charset_encoder<char, char> win1250_to_utf8(stdex::charset_id::windows1250, stdex::charset_id::utf8); - Assert::AreEqual( - "This is a test.", - win1250_to_utf8.convert("This is a test.").c_str()); - Assert::AreEqual( - "Thíš i· a teşt.", - win1250_to_utf8.convert("Th\xed\x9a i\xb7 a te\xbat.").c_str()); - std::string src, dst; - for (size_t i = 0; i < 1000; i++) { - src += "V ko\x9eu\x9a\xe8ku zlobnega mizarja stopiclja fant in kli\xe8" "e 0123456789.\r\n"; - dst += "V kožuščku zlobnega mizarja stopiclja fant in kliče 0123456789.\r\n"; + Assert::AreEqual( + "This is a test.", + win1250_to_utf8.convert("This is a test.").c_str()); + Assert::AreEqual( + "Thíš i· a teşt.", + win1250_to_utf8.convert("Th\xed\x9a i\xb7 a te\xbat.").c_str()); + std::string src, dst; + for (size_t i = 0; i < 1000; i++) { + src += "V ko\x9eu\x9a\xe8ku zlobnega mizarja stopiclja fant in kli\xe8" "e 0123456789.\r\n"; + dst += "V kožuščku zlobnega mizarja stopiclja fant in kliče 0123456789.\r\n"; + } + Assert::AreEqual(dst.c_str(), win1250_to_utf8.convert(src).c_str()); + Assert::AreEqual( + "", + win1250_to_utf8.convert("test", 0).c_str()); + Assert::AreEqual( + "", + win1250_to_utf8.convert(nullptr, 0).c_str()); + } + + { + 
stdex::charset_encoder<char16_t, char> encode(stdex::charset_id::utf16, stdex::charset_id::ascii, '?'); + Assert::AreEqual("Te?t.", encode.convert(u"Tešt.").c_str()); } - Assert::AreEqual(dst.c_str(), win1250_to_utf8.convert(src).c_str()); - Assert::AreEqual( - "", - win1250_to_utf8.convert("test", 0).c_str()); - Assert::AreEqual( - "", - win1250_to_utf8.convert(nullptr, 0).c_str()); } void unicode::normalize() diff --git a/include/stdex/unicode.hpp b/include/stdex/unicode.hpp index f61aaccb5..7b22f3dcf 100644 --- a/include/stdex/unicode.hpp +++ b/include/stdex/unicode.hpp @@ -1,4 +1,4 @@ -/* +/* SPDX-License-Identifier: MIT Copyright © 2023-2025 Amebis */ @@ -153,11 +153,13 @@ namespace stdex { protected: charset_id m_from, m_to; + T_to m_invalid; public: - charset_encoder(_In_ charset_id from, _In_ charset_id to) : + charset_encoder(_In_ charset_id from, _In_ charset_id to, _In_ T_to invalid = '?') : m_from(from), - m_to(to) + m_to(to), + m_invalid(invalid) { #ifdef _WIN32 m_from_wincp = to_encoding(from); @@ -198,7 +200,6 @@ namespace stdex #ifdef _WIN32 constexpr DWORD dwFlagsWCMB = 0; - constexpr LPCCH lpDefaultChar = NULL; stdex_assert(src); if (m_from_wincp == m_to_wincp) _Unlikely_{ @@ -235,6 +236,7 @@ namespace stdex #pragma warning(suppress: 4127) // Can't use precompiler #if on template arguments, using "if" makes MSVC warnings. if constexpr (sizeof(T_from) == sizeof(wchar_t) && sizeof(T_to) == sizeof(char)) { stdex_assert(count_src < INT_MAX || count_src == SIZE_MAX); + LPCCH lpDefaultChar = m_to_wincp == charset_id::utf8 || m_to_wincp == charset_id::utf7 ? NULL : &m_invalid; // Try to convert to stack buffer first. CHAR szStackBuffer[1024 / sizeof(CHAR)]; @@ -261,6 +263,7 @@ namespace stdex #pragma warning(suppress: 4127) // Can't use precompiler #if on template arguments, using "if" makes MSVC warnings. 
if constexpr (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(char)) { stdex_assert(count_src < INT_MAX || count_src == SIZE_MAX); + LPCCH lpDefaultChar = m_to_wincp == charset_id::utf8 || m_to_wincp == charset_id::utf7 ? NULL : &m_invalid; // Try to convert to stack buffer first. DWORD dwResult; @@ -325,6 +328,11 @@ namespace stdex break; if (errno == E2BIG) continue; + if (errno == EILSEQ) { + dst.append(1, m_invalid); + ++src; src_size -= sizeof(T_from); + continue; + } throw std::system_error(errno, std::system_category(), "iconv failed"); } #endif