unicode: extend conversion with reusable charset_encoder

Windows takes care of the internal converter state in MultiByteToWideChar
and WideCharToMultiByte and keeps them thread-safe. On other platforms,
iconv requires the user to set up and keep the converter state for
thread-safe conversions. Doing this for every string conversion sounds
time-consuming; therefore, the concept of a string converter (or converter
state) has been extended to Windows too, allowing uniform client code. On
Windows, using charset_encoder has no performance benefit, whereas on Linux
and macOS there should be one. To be measured...

Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
2023-09-14 12:28:05 +02:00
parent 66f8a6c3b7
commit 13703b1747
3 changed files with 360 additions and 62 deletions

View File

@@ -28,6 +28,7 @@ int main(int argc, const char * argv[])
UnitTests::stream::open_close();
UnitTests::unicode::str2wstr();
UnitTests::unicode::wstr2str();
UnitTests::unicode::charset_encoder();
std::cout << "PASS\n";
return 0;
}

View File

@@ -23,6 +23,13 @@ namespace UnitTests
Assert::AreEqual(
L"Th\u00ed\u0161 i\u22c5 a te\u0073\u0304t. 😀😅",
stdex::str2wstr("Thíš i⋅ a tes̄t. 😀😅", stdex::charset_id::utf8).c_str());
string src;
wstring dst;
for (size_t i = 0; i < 2000; i++) {
src += "🐔Test🐮\r\n";
dst += L"🐔Test🐮\r\n";
}
Assert::AreEqual(dst.c_str(), stdex::str2wstr(src, stdex::charset_id::utf8).c_str());
Assert::AreEqual(
L"",
stdex::str2wstr("test", 0, stdex::charset_id::utf8).c_str());
@@ -39,6 +46,13 @@ namespace UnitTests
Assert::AreEqual(
"Th\xc3\xad\xc5\xa1 i\xe2\x8b\x85 a tes\xcc\x84t. \xf0\x9f\x98\x80\xf0\x9f\x98\x85",
stdex::wstr2str(L"Thíš i⋅ a tes̄t. 😀😅", stdex::charset_id::utf8).c_str());
wstring src;
string dst;
for (size_t i = 0; i < 2000; i++) {
src += L"🐔Test🐮\r\n";
dst += "🐔Test🐮\r\n";
}
Assert::AreEqual(dst.c_str(), stdex::wstr2str(src, stdex::charset_id::utf8).c_str());
Assert::AreEqual(
"",
stdex::wstr2str(L"test", 0, stdex::charset_id::utf8).c_str());
@@ -46,5 +60,29 @@ namespace UnitTests
"",
stdex::wstr2str(nullptr, 0, stdex::charset_id::utf8).c_str());
}
// Verifies stdex::charset_encoder<char, char> converting Windows-1250 text to UTF-8
// while reusing one encoder instance for every conversion in the test.
//
// Inputs and expected outputs are both written with explicit byte escapes, so the
// comparison does not depend on how the compiler decodes this source file or on the
// execution character set it applies to narrow string literals.
TEST_METHOD(charset_encoder)
{
    stdex::charset_encoder<char, char> win1250_to_utf8(stdex::charset_id::windows1250, stdex::charset_id::utf8);

    // ASCII-only text is expected to come out unchanged.
    Assert::AreEqual(
        "This is a test.",
        win1250_to_utf8.convert("This is a test.").c_str());

    // Expected value is the UTF-8 encoding of "Thíš i· a teşt.".
    Assert::AreEqual(
        "Th\xc3\xad\xc5\xa1 i\xc2\xb7 a te\xc5\x9ft.",
        win1250_to_utf8.convert("Th\xed\x9a i\xb7 a te\xbat.").c_str());

    // Long input: 1000 repetitions of the same line, converted in a single call.
    // Expected value is the UTF-8 encoding of
    // "V kožuščku zlobnega mizarja stopiclja fant in kliče 0123456789.\r\n".
    // The literals are split before "e" so the preceding hex escape does not swallow it.
    string src, dst;
    for (size_t i = 0; i < 1000; i++) {
        src += "V ko\x9eu\x9a\xe8ku zlobnega mizarja stopiclja fant in kli\xe8" "e 0123456789.\r\n";
        dst += "V ko\xc5\xbeu\xc5\xa1\xc4\x8dku zlobnega mizarja stopiclja fant in kli\xc4\x8d" "e 0123456789.\r\n";
    }
    Assert::AreEqual(dst.c_str(), win1250_to_utf8.convert(src).c_str());

    // A zero-length input must produce an empty string, for a valid pointer...
    Assert::AreEqual(
        "",
        win1250_to_utf8.convert("test", 0).c_str());
    // ...and for a null pointer.
    Assert::AreEqual(
        "",
        win1250_to_utf8.convert(nullptr, 0).c_str());
}
};
}