unicode: extend conversion with reusable charset_encoder

Windows manages the internal converter state inside MultiByteToWideChar and WideCharToMultiByte and keeps those calls thread-safe. On other platforms, iconv requires the caller to set up and keep its own converter state for thread-safe conversions. Doing that setup for every string conversion sounds time-consuming; therefore, the concept of a string converter (or converter state) has been extended to Windows too, allowing uniform client code. On Windows, using charset_encoder has no performance benefit, whereas on Linux and macOS there should be one. To be measured... Signed-off-by: Simon Rozman <simon@rozman.si>
2023-09-14 12:28:05 +02:00
parent 66f8a6c3b7
commit 13703b1747
3 changed files with 360 additions and 62 deletions
--- a/UnitTests/main.cpp
+++ b/UnitTests/main.cpp
@@ -28,6 +28,7 @@ int main(int argc, const char * argv[])
 		UnitTests::stream::open_close();
 		UnitTests::unicode::str2wstr();
 		UnitTests::unicode::wstr2str();
+		UnitTests::unicode::charset_encoder();
 		std::cout << "PASS\n";
 		return 0;
 	}
--- a/UnitTests/unicode.cpp
+++ b/UnitTests/unicode.cpp
@@ -23,6 +23,13 @@ namespace UnitTests
 			Assert::AreEqual(
 				L"Th\u00ed\u0161 i\u22c5 a te\u0073\u0304t. 😀😅",
 				stdex::str2wstr("Thíš i⋅ a tes̄t. 😀😅", stdex::charset_id::utf8).c_str());
+			string src;
+			wstring dst;
+			for (size_t i = 0; i < 2000; i++) {
+				src += "🐔Test🐮\r\n";
+				dst += L"🐔Test🐮\r\n";
+			}
+			Assert::AreEqual(dst.c_str(), stdex::str2wstr(src, stdex::charset_id::utf8).c_str());
 			Assert::AreEqual(
 				L"",
 				stdex::str2wstr("test", 0, stdex::charset_id::utf8).c_str());
@@ -39,6 +46,13 @@ namespace UnitTests
 			Assert::AreEqual(
 				"Th\xc3\xad\xc5\xa1 i\xe2\x8b\x85 a tes\xcc\x84t. \xf0\x9f\x98\x80\xf0\x9f\x98\x85",
 				stdex::wstr2str(L"Thíš i⋅ a tes̄t. 😀😅", stdex::charset_id::utf8).c_str());
+			wstring src;
+			string dst;
+			for (size_t i = 0; i < 2000; i++) {
+				src += L"🐔Test🐮\r\n";
+				dst += "🐔Test🐮\r\n";
+			}
+			Assert::AreEqual(dst.c_str(), stdex::wstr2str(src, stdex::charset_id::utf8).c_str());
 			Assert::AreEqual(
 				"",
 				stdex::wstr2str(L"test", 0, stdex::charset_id::utf8).c_str());
@@ -46,5 +60,29 @@ namespace UnitTests
 				"",
 				stdex::wstr2str(nullptr, 0, stdex::charset_id::utf8).c_str());
 		}
+
+		TEST_METHOD(charset_encoder) // Reuses one charset_encoder instance for several Windows-1250 -> UTF-8 conversions.
+		{
+			stdex::charset_encoder<char, char> win1250_to_utf8(stdex::charset_id::windows1250, stdex::charset_id::utf8); // NOTE(review): argument order presumably (source, destination) - confirm against the stdex header
+
+			Assert::AreEqual( // ASCII-only input is expected to come back unchanged.
+				"This is a test.",
+				win1250_to_utf8.convert("This is a test.").c_str());
+			Assert::AreEqual( // Non-ASCII input: expected text below, input given as escaped single bytes.
+				"Thíš i· a teşt.",
+				win1250_to_utf8.convert("Th\xed\x9a i\xb7 a te\xbat.").c_str());
+			string src, dst;
+			for (size_t i = 0; i < 1000; i++) { // Builds a long input; presumably to exercise conversion beyond one internal buffer - TODO confirm
+				src += "V ko\x9eu\x9a\xe8ku zlobnega mizarja stopiclja fant in kli\xe8" "e 0123456789.\r\n"; // Literal is split so "\xe8" is not parsed together with the following 'e' as one hex escape.
+				dst += "V kožuščku zlobnega mizarja stopiclja fant in kliče 0123456789.\r\n";
+			}
+			Assert::AreEqual(dst.c_str(), win1250_to_utf8.convert(src).c_str()); // The std::string overload of convert() is used here.
+			Assert::AreEqual( // Explicit count of 0 with a non-null pointer is expected to yield an empty string.
+				"",
+				win1250_to_utf8.convert("test", 0).c_str());
+			Assert::AreEqual( // nullptr with count 0 is expected to yield an empty string as well.
+				"",
+				win1250_to_utf8.convert(nullptr, 0).c_str());
+		}
 	};
 }