stream: fix wchar_t/UTF-16 confusion and revise readln and write_array

On non-Windows, wchar_t is UTF-32. This adds preliminary support for UTF-32 and changes readln and write_array members to use charset_encoder for better performance on non-Windows platforms. Signed-off-by: Simon Rozman <simon@rozman.si>
2023-09-22 13:04:41 +02:00 · 2023-09-22 13:04:41 +02:00 · b724ce1333
commit b724ce1333
parent 921d235459
1 changed files with 48 additions and 44 deletions
--- a/include/stdex/stream.hpp
+++ b/include/stdex/stream.hpp
@ -1,4 +1,4 @@
-/*
+/*
 	SPDX-License-Identifier: MIT
 	Copyright © 2023 Amebis
 */
@ -66,7 +66,8 @@ namespace stdex

 		constexpr size_t iterate_count = 0x10;
 		constexpr size_t default_block_size = 0x10000; ///< Amount of space used by copy or reallocation increments
-		constexpr wchar_t utf16_bom = L'\ufeff'; ///< Byte-order-mark written at each UTF-16 file start
+		constexpr char16_t utf16_bom = u'\ufeff'; ///< Byte-order-mark written at each UTF-16 file start
+		constexpr char32_t utf32_bom = U'\ufeff'; ///< Byte-order-mark written at each UTF-32 file start
 		constexpr const char utf8_bom[3] = { '\xef', '\xbb', '\xbf' }; ///> UTF-8 byte-order-mark

 		///
@ -323,15 +324,14 @@ namespace stdex
 			///
 			/// \return Number of read characters
 			///
-			template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>>
-			size_t readln(_Inout_ std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset)
+			template<class T_from, class T_to, class _Traits = std::char_traits<T_to>, class _Ax = std::allocator<T_to>>
+			size_t readln(_Inout_ std::basic_string<T_to, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
 			{
-				if (charset == charset_id::utf16)
+				if (encoder.from_encoding() == encoder.to_encoding())
 					return readln(wstr);
 				std::string str;
 				readln_and_attach(str);
-				wstr.clear();
-				str2wstr(wstr, str, charset);
+				encoder.strcpy(wstr, str);
 				return wstr.size();
 			}

@ -361,14 +361,14 @@ namespace stdex
 			///
 			/// \return Total number of chars in str
 			///
-			template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>>
-			size_t readln_and_attach(_Inout_ std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset)
+			template<class T_from, class T_to, class _Traits = std::char_traits<T_to>, class _Ax = std::allocator<T_to>>
+			size_t readln_and_attach(_Inout_ std::basic_string<T_to, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
 			{
-				if (charset == charset_id::utf16)
+				if (encoder.from_encoding() == encoder.to_encoding())
 					return readln_and_attach(wstr);
 				std::string str;
 				readln_and_attach(str);
-				str2wstr(wstr, str, charset);
+				encoder.strcat(wstr, str);
 				return wstr.size();
 			}

@ -404,20 +404,20 @@ namespace stdex
 			/// Writes array of characters to the stream
 			///
 			/// \param[in] wstr       String to write. Must be zero-terminated.
-			/// \param[in] charset    Charset to convert string to
+			/// \param[in] encoder    Encoder for encoding string
 			///
 			/// \return Number of code units written
 			///
-			size_t write_array(_In_z_ const wchar_t* wstr, _In_ charset_id charset)
+			template <class T_from, class T_to>
+			size_t write_array(_In_z_ const T_from* wstr, _In_ charset_encoder<T_from, T_to>& encoder)
 			{
 				if (!ok()) _Unlikely_
 					return 0;
 				size_t num_chars = stdex::strlen(wstr);
-				if (charset != charset_id::utf16) {
-					std::string str(wstr2str(wstr, num_chars, charset));
-					return write_array(str.data(), sizeof(char), str.size());
-				}
-				return write_array(wstr, sizeof(wchar_t), num_chars);
+				if (encoder.from_encoding() == encoder.to_encoding())
+					return write_array(wstr, sizeof(T_from), num_chars);
+				std::basic_string<T_to> str(encoder.convert(wstr, num_chars));
+				return write_array(str.data(), sizeof(T_to), str.size());
 			}

 			///
@ -425,40 +425,39 @@ namespace stdex
 			///
 			/// \param[in] wstr       String to write
 			/// \param[in] num_chars  String code unit count limit
-			/// \param[in] charset    Charset to convert string to
+			/// \param[in] encoder    Encoder for encoding string
 			///
 			/// \return Number of code units written
 			///
-			size_t write_array(_In_reads_or_z_opt_(num_chars) const wchar_t* wstr, _In_ size_t num_chars, _In_ charset_id charset)
+			template <class T_from, class T_to>
+			size_t write_array(_In_reads_or_z_opt_(num_chars) const T_from* wstr, _In_ size_t num_chars, _In_ charset_encoder<T_from, T_to>& encoder)
 			{
 				if (!ok()) _Unlikely_
 					return 0;
 				num_chars = stdex::strnlen(wstr, num_chars);
-				if (charset != charset_id::utf16) {
-					std::string str(wstr2str(wstr, num_chars, charset));
-					return write_array(str.data(), sizeof(char), str.size());
-				}
-				return write_array(wstr, sizeof(wchar_t), num_chars);
+				if (encoder.from_encoding() == encoder.to_encoding())
+					return write_array(wstr, sizeof(T_from), num_chars);
+				std::basic_string<T_to> str(encoder.convert(wstr, num_chars));
+				return write_array(str.data(), sizeof(T_to), str.size());
 			}

 			///
 			/// Writes array of characters to the stream
 			///
 			/// \param[in] wstr       String to write
-			/// \param[in] charset    Charset to convert string to
+			/// \param[in] encoder    Encoder for encoding string
 			///
 			/// \return Number of code units written
 			///
-			template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>>
-			size_t write_array(_In_ const std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset)
+			template<class T_from, class T_to, class _Traits = std::char_traits<T_from>, class _Ax = std::allocator<T_from>>
+			size_t write_array(_In_ const std::basic_string<T_from, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
 			{
 				if (!ok()) _Unlikely_
 					return 0;
-				if (charset != charset_id::utf16) {
-					std::string str(wstr2str(wstr, charset));
-					return write_array(str.data(), sizeof(char), str.size());
-				}
-				return write_array(wstr.data(), sizeof(wchar_t), wstr.size());
+				if (encoder.from_encoding() == encoder.to_encoding())
+					return write_array(wstr.data(), sizeof(T_from), wstr.size());
+				std::basic_string<T_to> str(encoder.convert(wstr));
+				return write_array(str.data(), sizeof(T_to), str.size());
 			}

 			///
@ -556,11 +555,13 @@ namespace stdex
 			}

 			///
-			/// Writes UTF8 or UTF-16 byte-order-mark
+			/// Writes UTF8, UTF-16 or UTF-32 byte-order-mark
 			///
 			void write_charset(_In_ charset_id charset)
 			{
-				if (charset == charset_id::utf16)
+				if (charset == charset_id::utf32)
+					write_data(utf32_bom);
+				else if (charset == charset_id::utf16)
 					write_data(utf16_bom);
 				else if (charset == charset_id::utf8)
 					write_array(utf8_bom, sizeof(utf8_bom), 1);
@ -854,7 +855,7 @@ namespace stdex
 #endif

 			///
-			/// Attempts to detect textfile charset based on UTF16 or UTF8 BOM.
+			/// Attempts to detect textfile charset based on UTF-32, UTF-16 or UTF-8 BOM.
 			///
 			/// \param[in] default_charset  Fallback charset to return when no BOM detected.
 			///
@ -862,20 +863,23 @@ namespace stdex
 			{
 				if (seek(0) != 0)
 					throw std::runtime_error("failed to seek");
-				wchar_t id_utf16;
-				read_array(&id_utf16, sizeof(wchar_t), 1);
-				if (!ok()) _Unlikely_
-					return default_charset;
-				if (id_utf16 == utf16_bom)
+				char32_t id_utf32;
+				read_array(&id_utf32, sizeof(char32_t), 1);
+				if (ok() && id_utf32 == utf32_bom)
+					return charset_id::utf32;
+
+				if (seek(0) != 0)
+					throw std::runtime_error("failed to seek");
+				char16_t id_utf16;
+				read_array(&id_utf16, sizeof(char16_t), 1);
+				if (ok() && id_utf16 == utf16_bom)
 					return charset_id::utf16;

 				if (seek(0) != 0)
 					throw std::runtime_error("failed to seek");
 				char id_utf8[3] = { 0 };
 				read_array(id_utf8, sizeof(id_utf8), 1);
-				if (!ok()) _Unlikely_
-					return default_charset;
-				if (strncmp(id_utf8, _countof(id_utf8), utf8_bom, _countof(utf8_bom)) == 0)
+				if (ok() && strncmp(id_utf8, _countof(id_utf8), utf8_bom, _countof(utf8_bom)) == 0)
 					return charset_id::utf8;

 				if (seek(0) != 0)