stream: fix wchar_t/UTF-16 confusion and revise readln and write_array

On non-Windows, wchar_t is UTF-32.  This adds preliminary support for UTF-32
and changes readln and write_array members to use charset_encoder for better
performance on non-Windows platforms.

Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
Simon Rozman 2023-09-22 13:04:41 +02:00
parent 921d235459
commit b724ce1333

View File

@ -1,4 +1,4 @@
/* /*
SPDX-License-Identifier: MIT SPDX-License-Identifier: MIT
Copyright © 2023 Amebis Copyright © 2023 Amebis
*/ */
@ -66,7 +66,8 @@ namespace stdex
constexpr size_t iterate_count = 0x10; constexpr size_t iterate_count = 0x10;
constexpr size_t default_block_size = 0x10000; ///< Amount of space used by copy or reallocation increments constexpr size_t default_block_size = 0x10000; ///< Amount of space used by copy or reallocation increments
constexpr wchar_t utf16_bom = L'\ufeff'; ///< Byte-order-mark written at each UTF-16 file start constexpr char16_t utf16_bom = u'\ufeff'; ///< Byte-order-mark written at each UTF-16 file start
constexpr char32_t utf32_bom = U'\ufeff'; ///< Byte-order-mark written at each UTF-32 file start
constexpr const char utf8_bom[3] = { '\xef', '\xbb', '\xbf' }; ///< UTF-8 byte-order-mark constexpr const char utf8_bom[3] = { '\xef', '\xbb', '\xbf' }; ///< UTF-8 byte-order-mark
/// ///
@ -323,15 +324,14 @@ namespace stdex
/// ///
/// \return Number of read characters /// \return Number of read characters
/// ///
template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>> template<class T_from, class T_to, class _Traits = std::char_traits<T_to>, class _Ax = std::allocator<T_to>>
size_t readln(_Inout_ std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset) size_t readln(_Inout_ std::basic_string<T_to, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
{ {
if (charset == charset_id::utf16) if (encoder.from_encoding() == encoder.to_encoding())
return readln(wstr); return readln(wstr);
std::string str; std::string str;
readln_and_attach(str); readln_and_attach(str);
wstr.clear(); encoder.strcpy(wstr, str);
str2wstr(wstr, str, charset);
return wstr.size(); return wstr.size();
} }
@ -361,14 +361,14 @@ namespace stdex
/// ///
/// \return Total number of chars in str /// \return Total number of chars in str
/// ///
template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>> template<class T_from, class T_to, class _Traits = std::char_traits<T_to>, class _Ax = std::allocator<T_to>>
size_t readln_and_attach(_Inout_ std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset) size_t readln_and_attach(_Inout_ std::basic_string<T_to, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
{ {
if (charset == charset_id::utf16) if (encoder.from_encoding() == encoder.to_encoding())
return readln_and_attach(wstr); return readln_and_attach(wstr);
std::string str; std::string str;
readln_and_attach(str); readln_and_attach(str);
str2wstr(wstr, str, charset); encoder.strcat(wstr, str);
return wstr.size(); return wstr.size();
} }
@ -404,20 +404,20 @@ namespace stdex
/// Writes array of characters to the stream /// Writes array of characters to the stream
/// ///
/// \param[in] wstr String to write. Must be zero-terminated. /// \param[in] wstr String to write. Must be zero-terminated.
/// \param[in] charset Charset to convert string to /// \param[in] encoder Encoder for encoding string
/// ///
/// \return Number of code units written /// \return Number of code units written
/// ///
size_t write_array(_In_z_ const wchar_t* wstr, _In_ charset_id charset) template <class T_from, class T_to>
size_t write_array(_In_z_ const T_from* wstr, _In_ charset_encoder<T_from, T_to>& encoder)
{ {
if (!ok()) _Unlikely_ if (!ok()) _Unlikely_
return 0; return 0;
size_t num_chars = stdex::strlen(wstr); size_t num_chars = stdex::strlen(wstr);
if (charset != charset_id::utf16) { if (encoder.from_encoding() == encoder.to_encoding())
std::string str(wstr2str(wstr, num_chars, charset)); return write_array(wstr, sizeof(T_from), num_chars);
return write_array(str.data(), sizeof(char), str.size()); std::basic_string<T_to> str(encoder.convert(wstr, num_chars));
} return write_array(str.data(), sizeof(T_to), str.size());
return write_array(wstr, sizeof(wchar_t), num_chars);
} }
/// ///
@ -425,40 +425,39 @@ namespace stdex
/// ///
/// \param[in] wstr String to write /// \param[in] wstr String to write
/// \param[in] num_chars String code unit count limit /// \param[in] num_chars String code unit count limit
/// \param[in] charset Charset to convert string to /// \param[in] encoder Encoder for encoding string
/// ///
/// \return Number of code units written /// \return Number of code units written
/// ///
size_t write_array(_In_reads_or_z_opt_(num_chars) const wchar_t* wstr, _In_ size_t num_chars, _In_ charset_id charset) template <class T_from, class T_to>
size_t write_array(_In_reads_or_z_opt_(num_chars) const T_from* wstr, _In_ size_t num_chars, _In_ charset_encoder<T_from, T_to>& encoder)
{ {
if (!ok()) _Unlikely_ if (!ok()) _Unlikely_
return 0; return 0;
num_chars = stdex::strnlen(wstr, num_chars); num_chars = stdex::strnlen(wstr, num_chars);
if (charset != charset_id::utf16) { if (encoder.from_encoding() == encoder.to_encoding())
std::string str(wstr2str(wstr, num_chars, charset)); return write_array(wstr, sizeof(T_from), num_chars);
return write_array(str.data(), sizeof(char), str.size()); std::basic_string<T_to> str(encoder.convert(wstr, num_chars));
} return write_array(str.data(), sizeof(T_to), str.size());
return write_array(wstr, sizeof(wchar_t), num_chars);
} }
/// ///
/// Writes array of characters to the stream /// Writes array of characters to the stream
/// ///
/// \param[in] wstr String to write /// \param[in] wstr String to write
/// \param[in] charset Charset to convert string to /// \param[in] encoder Encoder for encoding string
/// ///
/// \return Number of code units written /// \return Number of code units written
/// ///
template<class _Traits = std::char_traits<wchar_t>, class _Ax = std::allocator<wchar_t>> template<class T_from, class T_to, class _Traits = std::char_traits<T_from>, class _Ax = std::allocator<T_from>>
size_t write_array(_In_ const std::basic_string<wchar_t, _Traits, _Ax>& wstr, _In_ charset_id charset) size_t write_array(_In_ const std::basic_string<T_from, _Traits, _Ax>& wstr, _In_ charset_encoder<T_from, T_to>& encoder)
{ {
if (!ok()) _Unlikely_ if (!ok()) _Unlikely_
return 0; return 0;
if (charset != charset_id::utf16) { if (encoder.from_encoding() == encoder.to_encoding())
std::string str(wstr2str(wstr, charset)); return write_array(wstr.data(), sizeof(T_from), wstr.size());
return write_array(str.data(), sizeof(char), str.size()); std::basic_string<T_to> str(encoder.convert(wstr));
} return write_array(str.data(), sizeof(T_to), str.size());
return write_array(wstr.data(), sizeof(wchar_t), wstr.size());
} }
/// ///
@ -556,11 +555,13 @@ namespace stdex
} }
/// ///
/// Writes UTF8 or UTF-16 byte-order-mark /// Writes UTF8, UTF-16 or UTF-32 byte-order-mark
/// ///
void write_charset(_In_ charset_id charset) void write_charset(_In_ charset_id charset)
{ {
if (charset == charset_id::utf16) if (charset == charset_id::utf32)
write_data(utf32_bom);
else if (charset == charset_id::utf16)
write_data(utf16_bom); write_data(utf16_bom);
else if (charset == charset_id::utf8) else if (charset == charset_id::utf8)
write_array(utf8_bom, sizeof(utf8_bom), 1); write_array(utf8_bom, sizeof(utf8_bom), 1);
@ -854,7 +855,7 @@ namespace stdex
#endif #endif
/// ///
/// Attempts to detect textfile charset based on UTF16 or UTF8 BOM. /// Attempts to detect textfile charset based on UTF-32, UTF-16 or UTF-8 BOM.
/// ///
/// \param[in] default_charset Fallback charset to return when no BOM detected. /// \param[in] default_charset Fallback charset to return when no BOM detected.
/// ///
@ -862,20 +863,23 @@ namespace stdex
{ {
if (seek(0) != 0) if (seek(0) != 0)
throw std::runtime_error("failed to seek"); throw std::runtime_error("failed to seek");
wchar_t id_utf16; char32_t id_utf32;
read_array(&id_utf16, sizeof(wchar_t), 1); read_array(&id_utf32, sizeof(char32_t), 1);
if (!ok()) _Unlikely_ if (ok() && id_utf32 == utf32_bom)
return default_charset; return charset_id::utf32;
if (id_utf16 == utf16_bom)
if (seek(0) != 0)
throw std::runtime_error("failed to seek");
char16_t id_utf16;
read_array(&id_utf16, sizeof(char16_t), 1);
if (ok() && id_utf16 == utf16_bom)
return charset_id::utf16; return charset_id::utf16;
if (seek(0) != 0) if (seek(0) != 0)
throw std::runtime_error("failed to seek"); throw std::runtime_error("failed to seek");
char id_utf8[3] = { 0 }; char id_utf8[3] = { 0 };
read_array(id_utf8, sizeof(id_utf8), 1); read_array(id_utf8, sizeof(id_utf8), 1);
if (!ok()) _Unlikely_ if (ok() && strncmp(id_utf8, _countof(id_utf8), utf8_bom, _countof(utf8_bom)) == 0)
return default_charset;
if (strncmp(id_utf8, _countof(id_utf8), utf8_bom, _countof(utf8_bom)) == 0)
return charset_id::utf8; return charset_id::utf8;
if (seek(0) != 0) if (seek(0) != 0)