stdex
Additional custom algorithms, and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
unicode.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
#include "compat.hpp"
#include "endian.hpp"
#include "math.hpp"
#include "system.hpp"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#ifndef _WIN32
#include <iconv.h>
#include <langinfo.h>
#endif
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
20
21namespace stdex
22{
///
/// Character set identifiers.
///
/// On Windows every enumerator holds the numeric code page it stands for.
/// Elsewhere the enumerators are consecutive integers starting at 0 and
/// `_max` is one past the last charset.
///
enum class charset_id : uint16_t {
#ifndef _WIN32
    system = 0,
    utf8 = 1,
    utf16 = 2,
    utf32 = 3,
    windows1250 = 4,
    windows1251 = 5,
    windows1252 = 6,

    _max = 7
#else
    system = CP_ACP,
    oem = CP_OEMCP,
    utf8 = CP_UTF8,
    utf16 = 1200 /*CP_WINUNICODE*/,
    windows1250 = 1250,
    windows1251 = 1251,
    windows1252 = 1252,
#endif
};
44
45#ifdef _WIN32
46 constexpr charset_id wchar_t_charset = charset_id::utf16;
47#else
48 constexpr charset_id wchar_t_charset = charset_id::utf32;
49#endif
50
54 template <typename T_from, typename T_to>
56 {
57 public:
58 charset_encoder(_In_ charset_id from, _In_ charset_id to)
59 {
60#ifdef _WIN32
61 m_from = to_encoding(from);
62 m_to = to_encoding(to);
63#else
64 m_handle = iconv_open(to_encoding(to), to_encoding(from));
65 if (m_handle == (iconv_t)-1)
66 throw std::runtime_error("iconv_open failed");
67#endif
68 }
69
70#ifndef _WIN32
72 {
73 iconv_close(m_handle);
74 }
75#endif
76
84 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
85 void strcat(
86 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to> &dst,
87 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
88 {
89 constexpr DWORD dwFlagsMBWC = MB_PRECOMPOSED;
90 constexpr DWORD dwFlagsWCMB = 0;
91 constexpr LPCCH lpDefaultChar = NULL;
92
93 assert(src || !count_src);
94 count_src = stdex::strnlen(src, count_src);
95 if (!count_src) _Unlikely_
96 return;
97#ifdef _WIN32
98 _Analysis_assume_(src);
99 if (m_from == m_to) _Unlikely_{
100 dst.append(reinterpret_cast<const T_to*>(src), count_src);
101 return;
102 }
103
104 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(wchar_t)) {
105 assert(count_src < INT_MAX || count_src == SIZE_MAX);
106
107 // Try to convert to stack buffer first.
108 WCHAR szStackBuffer[1024 / sizeof(WCHAR)];
109#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
110 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer));
111 if (cch) {
112 // Append from stack.
113 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? wcsnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
114 return;
115 }
116 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
117 // Query the required output size. Allocate buffer. Then convert again.
118 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
119 std::unique_ptr<WCHAR[]> szBuffer(new WCHAR[cch]);
120 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBuffer.get(), cch);
121 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? wcsnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
122 return;
123 }
124 throw std::runtime_error("MultiByteToWideChar failed");
125 }
126
127 if _Constexpr_ (sizeof(T_from) == sizeof(wchar_t) && sizeof(T_to) == sizeof(char)) {
128 assert(count_src < INT_MAX || count_src == SIZE_MAX);
129
130 // Try to convert to stack buffer first.
131 CHAR szStackBuffer[1024 / sizeof(CHAR)];
132#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpWideCharStr parameter wrong?
133 int cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer), lpDefaultChar, NULL);
134 if (cch) {
135 // Copy from stack. Be careful not to include zero terminator.
136 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? strnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
137 return;
138 }
139 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
140 // Query the required output size. Allocate buffer. Then convert again.
141 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), NULL, 0, lpDefaultChar, NULL);
142 std::unique_ptr<CHAR[]> szBuffer(new CHAR[cch]);
143 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szBuffer.get(), cch, lpDefaultChar, NULL);
144 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? strnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
145 return;
146 }
147 throw std::runtime_error("WideCharToMultiByte failed");
148 }
149
150 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(char)) {
151 assert(count_src < INT_MAX || count_src == SIZE_MAX);
152
153 // Try to convert to stack buffer first.
154 WCHAR szStackBufferMBWC[512 / sizeof(WCHAR)];
155#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
156 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBufferMBWC, _countof(szStackBufferMBWC));
157 if (cch) {
158 // Append from stack.
159 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szStackBufferMBWC, cch) : static_cast<size_t>(cch) - 1;
160 assert(count_inter < INT_MAX);
161
162 // Try to convert to stack buffer first.
163 CHAR szStackBufferWCMB[512 / sizeof(CHAR)];
164#pragma warning(suppress: 6387) // Testing indicates szStackBufferMBWC may be NULL when count_inter is also 0. Is SAL of the lpWideCharStr parameter wrong?
165 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szStackBufferWCMB, _countof(szStackBufferWCMB), lpDefaultChar, NULL);
166 if (cch) {
167 // Copy from stack. Be careful not to include zero terminator.
168 dst.append(reinterpret_cast<const T_to*>(szStackBufferWCMB), strnlen(szStackBufferWCMB, cch));
169 return;
170 }
171 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
172 // Query the required output size. Allocate buffer. Then convert again.
173 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
174 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
175 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
176 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
177 return;
178 }
179 throw std::runtime_error("WideCharToMultiByte failed");
180 }
181 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
182 // Query the required output size. Allocate buffer. Then convert again.
183 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
184 std::unique_ptr<WCHAR[]> szBufferMBWC(new WCHAR[cch]);
185 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBufferMBWC.get(), cch);
186 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szBufferMBWC.get(), cch) : static_cast<size_t>(cch) - 1;
187
188 // Query the required output size. Allocate buffer. Then convert again.
189 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
190 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
191 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
192 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
193 return;
194 }
195 throw std::runtime_error("MultiByteToWideChar failed");
196 }
197#else
198 T_to buf[1024 / sizeof(T_to)];
199 size_t src_size = stdex::mul(sizeof(T_from), count_src);
200 do {
201 T_to* output = &buf[0];
202 size_t output_size = sizeof(buf);
203 errno = 0;
204 iconv(m_handle, (char**)&src, &src_size, (char**)&output, &output_size);
205 if (errno)
206 throw std::runtime_error("iconv failed");
207 dst.append(buf, reinterpret_cast<T_to*>(reinterpret_cast<char*>(buf) + sizeof(buf) - output_size));
208 } while (src_size);
209#endif
210 }
211
218 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
219 inline void strcat(
220 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
221 _In_z_ const T_from* src)
222 {
223 strcat(dst, src, SIZE_MAX);
224 }
225
232 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
233 inline void strcat(
234 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
235 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
236 {
237 strcat(dst, src.data(), src.size());
238 }
239
247 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
248 inline void strcpy(
249 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
250 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
251 {
252 dst.clear();
253 strcat(dst, src, count_src);
254 }
255
262 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
263 inline void strcpy(
264 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
265 _In_z_ const T_from* src)
266 {
267 strcpy(dst, src, SIZE_MAX);
268 }
269
276 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
277 inline void strcpy(
278 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
279 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
280 {
281 strcpy(dst, src.data(), src.size());
282 }
283
290 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
291 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
292 {
293 std::basic_string<T_to, _Traits_to, _Alloc_to> dst;
294 strcat(dst, src, count_src);
295 return dst;
296 }
297
303 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
304 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_z_ const T_from* src)
305 {
306 return convert(src, SIZE_MAX);
307 }
308
314 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
315 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
316 {
317 return convert(src.data(), src.size());
318 }
319
320 inline void clear()
321 {
322#ifndef _WIN32
323 iconv(m_handle, NULL, NULL, NULL, NULL);
324#endif
325 }
326
327#ifdef _WIN32
328 protected:
329 static UINT to_encoding(_In_ charset_id charset)
330 {
331 return
332 charset == charset_id::system ? GetACP() :
333 charset == charset_id::oem ? GetOEMCP() :
334 static_cast<UINT>(charset);
335 }
336
337 protected:
338 UINT m_from, m_to;
339#else
340 protected:
341 static const char* to_encoding(_In_ charset_id charset)
342 {
343 static const char* const encodings[static_cast<std::underlying_type_t<charset_id>>(charset_id::_max)] = {
344 "", // system
345 "UTF-8", // utf8
346#if BYTE_ORDER == BIG_ENDIAN
347 "UTF-16BE", // utf16
348 "UTF-32BE", // utf32
349#else
350 "UTF-16LE", // utf16
351 "UTF-32LE", // utf32
352#endif
353 "CP1250", // windows1250
354 "CP1251", // windows1251
355 "CP1252", // windows1252
356 }
357 return
358 charset == charset_id::system ? nl_langinfo(LC_CTYPE) :
359 encodings[static_cast<std::underlying_type_t<charset_id>>(charset))];
360 }
361
362 protected:
363 iconv_t m_handle;
364#endif
365 };
366
377 inline void strcat(
378 _Inout_ std::wstring& dst,
379 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
380 _In_ charset_id charset = charset_id::system)
381 {
382 charset_encoder<char, wchar_t>(charset, wchar_t_charset).strcat(dst, src, count_src);
383 }
384
385 _Deprecated_("Use stdex::strcat")
386 inline void str2wstr(
387 _Inout_ std::wstring& dst,
388 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
389 _In_ charset_id charset = charset_id::system)
390 {
391 strcat(dst, src, count_src, charset);
392 }
393
403 inline void strcat(
404 _Inout_ std::wstring& dst,
405 _In_ const std::string& src,
406 _In_ charset_id charset = charset_id::system)
407 {
408 strcat(dst, src.data(), src.size(), charset);
409 }
410
411 _Deprecated_("Use stdex::strcat")
412 inline void str2wstr(
413 _Inout_ std::wstring& dst,
414 _In_ const std::string& src,
415 _In_ charset_id charset = charset_id::system)
416 {
417 strcat(dst, src, charset);
418 }
419
430 inline void strcpy(
431 _Inout_ std::wstring& dst,
432 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
433 _In_ charset_id charset = charset_id::system)
434 {
435 dst.clear();
436 strcat(dst, src, count_src, charset);
437 }
438
448 inline void strcpy(
449 _Inout_ std::wstring& dst,
450 _In_ const std::string& src,
451 _In_ charset_id charset = charset_id::system)
452 {
453 strcpy(dst, src.data(), src.size(), charset);
454 }
455
466 inline std::wstring str2wstr(
467 _In_z_ const char* src,
468 _In_ charset_id charset = charset_id::system)
469 {
470 std::wstring dst;
471 strcat(dst, src, SIZE_MAX, charset);
472 return dst;
473 }
474
486 inline std::wstring str2wstr(
487 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
488 _In_ charset_id charset = charset_id::system)
489 {
490 std::wstring dst;
491 strcat(dst, src, count_src, charset);
492 return dst;
493 }
494
505 inline std::wstring str2wstr(
506 _In_ const std::string& src,
507 _In_ charset_id charset = charset_id::system)
508 {
509 return str2wstr(src.c_str(), src.size(), charset);
510 }
511
522 inline void strcat(
523 _Inout_ std::string& dst,
524 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
525 _In_ charset_id charset = charset_id::system)
526 {
527 charset_encoder<wchar_t, char>(wchar_t_charset, charset).strcat(dst, src, count_src);
528 }
529
530 _Deprecated_("Use stdex::strcat")
531 inline void wstr2str(
532 _Inout_ std::string& dst,
533 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
534 _In_ charset_id charset = charset_id::system)
535 {
536 strcat(dst, src, count_src, charset);
537 }
538
548 inline void strcat(
549 _Inout_ std::string& dst,
550 _In_ const std::wstring& src,
551 _In_ charset_id charset = charset_id::system)
552 {
553 strcat(dst, src.c_str(), src.size(), charset);
554 }
555
556 _Deprecated_("Use stdex::strcat")
557 inline void wstr2str(
558 _Inout_ std::string& dst,
559 _In_ const std::wstring& src,
560 _In_ charset_id charset = charset_id::system)
561 {
562 strcat(dst, src, charset);
563 }
564
575 inline void strcpy(
576 _Inout_ std::string& dst,
577 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
578 _In_ charset_id charset = charset_id::system)
579 {
580 dst.clear();
581 strcat(dst, src, count_src, charset);
582 }
583
593 inline void strcpy(
594 _Inout_ std::string& dst,
595 _In_ const std::wstring& src,
596 _In_ charset_id charset = charset_id::system)
597 {
598 strcpy(dst, src.data(), src.size(), charset);
599 }
600
611 inline std::string wstr2str(
612 _In_z_ const wchar_t* src,
613 _In_ charset_id charset = charset_id::system)
614 {
615 std::string dst;
616 strcat(dst, src, SIZE_MAX, charset);
617 return dst;
618 }
619
631 inline std::string wstr2str(
632 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
633 _In_ charset_id charset = charset_id::system)
634 {
635 std::string dst;
636 strcat(dst, src, count_src, charset);
637 return dst;
638 }
639
650 inline std::string wstr2str(
651 _In_ const std::wstring& src,
652 _In_ charset_id charset = charset_id::system)
653 {
654 return wstr2str(src.c_str(), src.size(), charset);
655 }
656}
Encoding converter context.
Definition unicode.hpp:56
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string.
Definition unicode.hpp:277
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Return converted string.
Definition unicode.hpp:315
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const T_from *src)
Return converted string.
Definition unicode.hpp:304
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string and append to string.
Definition unicode.hpp:233
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string.
Definition unicode.hpp:263
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string and append to string.
Definition unicode.hpp:85
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string and append to string.
Definition unicode.hpp:219
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string.
Definition unicode.hpp:248
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(_In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Return converted string.
Definition unicode.hpp:291