stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
unicode.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "endian.hpp"
10#include "math.hpp"
11#include "system.hpp"
12#include <assert.h>
13#include <stdint.h>
14#ifndef _WIN32
15#include <iconv.h>
16#include <langinfo.h>
17#endif
18#include <memory>
19#include <string>
20
21namespace stdex
22{
/// Character set identifiers.
///
/// On Windows every enumerator holds the numeric code page itself. On other
/// platforms the enumerators are consecutive indices, closed by the `_max`
/// sentinel, which is used as the length of the encoding-name table.
enum class charset_id : uint16_t {
#if !defined(_WIN32)
    system = 0,
    utf8 = 1,
    utf16 = 2,
    utf32 = 3,
    windows1250 = 4,
    windows1251 = 5,
    windows1252 = 6,

    _max = 7
#else
    system = CP_ACP,
    oem = CP_OEMCP,
    utf8 = CP_UTF8,
    utf16 = 1200 /*CP_WINUNICODE*/,
    windows1250 = 1250,
    windows1251 = 1251,
    windows1252 = 1252,
#endif
};
44
45#ifdef _WIN32
46 constexpr charset_id wchar_t_charset = charset_id::utf16;
47#else
48 constexpr charset_id wchar_t_charset = charset_id::utf32;
49#endif
50
54 template <typename T_from, typename T_to>
56 {
57 public:
58 charset_encoder(_In_ charset_id from, _In_ charset_id to)
59 {
60#ifdef _WIN32
61 m_from = to_encoding(from);
62 m_to = to_encoding(to);
63#else
64 m_handle = iconv_open(to_encoding(to), to_encoding(from));
65 if (m_handle == (iconv_t)-1)
66 throw std::runtime_error("iconv_open failed");
67#endif
68 }
69
70#ifndef _WIN32
72 {
73 iconv_close(m_handle);
74 }
75#endif
76
84 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
85 void strcat(
86 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to> &dst,
87 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
88 {
89 assert(src || !count_src);
90 count_src = stdex::strnlen(src, count_src);
91 if (!count_src) _Unlikely_
92 return;
93
94#ifdef _WIN32
95 constexpr DWORD dwFlagsMBWC = MB_PRECOMPOSED;
96 constexpr DWORD dwFlagsWCMB = 0;
97 constexpr LPCCH lpDefaultChar = NULL;
98
99 _Analysis_assume_(src);
100 if (m_from == m_to) _Unlikely_{
101 dst.append(reinterpret_cast<const T_to*>(src), count_src);
102 return;
103 }
104
105 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(wchar_t)) {
106 assert(count_src < INT_MAX || count_src == SIZE_MAX);
107
108 // Try to convert to stack buffer first.
109 WCHAR szStackBuffer[1024 / sizeof(WCHAR)];
110#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
111 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer));
112 if (cch) {
113 // Append from stack.
114 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? wcsnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
115 return;
116 }
117 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
118 // Query the required output size. Allocate buffer. Then convert again.
119 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
120 std::unique_ptr<WCHAR[]> szBuffer(new WCHAR[cch]);
121 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBuffer.get(), cch);
122 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? wcsnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
123 return;
124 }
125 throw std::runtime_error("MultiByteToWideChar failed");
126 }
127
128 if _Constexpr_ (sizeof(T_from) == sizeof(wchar_t) && sizeof(T_to) == sizeof(char)) {
129 assert(count_src < INT_MAX || count_src == SIZE_MAX);
130
131 // Try to convert to stack buffer first.
132 CHAR szStackBuffer[1024 / sizeof(CHAR)];
133#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpWideCharStr parameter wrong?
134 int cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer), lpDefaultChar, NULL);
135 if (cch) {
136 // Copy from stack. Be careful not to include zero terminator.
137 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? strnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
138 return;
139 }
140 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
141 // Query the required output size. Allocate buffer. Then convert again.
142 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), NULL, 0, lpDefaultChar, NULL);
143 std::unique_ptr<CHAR[]> szBuffer(new CHAR[cch]);
144 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szBuffer.get(), cch, lpDefaultChar, NULL);
145 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? strnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
146 return;
147 }
148 throw std::runtime_error("WideCharToMultiByte failed");
149 }
150
151 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(char)) {
152 assert(count_src < INT_MAX || count_src == SIZE_MAX);
153
154 // Try to convert to stack buffer first.
155 WCHAR szStackBufferMBWC[512 / sizeof(WCHAR)];
156#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
157 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBufferMBWC, _countof(szStackBufferMBWC));
158 if (cch) {
159 // Append from stack.
160 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szStackBufferMBWC, cch) : static_cast<size_t>(cch) - 1;
161 assert(count_inter < INT_MAX);
162
163 // Try to convert to stack buffer first.
164 CHAR szStackBufferWCMB[512 / sizeof(CHAR)];
165#pragma warning(suppress: 6387) // Testing indicates szStackBufferMBWC may be NULL when count_inter is also 0. Is SAL of the lpWideCharStr parameter wrong?
166 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szStackBufferWCMB, _countof(szStackBufferWCMB), lpDefaultChar, NULL);
167 if (cch) {
168 // Copy from stack. Be careful not to include zero terminator.
169 dst.append(reinterpret_cast<const T_to*>(szStackBufferWCMB), strnlen(szStackBufferWCMB, cch));
170 return;
171 }
172 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
173 // Query the required output size. Allocate buffer. Then convert again.
174 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
175 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
176 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
177 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
178 return;
179 }
180 throw std::runtime_error("WideCharToMultiByte failed");
181 }
182 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
183 // Query the required output size. Allocate buffer. Then convert again.
184 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
185 std::unique_ptr<WCHAR[]> szBufferMBWC(new WCHAR[cch]);
186 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBufferMBWC.get(), cch);
187 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szBufferMBWC.get(), cch) : static_cast<size_t>(cch) - 1;
188
189 // Query the required output size. Allocate buffer. Then convert again.
190 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
191 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
192 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
193 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
194 return;
195 }
196 throw std::runtime_error("MultiByteToWideChar failed");
197 }
198#else
199 dst.reserve(dst.size() + count_src);
200 T_to buf[1024 / sizeof(T_to)];
201 size_t src_size = stdex::mul(sizeof(T_from), count_src);
202 for (;;) {
203 T_to* output = &buf[0];
204 size_t output_size = sizeof(buf);
205 errno = 0;
206 iconv(m_handle, const_cast<char**>(reinterpret_cast<const char**>(&src)), &src_size, reinterpret_cast<char**>(&output), &output_size);
207 dst.append(buf, reinterpret_cast<T_to*>(reinterpret_cast<char*>(buf) + sizeof(buf) - output_size));
208 if (!errno)
209 break;
210 if (errno == E2BIG)
211 continue;
212 throw std::runtime_error("iconv failed");
213 }
214#endif
215 }
216
223 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
224 inline void strcat(
225 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
226 _In_z_ const T_from* src)
227 {
228 strcat(dst, src, SIZE_MAX);
229 }
230
237 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
238 inline void strcat(
239 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
240 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
241 {
242 strcat(dst, src.data(), src.size());
243 }
244
252 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
253 inline void strcpy(
254 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
255 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
256 {
257 dst.clear();
258 strcat(dst, src, count_src);
259 }
260
267 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
268 inline void strcpy(
269 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
270 _In_z_ const T_from* src)
271 {
272 strcpy(dst, src, SIZE_MAX);
273 }
274
281 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
282 inline void strcpy(
283 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
284 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
285 {
286 strcpy(dst, src.data(), src.size());
287 }
288
295 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
296 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
297 {
298 std::basic_string<T_to, _Traits_to, _Alloc_to> dst;
299 strcat(dst, src, count_src);
300 return dst;
301 }
302
308 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
309 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_z_ const T_from* src)
310 {
311 return convert(src, SIZE_MAX);
312 }
313
319 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
320 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
321 {
322 return convert(src.data(), src.size());
323 }
324
325 inline void clear()
326 {
327#ifndef _WIN32
328 iconv(m_handle, NULL, NULL, NULL, NULL);
329#endif
330 }
331
332 static charset_id system_charset()
333 {
334#ifdef _WIN32
335 return static_cast<charset_id>(GetACP());
336#else
337 const char* lctype = nl_langinfo(CODESET);
338 if (strcmp(lctype, "UTF-8") == 0) return charset_id::utf8;
339 if (strcmp(lctype, "UTF-16") == 0) return charset_id::utf16;
340#if BYTE_ORDER == BIG_ENDIAN
341 if (strcmp(lctype, "UTF-16BE") == 0) return charset_id::utf16;
342#else
343 if (strcmp(lctype, "UTF-16LE") == 0) return charset_id::utf16;
344#endif
345 if (strcmp(lctype, "UTF-32") == 0) return charset_id::utf32;
346#if BYTE_ORDER == BIG_ENDIAN
347 if (strcmp(lctype, "UTF-32BE") == 0) return charset_id::utf32;
348#else
349 if (strcmp(lctype, "UTF-32LE") == 0) return charset_id::utf32;
350#endif
351 if (strcmp(lctype, "CP1250") == 0) return charset_id::windows1250;
352 if (strcmp(lctype, "CP1251") == 0) return charset_id::windows1251;
353 if (strcmp(lctype, "CP1252") == 0) return charset_id::windows1252;
354 return charset_id::system;
355#endif
356 }
357
358#ifdef _WIN32
359 protected:
360 static UINT to_encoding(_In_ charset_id charset)
361 {
362 return
363 charset == charset_id::system ? GetACP() :
364 charset == charset_id::oem ? GetOEMCP() :
365 static_cast<UINT>(charset);
366 }
367
368 protected:
369 UINT m_from, m_to;
370#else
371 protected:
372 static const char* to_encoding(_In_ charset_id charset)
373 {
374 static const char* const encodings[static_cast<std::underlying_type_t<charset_id>>(charset_id::_max)] = {
375 "", // system
376 "UTF-8", // utf8
377#if BYTE_ORDER == BIG_ENDIAN
378 "UTF-16BE", // utf16
379 "UTF-32BE", // utf32
380#else
381 "UTF-16LE", // utf16
382 "UTF-32LE", // utf32
383#endif
384 "CP1250", // windows1250
385 "CP1251", // windows1251
386 "CP1252", // windows1252
387 };
388 return
389 charset == charset_id::system ? nl_langinfo(CODESET) :
390 encodings[static_cast<std::underlying_type_t<charset_id>>(charset)];
391 }
392
393 protected:
394 iconv_t m_handle;
395#endif
396 };
397
408 inline void strcat(
409 _Inout_ std::wstring& dst,
410 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
411 _In_ charset_id charset = charset_id::system)
412 {
413 charset_encoder<char, wchar_t>(charset, wchar_t_charset).strcat(dst, src, count_src);
414 }
415
416 _Deprecated_("Use stdex::strcat")
417 inline void str2wstr(
418 _Inout_ std::wstring& dst,
419 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
420 _In_ charset_id charset = charset_id::system)
421 {
422 strcat(dst, src, count_src, charset);
423 }
424
434 inline void strcat(
435 _Inout_ std::wstring& dst,
436 _In_ const std::string& src,
437 _In_ charset_id charset = charset_id::system)
438 {
439 strcat(dst, src.data(), src.size(), charset);
440 }
441
442 _Deprecated_("Use stdex::strcat")
443 inline void str2wstr(
444 _Inout_ std::wstring& dst,
445 _In_ const std::string& src,
446 _In_ charset_id charset = charset_id::system)
447 {
448 strcat(dst, src, charset);
449 }
450
461 inline void strcpy(
462 _Inout_ std::wstring& dst,
463 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
464 _In_ charset_id charset = charset_id::system)
465 {
466 dst.clear();
467 strcat(dst, src, count_src, charset);
468 }
469
479 inline void strcpy(
480 _Inout_ std::wstring& dst,
481 _In_ const std::string& src,
482 _In_ charset_id charset = charset_id::system)
483 {
484 strcpy(dst, src.data(), src.size(), charset);
485 }
486
497 inline std::wstring str2wstr(
498 _In_z_ const char* src,
499 _In_ charset_id charset = charset_id::system)
500 {
501 std::wstring dst;
502 strcat(dst, src, SIZE_MAX, charset);
503 return dst;
504 }
505
517 inline std::wstring str2wstr(
518 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
519 _In_ charset_id charset = charset_id::system)
520 {
521 std::wstring dst;
522 strcat(dst, src, count_src, charset);
523 return dst;
524 }
525
536 inline std::wstring str2wstr(
537 _In_ const std::string& src,
538 _In_ charset_id charset = charset_id::system)
539 {
540 return str2wstr(src.c_str(), src.size(), charset);
541 }
542
553 inline void strcat(
554 _Inout_ std::string& dst,
555 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
556 _In_ charset_id charset = charset_id::system)
557 {
558 charset_encoder<wchar_t, char>(wchar_t_charset, charset).strcat(dst, src, count_src);
559 }
560
561 _Deprecated_("Use stdex::strcat")
562 inline void wstr2str(
563 _Inout_ std::string& dst,
564 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
565 _In_ charset_id charset = charset_id::system)
566 {
567 strcat(dst, src, count_src, charset);
568 }
569
579 inline void strcat(
580 _Inout_ std::string& dst,
581 _In_ const std::wstring& src,
582 _In_ charset_id charset = charset_id::system)
583 {
584 strcat(dst, src.c_str(), src.size(), charset);
585 }
586
587 _Deprecated_("Use stdex::strcat")
588 inline void wstr2str(
589 _Inout_ std::string& dst,
590 _In_ const std::wstring& src,
591 _In_ charset_id charset = charset_id::system)
592 {
593 strcat(dst, src, charset);
594 }
595
606 inline void strcpy(
607 _Inout_ std::string& dst,
608 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
609 _In_ charset_id charset = charset_id::system)
610 {
611 dst.clear();
612 strcat(dst, src, count_src, charset);
613 }
614
624 inline void strcpy(
625 _Inout_ std::string& dst,
626 _In_ const std::wstring& src,
627 _In_ charset_id charset = charset_id::system)
628 {
629 strcpy(dst, src.data(), src.size(), charset);
630 }
631
642 inline std::string wstr2str(
643 _In_z_ const wchar_t* src,
644 _In_ charset_id charset = charset_id::system)
645 {
646 std::string dst;
647 strcat(dst, src, SIZE_MAX, charset);
648 return dst;
649 }
650
662 inline std::string wstr2str(
663 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
664 _In_ charset_id charset = charset_id::system)
665 {
666 std::string dst;
667 strcat(dst, src, count_src, charset);
668 return dst;
669 }
670
681 inline std::string wstr2str(
682 _In_ const std::wstring& src,
683 _In_ charset_id charset = charset_id::system)
684 {
685 return wstr2str(src.c_str(), src.size(), charset);
686 }
687}
Encoding converter context.
Definition unicode.hpp:56
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string.
Definition unicode.hpp:282
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Return converted string.
Definition unicode.hpp:320
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const T_from *src)
Return converted string.
Definition unicode.hpp:309
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string and append to string.
Definition unicode.hpp:238
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string.
Definition unicode.hpp:268
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string and append to string.
Definition unicode.hpp:85
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string and append to string.
Definition unicode.hpp:224
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string.
Definition unicode.hpp:253
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(_In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Return converted string.
Definition unicode.hpp:296