stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
unicode.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "endian.hpp"
10#include "math.hpp"
11#include "system.hpp"
12#include <assert.h>
13#include <stdint.h>
14#ifndef _WIN32
15#include <iconv.h>
16#include <langinfo.h>
17#endif
18#include <memory>
19#include <string>
20
21namespace stdex
22{
///
/// Character set identifiers
///
/// On Windows, the values are code page numbers. On other platforms, they are
/// consecutive indices starting at 0, with `_max` marking the number of entries.
///
enum class charset_id : uint16_t {
#ifdef _WIN32
    system = CP_ACP,                ///< System default (ANSI) code page
    oem = CP_OEMCP,                 ///< OEM code page
    utf8 = CP_UTF8,                 ///< UTF-8
    utf16 = 1200 /*CP_WINUNICODE*/, ///< UTF-16
    windows1250 = 1250,             ///< Windows-1250
    windows1251 = 1251,             ///< Windows-1251
    windows1252 = 1252,             ///< Windows-1252
#else
    system = 0,      ///< System default charset
    utf8 = 1,        ///< UTF-8
    utf16 = 2,       ///< UTF-16
    utf32 = 3,       ///< UTF-32
    windows1250 = 4, ///< Windows-1250
    windows1251 = 5, ///< Windows-1251
    windows1252 = 6, ///< Windows-1252

    _max = 7         ///< Number of charsets; not a valid charset itself
#endif
};
44
45#ifdef _WIN32
46 constexpr charset_id wchar_t_charset = charset_id::utf16;
47#else
48 constexpr charset_id wchar_t_charset = charset_id::utf32;
49#endif
50
54 template <typename T_from, typename T_to>
56 {
57 public:
58 charset_encoder(_In_ charset_id from, _In_ charset_id to)
59 {
60#ifdef _WIN32
61 m_from = to_encoding(from);
62 m_to = to_encoding(to);
63#else
64 m_handle = iconv_open(to_encoding(to), to_encoding(from));
65 if (m_handle == (iconv_t)-1)
66 throw std::runtime_error("iconv_open failed");
67#endif
68 }
69
70#ifndef _WIN32
72 {
73 iconv_close(m_handle);
74 }
75#endif
76
84 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
85 void strcat(
86 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to> &dst,
87 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
88 {
89 assert(src || !count_src);
90 count_src = stdex::strnlen(src, count_src);
91 if (!count_src) _Unlikely_
92 return;
93
94#ifdef _WIN32
95 constexpr DWORD dwFlagsMBWC = MB_PRECOMPOSED;
96 constexpr DWORD dwFlagsWCMB = 0;
97 constexpr LPCCH lpDefaultChar = NULL;
98
99 _Analysis_assume_(src);
100 if (m_from == m_to) _Unlikely_{
101 dst.append(reinterpret_cast<const T_to*>(src), count_src);
102 return;
103 }
104
105 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(wchar_t)) {
106 assert(count_src < INT_MAX || count_src == SIZE_MAX);
107
108 // Try to convert to stack buffer first.
109 WCHAR szStackBuffer[1024 / sizeof(WCHAR)];
110#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
111 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer));
112 if (cch) {
113 // Append from stack.
114 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? wcsnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
115 return;
116 }
117 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
118 // Query the required output size. Allocate buffer. Then convert again.
119 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
120 std::unique_ptr<WCHAR[]> szBuffer(new WCHAR[cch]);
121 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBuffer.get(), cch);
122 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? wcsnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
123 return;
124 }
125 throw std::runtime_error("MultiByteToWideChar failed");
126 }
127
128 if _Constexpr_ (sizeof(T_from) == sizeof(wchar_t) && sizeof(T_to) == sizeof(char)) {
129 assert(count_src < INT_MAX || count_src == SIZE_MAX);
130
131 // Try to convert to stack buffer first.
132 CHAR szStackBuffer[1024 / sizeof(CHAR)];
133#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpWideCharStr parameter wrong?
134 int cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szStackBuffer, _countof(szStackBuffer), lpDefaultChar, NULL);
135 if (cch) {
136 // Copy from stack. Be careful not to include zero terminator.
137 dst.append(reinterpret_cast<const T_to*>(szStackBuffer), count_src != SIZE_MAX ? strnlen(szStackBuffer, cch) : static_cast<size_t>(cch) - 1);
138 return;
139 }
140 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
141 // Query the required output size. Allocate buffer. Then convert again.
142 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), NULL, 0, lpDefaultChar, NULL);
143 std::unique_ptr<CHAR[]> szBuffer(new CHAR[cch]);
144 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, reinterpret_cast<LPCWCH>(src), static_cast<int>(count_src), szBuffer.get(), cch, lpDefaultChar, NULL);
145 dst.append(reinterpret_cast<const T_to*>(szBuffer.get()), count_src != SIZE_MAX ? strnlen(szBuffer.get(), cch) : static_cast<size_t>(cch) - 1);
146 return;
147 }
148 throw std::runtime_error("WideCharToMultiByte failed");
149 }
150
151 if _Constexpr_ (sizeof(T_from) == sizeof(char) && sizeof(T_to) == sizeof(char)) {
152 assert(count_src < INT_MAX || count_src == SIZE_MAX);
153
154 // Try to convert to stack buffer first.
155 WCHAR szStackBufferMBWC[512 / sizeof(WCHAR)];
156#pragma warning(suppress: 6387) // Testing indicates src may be NULL when count_src is also 0. Is SAL of the lpMultiByteStr parameter wrong?
157 int cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szStackBufferMBWC, _countof(szStackBufferMBWC));
158 if (cch) {
159 // Append from stack.
160 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szStackBufferMBWC, cch) : static_cast<size_t>(cch) - 1;
161 assert(count_inter < INT_MAX);
162
163 // Try to convert to stack buffer first.
164 CHAR szStackBufferWCMB[512 / sizeof(CHAR)];
165#pragma warning(suppress: 6387) // Testing indicates szStackBufferMBWC may be NULL when count_inter is also 0. Is SAL of the lpWideCharStr parameter wrong?
166 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szStackBufferWCMB, _countof(szStackBufferWCMB), lpDefaultChar, NULL);
167 if (cch) {
168 // Copy from stack. Be careful not to include zero terminator.
169 dst.append(reinterpret_cast<const T_to*>(szStackBufferWCMB), strnlen(szStackBufferWCMB, cch));
170 return;
171 }
172 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
173 // Query the required output size. Allocate buffer. Then convert again.
174 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
175 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
176 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szStackBufferMBWC, static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
177 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
178 return;
179 }
180 throw std::runtime_error("WideCharToMultiByte failed");
181 }
182 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
183 // Query the required output size. Allocate buffer. Then convert again.
184 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), NULL, 0);
185 std::unique_ptr<WCHAR[]> szBufferMBWC(new WCHAR[cch]);
186 cch = MultiByteToWideChar(static_cast<UINT>(m_from), dwFlagsMBWC, reinterpret_cast<LPCCH>(src), static_cast<int>(count_src), szBufferMBWC.get(), cch);
187 size_t count_inter = count_src != SIZE_MAX ? wcsnlen(szBufferMBWC.get(), cch) : static_cast<size_t>(cch) - 1;
188
189 // Query the required output size. Allocate buffer. Then convert again.
190 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), NULL, 0, lpDefaultChar, NULL);
191 std::unique_ptr<CHAR[]> szBufferWCMB(new CHAR[cch]);
192 cch = WideCharToMultiByte(static_cast<UINT>(m_to), dwFlagsWCMB, szBufferMBWC.get(), static_cast<int>(count_inter), szBufferWCMB.get(), cch, lpDefaultChar, NULL);
193 dst.append(reinterpret_cast<const T_to*>(szBufferWCMB.get()), strnlen(szBufferWCMB.get(), cch));
194 return;
195 }
196 throw std::runtime_error("MultiByteToWideChar failed");
197 }
198#else
199 dst.reserve(dst.size() + count_src);
200 T_to buf[1024 / sizeof(T_to)];
201 size_t src_size = stdex::mul(sizeof(T_from), count_src);
202 for (;;) {
203 T_to* output = &buf[0];
204 size_t output_size = sizeof(buf);
205 errno = 0;
206 iconv(m_handle, const_cast<char**>(reinterpret_cast<const char**>(&src)), &src_size, reinterpret_cast<char**>(&output), &output_size);
207 dst.append(buf, reinterpret_cast<T_to*>(reinterpret_cast<char*>(buf) + sizeof(buf) - output_size));
208 if (!errno)
209 break;
210 if (errno == E2BIG)
211 continue;
212 throw std::runtime_error("iconv failed");
213 }
214#endif
215 }
216
223 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
224 inline void strcat(
225 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
226 _In_z_ const T_from* src)
227 {
228 strcat(dst, src, SIZE_MAX);
229 }
230
237 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
238 inline void strcat(
239 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
240 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
241 {
242 strcat(dst, src.data(), src.size());
243 }
244
252 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
253 inline void strcpy(
254 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
255 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
256 {
257 dst.clear();
258 strcat(dst, src, count_src);
259 }
260
267 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
268 inline void strcpy(
269 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
270 _In_z_ const T_from* src)
271 {
272 strcpy(dst, src, SIZE_MAX);
273 }
274
281 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
282 inline void strcpy(
283 _Inout_ std::basic_string<T_to, _Traits_to, _Alloc_to>& dst,
284 _In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
285 {
286 strcpy(dst, src.data(), src.size());
287 }
288
295 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
296 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src)
297 {
298 std::basic_string<T_to, _Traits_to, _Alloc_to> dst;
299 strcat(dst, src, count_src);
300 return dst;
301 }
302
308 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>>
309 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_z_ const T_from* src)
310 {
311 return convert(src, SIZE_MAX);
312 }
313
319 template <class _Traits_to = std::char_traits<T_to>, class _Alloc_to = std::allocator<T_to>, class _Traits_from = std::char_traits<T_from>, class _Alloc_from = std::allocator<T_from>>
320 inline std::basic_string<T_to, _Traits_to, _Alloc_to> convert(_In_ const std::basic_string<T_from, _Traits_from, _Alloc_from>& src)
321 {
322 return convert(src.data(), src.size());
323 }
324
325 inline void clear()
326 {
327#ifndef _WIN32
328 iconv(m_handle, NULL, NULL, NULL, NULL);
329#endif
330 }
331
332#ifdef _WIN32
333 protected:
334 static UINT to_encoding(_In_ charset_id charset)
335 {
336 return
337 charset == charset_id::system ? GetACP() :
338 charset == charset_id::oem ? GetOEMCP() :
339 static_cast<UINT>(charset);
340 }
341
342 protected:
343 UINT m_from, m_to;
344#else
345 protected:
346 static const char* to_encoding(_In_ charset_id charset)
347 {
348 static const char* const encodings[static_cast<std::underlying_type_t<charset_id>>(charset_id::_max)] = {
349 "", // system
350 "UTF-8", // utf8
351#if BYTE_ORDER == BIG_ENDIAN
352 "UTF-16BE", // utf16
353 "UTF-32BE", // utf32
354#else
355 "UTF-16LE", // utf16
356 "UTF-32LE", // utf32
357#endif
358 "CP1250", // windows1250
359 "CP1251", // windows1251
360 "CP1252", // windows1252
361 };
362 return
363 charset == charset_id::system ? nl_langinfo(LC_CTYPE) :
364 encodings[static_cast<std::underlying_type_t<charset_id>>(charset)];
365 }
366
367 protected:
368 iconv_t m_handle;
369#endif
370 };
371
382 inline void strcat(
383 _Inout_ std::wstring& dst,
384 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
385 _In_ charset_id charset = charset_id::system)
386 {
387 charset_encoder<char, wchar_t>(charset, wchar_t_charset).strcat(dst, src, count_src);
388 }
389
390 _Deprecated_("Use stdex::strcat")
391 inline void str2wstr(
392 _Inout_ std::wstring& dst,
393 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
394 _In_ charset_id charset = charset_id::system)
395 {
396 strcat(dst, src, count_src, charset);
397 }
398
408 inline void strcat(
409 _Inout_ std::wstring& dst,
410 _In_ const std::string& src,
411 _In_ charset_id charset = charset_id::system)
412 {
413 strcat(dst, src.data(), src.size(), charset);
414 }
415
416 _Deprecated_("Use stdex::strcat")
417 inline void str2wstr(
418 _Inout_ std::wstring& dst,
419 _In_ const std::string& src,
420 _In_ charset_id charset = charset_id::system)
421 {
422 strcat(dst, src, charset);
423 }
424
435 inline void strcpy(
436 _Inout_ std::wstring& dst,
437 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
438 _In_ charset_id charset = charset_id::system)
439 {
440 dst.clear();
441 strcat(dst, src, count_src, charset);
442 }
443
453 inline void strcpy(
454 _Inout_ std::wstring& dst,
455 _In_ const std::string& src,
456 _In_ charset_id charset = charset_id::system)
457 {
458 strcpy(dst, src.data(), src.size(), charset);
459 }
460
471 inline std::wstring str2wstr(
472 _In_z_ const char* src,
473 _In_ charset_id charset = charset_id::system)
474 {
475 std::wstring dst;
476 strcat(dst, src, SIZE_MAX, charset);
477 return dst;
478 }
479
491 inline std::wstring str2wstr(
492 _In_reads_or_z_opt_(count_src) const char* src, _In_ size_t count_src,
493 _In_ charset_id charset = charset_id::system)
494 {
495 std::wstring dst;
496 strcat(dst, src, count_src, charset);
497 return dst;
498 }
499
510 inline std::wstring str2wstr(
511 _In_ const std::string& src,
512 _In_ charset_id charset = charset_id::system)
513 {
514 return str2wstr(src.c_str(), src.size(), charset);
515 }
516
527 inline void strcat(
528 _Inout_ std::string& dst,
529 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
530 _In_ charset_id charset = charset_id::system)
531 {
532 charset_encoder<wchar_t, char>(wchar_t_charset, charset).strcat(dst, src, count_src);
533 }
534
535 _Deprecated_("Use stdex::strcat")
536 inline void wstr2str(
537 _Inout_ std::string& dst,
538 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
539 _In_ charset_id charset = charset_id::system)
540 {
541 strcat(dst, src, count_src, charset);
542 }
543
553 inline void strcat(
554 _Inout_ std::string& dst,
555 _In_ const std::wstring& src,
556 _In_ charset_id charset = charset_id::system)
557 {
558 strcat(dst, src.c_str(), src.size(), charset);
559 }
560
561 _Deprecated_("Use stdex::strcat")
562 inline void wstr2str(
563 _Inout_ std::string& dst,
564 _In_ const std::wstring& src,
565 _In_ charset_id charset = charset_id::system)
566 {
567 strcat(dst, src, charset);
568 }
569
580 inline void strcpy(
581 _Inout_ std::string& dst,
582 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
583 _In_ charset_id charset = charset_id::system)
584 {
585 dst.clear();
586 strcat(dst, src, count_src, charset);
587 }
588
598 inline void strcpy(
599 _Inout_ std::string& dst,
600 _In_ const std::wstring& src,
601 _In_ charset_id charset = charset_id::system)
602 {
603 strcpy(dst, src.data(), src.size(), charset);
604 }
605
616 inline std::string wstr2str(
617 _In_z_ const wchar_t* src,
618 _In_ charset_id charset = charset_id::system)
619 {
620 std::string dst;
621 strcat(dst, src, SIZE_MAX, charset);
622 return dst;
623 }
624
636 inline std::string wstr2str(
637 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
638 _In_ charset_id charset = charset_id::system)
639 {
640 std::string dst;
641 strcat(dst, src, count_src, charset);
642 return dst;
643 }
644
655 inline std::string wstr2str(
656 _In_ const std::wstring& src,
657 _In_ charset_id charset = charset_id::system)
658 {
659 return wstr2str(src.c_str(), src.size(), charset);
660 }
661}
Encoding converter context.
Definition unicode.hpp:56
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string.
Definition unicode.hpp:282
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Return converted string.
Definition unicode.hpp:320
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(const T_from *src)
Return converted string.
Definition unicode.hpp:309
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const std::basic_string< T_from, _Traits_from, _Alloc_from > &src)
Convert string and append to string.
Definition unicode.hpp:238
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string.
Definition unicode.hpp:268
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string and append to string.
Definition unicode.hpp:85
void strcat(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, const T_from *src)
Convert string and append to string.
Definition unicode.hpp:224
void strcpy(std::basic_string< T_to, _Traits_to, _Alloc_to > &dst, _In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Convert string.
Definition unicode.hpp:253
std::basic_string< T_to, _Traits_to, _Alloc_to > convert(_In_reads_or_z_opt_(count_src) const T_from *src, size_t count_src)
Return converted string.
Definition unicode.hpp:296