stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <exception>
13#include <string>
14
15namespace stdex
16{
18 template <class T>
19 inline const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
20 {
21 _Assume_(entity && count);
22 _Assume_(count < 2 || entity[0] != '#'); // No numeric entities
23
24 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
25 size_t m = (i + j) / 2;
26 if (sgml_unicode[m].sgml[0] < entity[0])
27 i = m + 1;
28 else if (sgml_unicode[m].sgml[0] > entity[0])
29 j = m;
30 else {
31 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
32 if (r < 0)
33 i = m + 1;
34 else if (r > 0)
35 j = m;
36 else {
37 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
38 return sgml_unicode[m].unicode;
39 }
40 }
41 }
42 return nullptr;
43 }
44
45 template <class T>
46 inline const T* sgmlend(
47 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
48 {
49 _Assume_(str || !count);
50 for (size_t i = 0; i < count; i++) {
51 if (str[i] == ';')
52 return str + i;
53 if (!str[i] || str[i] == '&' || isspace(str[i]))
54 break;
55 }
56 return nullptr;
57 }
59
// Character-class flags for the SGML <-> Unicode converters in this file.
// Combine them with bitwise OR.
constexpr int sgml_full      = 0x80000000; // Highest bit: tested by wstr2sgmlcat() to disable plain ASCII pass-through
constexpr int sgml_quot      = 1 << 0;     // "
constexpr int sgml_apos      = 1 << 1;     // '
constexpr int sgml_amp       = 1 << 2;     // &
constexpr int sgml_lt_gt     = 1 << 3;     // < and >
constexpr int sgml_bsol      = 1 << 4;     // Backslash
constexpr int sgml_dollar    = 1 << 5;     // $
constexpr int sgml_percnt    = 1 << 6;     // %
constexpr int sgml_commat    = 1 << 7;     // @
constexpr int sgml_num       = 1 << 8;     // #
constexpr int sgml_lpar_rpar = 1 << 9;     // ( and )
constexpr int sgml_lcub_rcub = 1 << 10;    // { and }
constexpr int sgml_lsqb_rsqb = 1 << 11;    // [ and ]

// Ready-made combinations.
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_ajt_lemma = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt;
// constexpr int sgml_ajt_form = sgml_ajt_lemma;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
80
91 template <class T>
92 inline void sgml2wstrcat(
93 _Inout_ std::wstring& dst,
94 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
95 _In_ int skip = 0,
96 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
97 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
98 {
99 _Assume_(src || !count_src);
100
101 const bool
102 skip_quot = (skip & sgml_quot) == 0,
103 skip_apos = (skip & sgml_apos) == 0,
104 skip_amp = (skip & sgml_amp) == 0,
105 skip_lt_gt = (skip & sgml_lt_gt) == 0,
106 skip_bsol = (skip & sgml_bsol) == 0,
107 skip_dollar = (skip & sgml_dollar) == 0,
108 skip_percnt = (skip & sgml_percnt) == 0,
109 skip_commat = (skip & sgml_commat) == 0,
110 skip_num = (skip & sgml_num) == 0,
111 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
112 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
113 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
114
115 count_src = strnlen(src, count_src);
116 dst.reserve(dst.size() + count_src);
117 for (size_t i = 0; i < count_src;) {
118 if (src[i] == '&') {
119 auto end = sgmlend(src + i + 1, count_src - i - 1);
120 if (end) {
121 const wchar_t* entity_w;
122 wchar_t chr[3];
123 size_t n = end - src - i - 1;
124 if (n >= 2 && src[i + 1] == '#') {
125 uint32_t unicode;
126 if (src[i + 2] == 'x' || src[i + 2] == 'X')
127 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
128 else
129 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
130#ifdef _WIN32
131 if (unicode < 0x10000) {
132 chr[0] = (wchar_t)unicode;
133 chr[1] = 0;
134 }
135 else {
136 ucs4_to_surrogate_pair(chr, unicode);
137 chr[2] = 0;
138 }
139#else
140 chr[0] = (wchar_t)unicode;
141 chr[1] = 0;
142#endif
143 entity_w = chr;
144 }
145 else
146 entity_w = sgml2uni(src + i + 1, n);
147
148 if (entity_w &&
149 (skip_quot || (entity_w[0] != L'"')) &&
150 (skip_apos || (entity_w[0] != L'\'')) &&
151 (skip_amp || (entity_w[0] != L'&')) &&
152 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
153 (skip_bsol || (entity_w[0] != L'\\')) &&
154 (skip_dollar || (entity_w[0] != L'$')) &&
155 (skip_percnt || (entity_w[0] != L'%')) &&
156 (skip_commat || (entity_w[0] != L'@')) &&
157 (skip_num || (entity_w[0] != L'#')) &&
158 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
159 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
160 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
161 {
162 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
163 dst.append(entity_w);
164 i = end - src + 1;
165 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
166 continue;
167 }
168 }
169 }
170 dst.append(1, src[i++]);
171 }
172 }
173
174 template <class T>
175 _Deprecated_("Use stdex::sgml2wstrcat")
176 inline void sgml2wstr(
177 _Inout_ std::wstring& dst,
178 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
179 _In_ int skip = 0,
180 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
181 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
182 {
183 sgml2wstrcat(dst, src, count_src, skip, offset, map);
184 }
185
195 template <class T>
196 inline void sgml2wstrcat(
197 _Inout_ std::wstring& dst,
198 _In_ const std::basic_string<T>& src,
199 _In_ int skip = 0,
200 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
201 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
202 {
203 sgml2wstrcat(dst, src.data(), src.size(), skip, offset, map);
204 }
205
206 template <class T>
207 _Deprecated_("Use stdex::sgml2wstrcat")
208 inline void sgml2wstr(
209 _Inout_ std::wstring& dst,
210 _In_ const std::basic_string<T>& src,
211 _In_ int skip = 0,
212 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
213 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
214 {
215 sgml2wstrcat(dst, src, skip, offset, map);
216 }
217
231 template <class T>
232 inline size_t sgml2wstrcat(
233 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
234 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
235 _In_ int skip = 0,
236 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
237 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
238 {
239 _Assume_(dst || !count_dst);
240 _Assume_(src || !count_src);
241
242 static const std::invalid_argument buffer_overrun("buffer overrun");
243 const bool
244 skip_quot = (skip & sgml_quot) == 0,
245 skip_apos = (skip & sgml_apos) == 0,
246 skip_amp = (skip & sgml_amp) == 0,
247 skip_lt_gt = (skip & sgml_lt_gt) == 0,
248 skip_bsol = (skip & sgml_bsol) == 0,
249 skip_dollar = (skip & sgml_dollar) == 0,
250 skip_percnt = (skip & sgml_percnt) == 0,
251 skip_commat = (skip & sgml_commat) == 0,
252 skip_num = (skip & sgml_num) == 0,
253 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
254 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
255 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
256
257 size_t j = wcsnlen(dst, count_dst);
258 count_src = strnlen(src, count_src);
259 for (size_t i = 0; i < count_src;) {
260 if (src[i] == '&') {
261 auto end = sgmlend(src + i + 1, count_src - i - 1);
262 if (end) {
263 const wchar_t* entity_w;
264 wchar_t chr[3];
265 size_t n = end - src - i - 1;
266 if (n >= 2 && src[i + 1] == '#') {
267 uint32_t unicode;
268 if (src[i + 2] == 'x' || src[i + 2] == 'X')
269 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
270 else
271 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
272#ifdef _WIN32
273 if (unicode < 0x10000) {
274 chr[0] = (wchar_t)unicode;
275 chr[1] = 0;
276 }
277 else {
278 ucs4_to_surrogate_pair(chr, unicode);
279 chr[2] = 0;
280 }
281#else
282 chr[0] = (wchar_t)unicode;
283 chr[1] = 0;
284#endif
285 entity_w = chr;
286 }
287 else
288 entity_w = sgml2uni(src + i + 1, n);
289
290 if (entity_w &&
291 (skip_quot || (entity_w[0] != L'"')) &&
292 (skip_apos || (entity_w[0] != L'\'')) &&
293 (skip_amp || (entity_w[0] != L'&')) &&
294 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
295 (skip_bsol || (entity_w[0] != L'\\')) &&
296 (skip_dollar || (entity_w[0] != L'$')) &&
297 (skip_percnt || (entity_w[0] != L'%')) &&
298 (skip_commat || (entity_w[0] != L'@')) &&
299 (skip_num || (entity_w[0] != L'#')) &&
300 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
301 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
302 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
303 {
304 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
305 size_t m = wcslen(entity_w);
306 if (j + m >= count_dst)
307 throw buffer_overrun;
308 memcpy(dst + j, entity_w, m * sizeof(wchar_t)); j += m;
309 i = end - src + 1;
310 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
311 continue;
312 }
313 }
314 }
315 if (j + 1 >= count_dst)
316 throw buffer_overrun;
317 dst[j++] = src[i++];
318 }
319 if (j >= count_dst)
320 throw buffer_overrun;
321 dst[j] = 0;
322 return j;
323 }
324
325 template <class T>
326 _Deprecated_("Use stdex::sgml2wstrcat")
327 inline size_t sgml2wstr(
328 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
329 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
330 _In_ int skip = 0,
331 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
332 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
333 {
334 return sgml2wstrcat(dst, count_dst, src, count_src, skip, offset, map);
335 }
336
347 template <class T>
348 inline void sgml2wstrcpy(
349 _Inout_ std::wstring& dst,
350 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
351 _In_ int skip = 0,
352 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
353 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
354 {
355 dst.clear();
356 if (map)
357 map->clear();
358 sgml2wstrcat(dst, src, count_src, skip, offset, map);
359 }
360
370 template<class _Elem, class _Traits, class _Ax>
371 inline void sgml2wstrcpy(
372 _Inout_ std::wstring& dst,
373 _In_ const std::basic_string<_Elem, _Traits, _Ax>& src,
374 _In_ int skip = 0,
375 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
376 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
377 {
378 sgml2wstrcpy(dst, src.data(), src.size(), skip, offset, map);
379 }
380
394 template <class T>
395 inline size_t sgml2wstrcpy(
396 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
397 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
398 _In_ int skip = 0,
399 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
400 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
401 {
402 _Assume_(dst || !count_dst);
403 if (count_dst)
404 dst[0] = 0;
405 if (map)
406 map->clear();
407 return sgml2wstrcat(dst, count_dst, src, count_src, skip, offset, map);
408 }
409
421 template <class T>
422 inline std::wstring sgml2wstr(
423 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
424 _In_ int skip = 0,
425 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
426 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
427 {
428 std::wstring dst;
429 sgml2wstrcat(dst, src, count_src, skip, offset, map);
430 return dst;
431 }
432
443 template <class T>
444 inline std::wstring sgml2wstr(
445 _In_ const std::basic_string<T>& src,
446 _In_ int skip = 0,
447 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
448 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
449 {
450 return sgml2wstr(src.c_str(), src.size(), skip, offset, map);
451 }
452
454 inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
455 {
456 _Assume_(entity && count);
457
458 const wchar_t e2 = entity[0];
459 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
460 size_t m = (i + j) / 2;
461 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
462 if (e1 < e2)
463 i = m + 1;
464 else if (e1 > e2)
465 j = m;
466 else {
467 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
468 if (r < 0)
469 i = m + 1;
470 else if (r > 0)
471 j = m;
472 else {
473 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
474 return sgml_unicode[unicode_sgml[m]].sgml;
475 }
476 }
477 }
478 return nullptr;
479 }
481
490 inline void wstr2sgmlcat(
491 _Inout_ std::string& dst,
492 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
493 _In_ size_t what = 0)
494 {
495 _Assume_(src || !count_src);
496
497 const bool
498 do_ascii = (what & sgml_full) == 0,
499 do_quot = (what & sgml_quot) == 0,
500 do_apos = (what & sgml_apos) == 0,
501 do_lt_gt = (what & sgml_lt_gt) == 0,
502 do_bsol = (what & sgml_bsol) == 0,
503 do_dollar = (what & sgml_dollar) == 0,
504 do_percnt = (what & sgml_percnt) == 0,
505 do_commat = (what & sgml_commat) == 0,
506 do_num = (what & sgml_num) == 0,
507 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
508 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
509 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
510
511 count_src = wcsnlen(src, count_src);
512 dst.reserve(dst.size() + count_src);
513 for (size_t i = 0; i < count_src;) {
514 size_t n = glyphlen(src + i, count_src - i);
515 if (n == 1 &&
516 do_ascii && (unsigned int)src[i] < 128 &&
517 src[i] != L'&' &&
518 (do_quot || (src[i] != L'"')) &&
519 (do_apos || (src[i] != L'\'')) &&
520 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
521 (do_bsol || (src[i] != L'\\')) &&
522 (do_dollar || (src[i] != L'$')) &&
523 (do_percnt || (src[i] != L'%')) &&
524 (do_commat || (src[i] != L'@')) &&
525 (do_num || (src[i] != L'#')) &&
526 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
527 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
528 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
529 {
530 // 7-bit ASCII and no desire to encode it as an SGML entity.
531 dst.append(1, static_cast<char>(src[i++]));
532 }
533 else {
534 const char* entity = chr2sgml(src + i, n);
535 if (entity) {
536 dst.append(1, '&');
537 dst.append(entity);
538 dst.append(1, ';');
539 i += n;
540 }
541 else if (n == 1) {
542 // Trivial character (1 code unit, 1 glyph), no entity available.
543 if ((unsigned int)src[i] < 128)
544 dst.append(1, static_cast<char>(src[i++]));
545 else {
546 char tmp[3 + 8 + 1 + 1];
547 snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
548 dst.append(tmp);
549 }
550 }
551 else {
552 // Non-trivial character. Decompose.
553 const size_t end = i + n;
554 while (i < end) {
555 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
556 dst.append(1, '&');
557 dst.append(entity);
558 dst.append(1, ';');
559 i++;
560 }
561 else if ((unsigned int)src[i] < 128)
562 dst.append(1, static_cast<char>(src[i++]));
563 else {
564 uint32_t unicode;
565#ifdef _WIN32
566 if (i + 1 < end && is_surrogate_pair(src + i)) {
567 unicode = surrogate_pair_to_ucs4(src + i);
568 i += 2;
569 }
570 else
571#endif
572 {
573 unicode = src[i++];
574 }
575 char tmp[3 + 8 + 1 + 1];
576 snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
577 dst.append(tmp);
578 }
579 }
580 }
581 }
582 }
583 }
584
585 _Deprecated_("Use stdex::wstr2sgmlcat")
586 inline void wstr2sgml(
587 _Inout_ std::string& dst,
588 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
589 _In_ size_t what = 0)
590 {
591 wstr2sgmlcat(dst, src, count_src, what);
592 }
593
601 inline void wstr2sgmlcat(
602 _Inout_ std::string& dst,
603 _In_ const std::wstring& src,
604 _In_ size_t what = 0)
605 {
606 wstr2sgmlcat(dst, src.c_str(), src.size(), what);
607 }
608
609 _Deprecated_("Use stdex::wstr2sgmlcat")
610 inline void wstr2sgml(
611 _Inout_ std::string& dst,
612 _In_ const std::wstring& src,
613 _In_ size_t what = 0)
614 {
615 wstr2sgmlcat(dst, src, what);
616 }
617
629 inline size_t wstr2sgmlcat(
630 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
631 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
632 _In_ size_t what = 0)
633 {
634 _Assume_(dst || !count_dst);
635 _Assume_(src || !count_src);
636
637 static const std::invalid_argument buffer_overrun("buffer overrun");
638 const bool
639 do_ascii = (what & sgml_full) == 0,
640 do_quot = (what & sgml_quot) == 0,
641 do_apos = (what & sgml_apos) == 0,
642 do_lt_gt = (what & sgml_lt_gt) == 0,
643 do_bsol = (what & sgml_bsol) == 0,
644 do_dollar = (what & sgml_dollar) == 0,
645 do_percnt = (what & sgml_percnt) == 0,
646 do_commat = (what & sgml_commat) == 0,
647 do_num = (what & sgml_num) == 0,
648 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
649 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
650 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
651
652 size_t j = strnlen(dst, count_dst);
653 count_src = wcsnlen(src, count_src);
654 for (size_t i = 0; i < count_src;) {
655 size_t n = glyphlen(src + i, count_src - i);
656 if (n == 1 &&
657 do_ascii && (unsigned int)src[i] < 128 &&
658 src[i] != L'&' &&
659 (do_quot || (src[i] != L'"')) &&
660 (do_apos || (src[i] != L'\'')) &&
661 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
662 (do_bsol || (src[i] != L'\\')) &&
663 (do_dollar || (src[i] != L'$')) &&
664 (do_percnt || (src[i] != L'%')) &&
665 (do_commat || (src[i] != L'@')) &&
666 (do_num || (src[i] != L'#')) &&
667 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
668 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
669 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
670 {
671 // 7-bit ASCII and no desire to encode it as an SGML entity.
672 if (j + 1 >= count_dst)
673 throw buffer_overrun;
674 dst[j++] = static_cast<char>(src[i++]);
675 }
676 else {
677 const char* entity = chr2sgml(src + i, n);
678 if (entity) {
679 size_t m = strlen(entity);
680 if (j + m + 2 >= count_dst)
681 throw buffer_overrun;
682 dst[j++] = '&';
683 memcpy(dst + j, entity, m * sizeof(char)); j += m;
684 dst[j++] = ';';
685 i += n;
686 }
687 else if (n == 1) {
688 // Trivial character (1 code unit, 1 glyph), no entity available.
689 if ((unsigned int)src[i] < 128) {
690 if (j + 1 >= count_dst)
691 throw buffer_overrun;
692 dst[j++] = static_cast<char>(src[i++]);
693 }
694 else {
695 char tmp[3 + 8 + 1 + 1];
696 int m = snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
697 _Assume_(m >= 0);
698 if (static_cast<size_t>(m) >= count_dst)
699 throw buffer_overrun;
700 memcpy(dst + j, tmp, m * sizeof(char)); j += m;
701 }
702 }
703 else {
704 // Non-trivial character. Decompose.
705 const size_t end = i + n;
706 while (i < end) {
707 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
708 size_t m = strlen(entity);
709 if (j + m + 2 >= count_dst)
710 throw buffer_overrun;
711 dst[j++] = '&';
712 memcpy(dst + j, entity, m * sizeof(char)); j += m;
713 dst[j++] = ';';
714 i++;
715 }
716 else if ((unsigned int)src[i] < 128) {
717 if (j + 1 >= count_dst)
718 throw buffer_overrun;
719 dst[j++] = static_cast<char>(src[i++]);
720 }
721 else {
722 uint32_t unicode;
723#ifdef _WIN32
724 if (i + 1 < end && is_surrogate_pair(src + i)) {
725 unicode = surrogate_pair_to_ucs4(src + i);
726 i += 2;
727 }
728 else
729#endif
730 {
731 unicode = src[i++];
732 }
733 char tmp[3 + 8 + 1 + 1];
734 int m = snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
735 _Assume_(m >= 0);
736 if (static_cast<size_t>(m) >= count_dst)
737 throw buffer_overrun;
738 memcpy(dst + j, tmp, m * sizeof(char)); j += m;
739 }
740 }
741 }
742 }
743 }
744 if (j >= count_dst)
745 throw buffer_overrun;
746 dst[j] = 0;
747 return j;
748 }
749
750 _Deprecated_("Use stdex::wstr2sgmlcat")
751 inline size_t wstr2sgml(
752 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
753 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
754 _In_ size_t what = 0)
755 {
756 return wstr2sgmlcat(dst, count_dst, src, count_src, what);
757 }
758
767 inline void wstr2sgmlcpy(
768 _Inout_ std::string& dst,
769 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
770 _In_ size_t what = 0)
771 {
772 dst.clear();
773 wstr2sgmlcat(dst, src, count_src, what);
774 }
775
783 inline void wstr2sgmlcpy(
784 _Inout_ std::string& dst,
785 _In_ const std::wstring& src,
786 _In_ size_t what = 0)
787 {
788 wstr2sgmlcpy(dst, src.data(), src.size(), what);
789 }
790
802 inline size_t wstr2sgmlcpy(
803 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
804 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
805 _In_ size_t what = 0)
806 {
807 _Assume_(dst || !count_dst);
808 if (count_dst)
809 dst[0] = 0;
810 return wstr2sgmlcat(dst, count_dst, src, count_src, what);
811 }
812
822 inline std::string wstr2sgml(
823 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
824 _In_ size_t what = 0)
825 {
826 std::string dst;
827 wstr2sgmlcat(dst, src, count_src, what);
828 return dst;
829 }
830
839 inline std::string wstr2sgml(
840 _In_ const std::wstring& src,
841 _In_ size_t what = 0)
842 {
843 return wstr2sgml(src.c_str(), src.size(), what);
844 }
845}