stdex
Additional custom algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <string.h>
13#include <exception>
14#include <string_view>
15#include <string>
16
17namespace stdex
18{
20 template <class T>
21 const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
22 {
23 _Assume_(entity && count);
24 _Assume_(count < 2 || entity[0] != '#'); // No numeric entities
25
26 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
27 size_t m = (i + j) / 2;
28 if (sgml_unicode[m].sgml[0] < entity[0])
29 i = m + 1;
30 else if (sgml_unicode[m].sgml[0] > entity[0])
31 j = m;
32 else {
33 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
34 if (r < 0)
35 i = m + 1;
36 else if (r > 0)
37 j = m;
38 else {
39 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
40 return sgml_unicode[m].unicode;
41 }
42 }
43 }
44 return nullptr;
45 }
46
47 template <class T>
48 const T* sgmlend(
49 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
50 {
51 _Assume_(str || !count);
52 for (size_t i = 0; i < count; i++) {
53 if (str[i] == ';')
54 return str + i;
55 if (!str[i] || str[i] == '&' || isspace(str[i]))
56 break;
57 }
58 return nullptr;
59 }
61
62 constexpr int sgml_full = 0x40000000;
63 constexpr int sgml_quot = 0x00000001;
64 constexpr int sgml_apos = 0x00000002;
65 constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
66 constexpr int sgml_amp = 0x00000004;
67 constexpr int sgml_lt_gt = 0x00000008;
68 constexpr int sgml_bsol = 0x00000010;
69 constexpr int sgml_dollar = 0x00000020;
70 constexpr int sgml_percnt = 0x00000040;
71 constexpr int sgml_commat = 0x00000080;
72 constexpr int sgml_num = 0x00000100;
73 constexpr int sgml_lpar_rpar = 0x00000200;
74 constexpr int sgml_lcub_rcub = 0x00000400;
75 constexpr int sgml_lsqb_rsqb = 0x00000800;
76 constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
77 constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
78 constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
79 // constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
80
90 template <class T_from>
91 size_t sgmlerr(
92 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
93 _In_ int what = 0)
94 {
95 _Assume_(src || !count_src);
96
97 const bool
98 do_ascii = (what & sgml_full) == 0;
99
100 for (size_t i = 0; i < count_src && src[i];) {
101 if (src[i] == '&') {
102 auto end = sgmlend(src + i + 1, count_src - i - 1);
103 if (end) {
104 const wchar_t* entity_w;
105 wchar_t chr[3];
106 size_t n = end - src - i - 1;
107 if (n >= 2 && src[i + 1] == '#') {
108 utf32_t unicode;
109 if (src[i + 2] == 'x' || src[i + 2] == 'X')
110 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
111 else
112 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
113#ifdef _WIN32
114 if (unicode < 0x10000) {
115 chr[0] = (wchar_t)unicode;
116 chr[1] = 0;
117 }
118 else {
119 ucs4_to_surrogate_pair(chr, unicode);
120 chr[2] = 0;
121 }
122#else
123 chr[0] = (wchar_t)unicode;
124 chr[1] = 0;
125#endif
126 entity_w = chr;
127 }
128 else
129 entity_w = sgml2uni(src + i + 1, n);
130
131 if (entity_w) {
132 i = end - src + 1;
133 continue;
134 }
135
136 // Unknown entity.
137 return i;
138 }
139
140 // Unterminated entity.
141 return i;
142 }
143
144 if (do_ascii && !is7bit(src[i])) {
145 // Non-ASCII character
146 return i;
147 }
148 i++;
149 }
150
151 return npos;
152 }
153
164 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>>
165 void sgml2strcat(
166 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
167 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
168 _In_ int skip = 0,
169 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
170 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
171 {
172 _Assume_(src || !count_src);
173
174 const bool
175 skip_quot = (skip & sgml_quot) == 0,
176 skip_apos = (skip & sgml_apos) == 0,
177 skip_amp = (skip & sgml_amp) == 0,
178 skip_lt_gt = (skip & sgml_lt_gt) == 0,
179 skip_bsol = (skip & sgml_bsol) == 0,
180 skip_dollar = (skip & sgml_dollar) == 0,
181 skip_percnt = (skip & sgml_percnt) == 0,
182 skip_commat = (skip & sgml_commat) == 0,
183 skip_num = (skip & sgml_num) == 0,
184 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
185 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
186 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
187
188 count_src = strnlen(src, count_src);
189 dst.reserve(dst.size() + count_src);
190 for (size_t i = 0; i < count_src;) {
191 if (src[i] == '&') {
192 auto end = sgmlend(src + i + 1, count_src - i - 1);
193 if (end) {
194 const wchar_t* entity_w;
195 wchar_t chr[3];
196 _Assume_(src + i + 1 <= end);
197 size_t n = static_cast<size_t>(end - src) - i - 1;
198 if (n >= 2 && src[i + 1] == '#') {
199 utf32_t unicode;
200 if (src[i + 2] == 'x' || src[i + 2] == 'X')
201 unicode = static_cast<utf32_t>(strtou32(src + i + 3, n - 2, nullptr, 16));
202 else
203 unicode = static_cast<utf32_t>(strtou32(src + i + 2, n - 1, nullptr, 10));
204#ifdef _WIN32
205 if (unicode < 0x10000) {
206 chr[0] = (wchar_t)unicode;
207 chr[1] = 0;
208 }
209 else {
210 ucs4_to_surrogate_pair(chr, unicode);
211 chr[2] = 0;
212 }
213#else
214 chr[0] = (wchar_t)unicode;
215 chr[1] = 0;
216#endif
217 entity_w = chr;
218 }
219 else
220 entity_w = sgml2uni(src + i + 1, n);
221
222 if (entity_w &&
223 (skip_quot || (entity_w[0] != L'"')) &&
224 (skip_apos || (entity_w[0] != L'\'')) &&
225 (skip_amp || (entity_w[0] != L'&')) &&
226 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
227 (skip_bsol || (entity_w[0] != L'\\')) &&
228 (skip_dollar || (entity_w[0] != L'$')) &&
229 (skip_percnt || (entity_w[0] != L'%')) &&
230 (skip_commat || (entity_w[0] != L'@')) &&
231 (skip_num || (entity_w[0] != L'#')) &&
232 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
233 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
234 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
235 {
236 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
237 dst.append(entity_w);
238 _Assume_(src <= end);
239 i = static_cast<size_t>(end - src) + 1;
240 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
241 continue;
242 }
243 }
244 }
245 dst.append(1, src[i++]);
246 }
247 }
248
258 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
259 void sgml2strcat(
260 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
261 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
262 _In_ int skip = 0,
263 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
264 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
265 {
266 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
267 }
268
282 template <class T_from>
283 size_t sgml2strcat(
284 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
285 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
286 _In_ int skip = 0,
287 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
288 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
289 {
290 _Assume_(dst || !count_dst);
291 _Assume_(src || !count_src);
292
293 static const std::invalid_argument buffer_overrun("buffer overrun");
294 const bool
295 skip_quot = (skip & sgml_quot) == 0,
296 skip_apos = (skip & sgml_apos) == 0,
297 skip_amp = (skip & sgml_amp) == 0,
298 skip_lt_gt = (skip & sgml_lt_gt) == 0,
299 skip_bsol = (skip & sgml_bsol) == 0,
300 skip_dollar = (skip & sgml_dollar) == 0,
301 skip_percnt = (skip & sgml_percnt) == 0,
302 skip_commat = (skip & sgml_commat) == 0,
303 skip_num = (skip & sgml_num) == 0,
304 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
305 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
306 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
307
308 size_t j = strnlen(dst, count_dst);
309 count_src = strnlen(src, count_src);
310 for (size_t i = 0; i < count_src;) {
311 if (src[i] == '&') {
312 auto end = sgmlend(src + i + 1, count_src - i - 1);
313 if (end) {
314 const wchar_t* entity_w;
315 wchar_t chr[3];
316 size_t n = end - src - i - 1;
317 if (n >= 2 && src[i + 1] == '#') {
318 utf32_t unicode;
319 if (src[i + 2] == 'x' || src[i + 2] == 'X')
320 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
321 else
322 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
323#ifdef _WIN32
324 if (unicode < 0x10000) {
325 chr[0] = (wchar_t)unicode;
326 chr[1] = 0;
327 }
328 else {
329 ucs4_to_surrogate_pair(chr, unicode);
330 chr[2] = 0;
331 }
332#else
333 chr[0] = (wchar_t)unicode;
334 chr[1] = 0;
335#endif
336 entity_w = chr;
337 }
338 else
339 entity_w = sgml2uni(src + i + 1, n);
340
341 if (entity_w &&
342 (skip_quot || (entity_w[0] != L'"')) &&
343 (skip_apos || (entity_w[0] != L'\'')) &&
344 (skip_amp || (entity_w[0] != L'&')) &&
345 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
346 (skip_bsol || (entity_w[0] != L'\\')) &&
347 (skip_dollar || (entity_w[0] != L'$')) &&
348 (skip_percnt || (entity_w[0] != L'%')) &&
349 (skip_commat || (entity_w[0] != L'@')) &&
350 (skip_num || (entity_w[0] != L'#')) &&
351 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
352 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
353 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
354 {
355 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
356 size_t m = wcslen(entity_w);
357 if (j + m >= count_dst)
358 throw buffer_overrun;
359 memcpy(dst + j, entity_w, m * sizeof(wchar_t)); j += m;
360 i = end - src + 1;
361 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
362 continue;
363 }
364 }
365 }
366 if (j + 1 >= count_dst)
367 throw buffer_overrun;
368 dst[j++] = src[i++];
369 }
370 if (j >= count_dst)
371 throw buffer_overrun;
372 dst[j] = 0;
373 return j;
374 }
375
386 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>>
387 void sgml2strcpy(
388 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
389 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
390 _In_ int skip = 0,
391 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
392 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
393 {
394 dst.clear();
395 if (map)
396 map->clear();
397 sgml2strcat(dst, src, count_src, skip, offset, map);
398 }
399
409 template<class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
410 void sgml2strcpy(
411 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
412 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
413 _In_ int skip = 0,
414 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
415 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
416 {
417 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
418 }
419
433 template <class T_from>
434 size_t sgml2strcpy(
435 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
436 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
437 _In_ int skip = 0,
438 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
439 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
440 {
441 _Assume_(dst || !count_dst);
442 if (count_dst)
443 dst[0] = 0;
444 if (map)
445 map->clear();
446 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
447 }
448
460 template <class T_from>
461 std::wstring sgml2str(
462 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
463 _In_ int skip = 0,
464 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
465 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
466 {
467 std::wstring dst;
468 sgml2strcat(dst, src, count_src, skip, offset, map);
469 return dst;
470 }
471
482 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
483 std::wstring sgml2str(
484 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
485 _In_ int skip = 0,
486 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
487 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
488 {
489 return sgml2str(src.data(), src.size(), skip, offset, map);
490 }
491
493 inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
494 {
495 _Assume_(entity && count);
496
497 const wchar_t e2 = entity[0];
498 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
499 size_t m = (i + j) / 2;
500 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
501 if (e1 < e2)
502 i = m + 1;
503 else if (e1 > e2)
504 j = m;
505 else {
506 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
507 if (r < 0)
508 i = m + 1;
509 else if (r > 0)
510 j = m;
511 else {
512 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
513 return sgml_unicode[unicode_sgml[m]].sgml;
514 }
515 }
516 }
517 return nullptr;
518 }
520
529 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
530 inline void str2sgmlcat(
531 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
532 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
533 _In_ int what = 0)
534 {
535 _Assume_(src || !count_src);
536
537 const bool
538 do_ascii = (what & sgml_full) == 0,
539 do_quot = (what & sgml_quot) == 0,
540 do_apos = (what & sgml_apos) == 0,
541 do_lt_gt = (what & sgml_lt_gt) == 0,
542 do_bsol = (what & sgml_bsol) == 0,
543 do_dollar = (what & sgml_dollar) == 0,
544 do_percnt = (what & sgml_percnt) == 0,
545 do_commat = (what & sgml_commat) == 0,
546 do_num = (what & sgml_num) == 0,
547 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
548 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
549 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
550
551 count_src = strnlen(src, count_src);
552 dst.reserve(dst.size() + count_src);
553 for (size_t i = 0; i < count_src;) {
554 size_t n = glyphlen(src + i, count_src - i);
555 if (n == 1 &&
556 do_ascii && is7bit(src[i]) &&
557 src[i] != L'&' &&
558 (do_quot || (src[i] != L'"')) &&
559 (do_apos || (src[i] != L'\'')) &&
560 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
561 (do_bsol || (src[i] != L'\\')) &&
562 (do_dollar || (src[i] != L'$')) &&
563 (do_percnt || (src[i] != L'%')) &&
564 (do_commat || (src[i] != L'@')) &&
565 (do_num || (src[i] != L'#')) &&
566 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
567 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
568 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
569 {
570 // 7-bit ASCII and no desire to encode it as an SGML entity.
571 dst.append(1, static_cast<char>(src[i++]));
572 }
573 else {
574 const char* entity = chr2sgml(src + i, n);
575 if (entity) {
576 dst.append(1, '&');
577 dst.append(entity);
578 dst.append(1, ';');
579 i += n;
580 }
581 else if (n == 1) {
582 // Trivial character (1 code unit, 1 glyph), no entity available.
583 if (is7bit(src[i]))
584 dst.append(1, static_cast<char>(src[i++]));
585 else {
586 char tmp[3 + 8 + 1 + 1];
587 snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
588 dst.append(tmp);
589 }
590 }
591 else {
592 // Non-trivial character. Decompose.
593 const size_t end = i + n;
594 while (i < end) {
595 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
596 dst.append(1, '&');
597 dst.append(entity);
598 dst.append(1, ';');
599 i++;
600 }
601 else if (is7bit(src[i]))
602 dst.append(1, static_cast<char>(src[i++]));
603 else {
604 utf32_t unicode;
605#ifdef _WIN32
606 if (i + 1 < end && is_surrogate_pair(src + i)) {
607 unicode = surrogate_pair_to_ucs4(src + i);
608 i += 2;
609 }
610 else
611#endif
612 {
613 unicode = src[i++];
614 }
615 char tmp[3 + 8 + 1 + 1];
616 snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
617 dst.append(tmp);
618 }
619 }
620 }
621 }
622 }
623 }
624
632 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
633 void str2sgmlcat(
634 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
635 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
636 _In_ int what = 0)
637 {
638 str2sgmlcat(dst, src.data(), src.size(), what);
639 }
640
652 inline size_t str2sgmlcat(
653 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
654 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
655 _In_ int what = 0)
656 {
657 _Assume_(dst || !count_dst);
658 _Assume_(src || !count_src);
659
660 static const std::invalid_argument buffer_overrun("buffer overrun");
661 const bool
662 do_ascii = (what & sgml_full) == 0,
663 do_quot = (what & sgml_quot) == 0,
664 do_apos = (what & sgml_apos) == 0,
665 do_lt_gt = (what & sgml_lt_gt) == 0,
666 do_bsol = (what & sgml_bsol) == 0,
667 do_dollar = (what & sgml_dollar) == 0,
668 do_percnt = (what & sgml_percnt) == 0,
669 do_commat = (what & sgml_commat) == 0,
670 do_num = (what & sgml_num) == 0,
671 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
672 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
673 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
674
675 size_t j = strnlen(dst, count_dst);
676 count_src = strnlen(src, count_src);
677 for (size_t i = 0; i < count_src;) {
678 size_t n = glyphlen(src + i, count_src - i);
679 if (n == 1 &&
680 do_ascii && is7bit(src[i]) &&
681 src[i] != L'&' &&
682 (do_quot || (src[i] != L'"')) &&
683 (do_apos || (src[i] != L'\'')) &&
684 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
685 (do_bsol || (src[i] != L'\\')) &&
686 (do_dollar || (src[i] != L'$')) &&
687 (do_percnt || (src[i] != L'%')) &&
688 (do_commat || (src[i] != L'@')) &&
689 (do_num || (src[i] != L'#')) &&
690 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
691 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
692 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
693 {
694 // 7-bit ASCII and no desire to encode it as an SGML entity.
695 if (j + 1 >= count_dst)
696 throw buffer_overrun;
697 dst[j++] = static_cast<char>(src[i++]);
698 }
699 else {
700 const char* entity = chr2sgml(src + i, n);
701 if (entity) {
702 size_t m = strlen(entity);
703 if (j + m + 2 >= count_dst)
704 throw buffer_overrun;
705 dst[j++] = '&';
706 memcpy(dst + j, entity, m * sizeof(char)); j += m;
707 dst[j++] = ';';
708 i += n;
709 }
710 else if (n == 1) {
711 // Trivial character (1 code unit, 1 glyph), no entity available.
712 if (is7bit(src[i])) {
713 if (j + 1 >= count_dst)
714 throw buffer_overrun;
715 dst[j++] = static_cast<char>(src[i++]);
716 }
717 else {
718 char tmp[3 + 8 + 1 + 1];
719 int m = snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
720 _Assume_(m >= 0);
721 if (static_cast<size_t>(m) >= count_dst)
722 throw buffer_overrun;
723 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
724 j += static_cast<size_t>(m);
725 }
726 }
727 else {
728 // Non-trivial character. Decompose.
729 const size_t end = i + n;
730 while (i < end) {
731 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
732 size_t m = strlen(entity);
733 if (j + m + 2 >= count_dst)
734 throw buffer_overrun;
735 dst[j++] = '&';
736 memcpy(dst + j, entity, m * sizeof(char)); j += m;
737 dst[j++] = ';';
738 i++;
739 }
740 else if (is7bit(src[i])) {
741 if (j + 1 >= count_dst)
742 throw buffer_overrun;
743 dst[j++] = static_cast<char>(src[i++]);
744 }
745 else {
746 utf32_t unicode;
747#ifdef _WIN32
748 if (i + 1 < end && is_surrogate_pair(src + i)) {
749 unicode = surrogate_pair_to_ucs4(src + i);
750 i += 2;
751 }
752 else
753#endif
754 {
755 unicode = src[i++];
756 }
757 char tmp[3 + 8 + 1 + 1];
758 int m = snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
759 _Assume_(m >= 0);
760 if (static_cast<size_t>(m) >= count_dst)
761 throw buffer_overrun;
762 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
763 j += static_cast<size_t>(m);
764 }
765 }
766 }
767 }
768 }
769 if (j >= count_dst)
770 throw buffer_overrun;
771 dst[j] = 0;
772 return j;
773 }
774
783 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
784 inline void str2sgmlcpy(
785 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
786 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
787 _In_ int what = 0)
788 {
789 dst.clear();
790 str2sgmlcat(dst, src, count_src, what);
791 }
792
800 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
801 void str2sgmlcpy(
802 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
803 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
804 _In_ int what = 0)
805 {
806 str2sgmlcpy(dst, src.data(), src.size(), what);
807 }
808
820 inline size_t str2sgmlcpy(
821 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
822 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
823 _In_ int what = 0)
824 {
825 _Assume_(dst || !count_dst);
826 if (count_dst)
827 dst[0] = 0;
828 return str2sgmlcat(dst, count_dst, src, count_src, what);
829 }
830
840 inline std::string str2sgml(
841 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
842 _In_ int what = 0)
843 {
844 std::string dst;
845 str2sgmlcat(dst, src, count_src, what);
846 return dst;
847 }
848
857 inline std::string str2sgml(
858 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
859 _In_ int what = 0)
860 {
861 return str2sgml(src.data(), src.size(), what);
862 }
863}