stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <string.h>
13#include <exception>
14#include <string_view>
15#include <string>
16
17#if defined(__GNUC__)
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
20#endif
21
22namespace stdex
23{
25 template <class T>
26 const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
27 {
28 _Assume_(entity && count);
29 _Assume_(count < 2 || entity[0] != '#'); // No numeric entities
30
31 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
33 if (sgml_unicode[m].sgml[0] < entity[0])
34 i = m + 1;
35 else if (sgml_unicode[m].sgml[0] > entity[0])
36 j = m;
37 else {
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
39 if (r < 0)
40 i = m + 1;
41 else if (r > 0)
42 j = m;
43 else {
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return sgml_unicode[m].unicode;
46 }
47 }
48 }
49 return nullptr;
50 }
51
52 template <class T>
53 const T* sgmlend(
54 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
55 {
56 _Assume_(str || !count);
57 for (size_t i = 0; i < count; i++) {
58 if (str[i] == ';')
59 return str + i;
60 if (!str[i] || str[i] == '&' || isspace(str[i]))
61 break;
62 }
63 return nullptr;
64 }
66
// Bit flags shared by the SGML conversion functions in this file.
// The decoding functions take them as `skip` (entities of the flagged characters
// are left encoded); the encoding functions take them as `what` (the flagged
// characters are written as entities).
constexpr int sgml_full      = 1 << 30;                 // Encoders: do not pass 7-bit ASCII through unencoded
constexpr int sgml_quot      = 1 << 0;                  // "
constexpr int sgml_apos      = 1 << 1;                  // '
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;   // " and '
constexpr int sgml_amp       = 1 << 2;                  // &
constexpr int sgml_lt_gt     = 1 << 3;                  // < and >
constexpr int sgml_bsol      = 1 << 4;                  // Backslash
constexpr int sgml_dollar    = 1 << 5;                  // $
constexpr int sgml_percnt    = 1 << 6;                  // %
constexpr int sgml_commat    = 1 << 7;                  // @
constexpr int sgml_num       = 1 << 8;                  // #
constexpr int sgml_lpar_rpar = 1 << 9;                  // ( and )
constexpr int sgml_lcub_rcub = 1 << 10;                 // { and }
constexpr int sgml_lsqb_rsqb = 1 << 11;                 // [ and ]

// Ready-made combinations.
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
85
95 template <class T_from>
96 size_t sgmlerr(
97 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
98 _In_ int what = 0)
99 {
100 _Assume_(src || !count_src);
101
102 const bool
103 do_ascii = (what & sgml_full) == 0;
104
105 for (size_t i = 0; i < count_src && src[i];) {
106 if (src[i] == '&') {
107 auto end = sgmlend(src + i + 1, count_src - i - 1);
108 if (end) {
109 const wchar_t* entity_w;
110 wchar_t chr[3];
111 size_t n = end - src - i - 1;
112 if (n >= 2 && src[i + 1] == '#') {
113 utf32_t unicode;
114 if (src[i + 2] == 'x' || src[i + 2] == 'X')
115 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
116 else
117 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
118#ifdef _WIN32
119 if (unicode < 0x10000) {
120 chr[0] = (wchar_t)unicode;
121 chr[1] = 0;
122 }
123 else {
124 ucs4_to_surrogate_pair(chr, unicode);
125 chr[2] = 0;
126 }
127#else
128 chr[0] = (wchar_t)unicode;
129 chr[1] = 0;
130#endif
131 entity_w = chr;
132 }
133 else
134 entity_w = sgml2uni(src + i + 1, n);
135
136 if (entity_w) {
137 i = end - src + 1;
138 continue;
139 }
140
141 // Unknown entity.
142 return i;
143 }
144
145 // Unterminated entity.
146 return i;
147 }
148
149 if (do_ascii && !is7bit(src[i])) {
150 // Non-ASCII character
151 return i;
152 }
153 i++;
154 }
155
156 return npos;
157 }
158
169 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>>
170 void sgml2strcat(
171 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
172 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
173 _In_ int skip = 0,
174 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
175 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
176 {
177 _Assume_(src || !count_src);
178
179 const bool
180 skip_quot = (skip & sgml_quot) == 0,
181 skip_apos = (skip & sgml_apos) == 0,
182 skip_amp = (skip & sgml_amp) == 0,
183 skip_lt_gt = (skip & sgml_lt_gt) == 0,
184 skip_bsol = (skip & sgml_bsol) == 0,
185 skip_dollar = (skip & sgml_dollar) == 0,
186 skip_percnt = (skip & sgml_percnt) == 0,
187 skip_commat = (skip & sgml_commat) == 0,
188 skip_num = (skip & sgml_num) == 0,
189 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
190 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
191 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
192
193 count_src = strnlen(src, count_src);
194 dst.reserve(dst.size() + count_src);
195 for (size_t i = 0; i < count_src;) {
196 if (src[i] == '&') {
197 auto end = sgmlend(src + i + 1, count_src - i - 1);
198 if (end) {
199 const wchar_t* entity_w;
200 wchar_t chr[3];
201 _Assume_(src + i + 1 <= end);
202 size_t n = static_cast<size_t>(end - src) - i - 1;
203 if (n >= 2 && src[i + 1] == '#') {
204 utf32_t unicode;
205 if (src[i + 2] == 'x' || src[i + 2] == 'X')
206 unicode = static_cast<utf32_t>(strtou32(src + i + 3, n - 2, nullptr, 16));
207 else
208 unicode = static_cast<utf32_t>(strtou32(src + i + 2, n - 1, nullptr, 10));
209#ifdef _WIN32
210 if (unicode < 0x10000) {
211 chr[0] = (wchar_t)unicode;
212 chr[1] = 0;
213 }
214 else {
215 ucs4_to_surrogate_pair(chr, unicode);
216 chr[2] = 0;
217 }
218#else
219 chr[0] = (wchar_t)unicode;
220 chr[1] = 0;
221#endif
222 entity_w = chr;
223 }
224 else
225 entity_w = sgml2uni(src + i + 1, n);
226
227 if (entity_w &&
228 (skip_quot || (entity_w[0] != L'"')) &&
229 (skip_apos || (entity_w[0] != L'\'')) &&
230 (skip_amp || (entity_w[0] != L'&')) &&
231 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
232 (skip_bsol || (entity_w[0] != L'\\')) &&
233 (skip_dollar || (entity_w[0] != L'$')) &&
234 (skip_percnt || (entity_w[0] != L'%')) &&
235 (skip_commat || (entity_w[0] != L'@')) &&
236 (skip_num || (entity_w[0] != L'#')) &&
237 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
238 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
239 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
240 {
241 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
242 dst.append(entity_w);
243 _Assume_(src <= end);
244 i = static_cast<size_t>(end - src) + 1;
245 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
246 continue;
247 }
248 }
249 }
250 dst.append(1, src[i++]);
251 }
252 }
253
263 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
264 void sgml2strcat(
265 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
266 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
267 _In_ int skip = 0,
268 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
269 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
270 {
271 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
272 }
273
287 template <class T_from>
288 size_t sgml2strcat(
289 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
290 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
291 _In_ int skip = 0,
292 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
293 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
294 {
295 _Assume_(dst || !count_dst);
296 _Assume_(src || !count_src);
297
298 static const std::invalid_argument buffer_overrun("buffer overrun");
299 const bool
300 skip_quot = (skip & sgml_quot) == 0,
301 skip_apos = (skip & sgml_apos) == 0,
302 skip_amp = (skip & sgml_amp) == 0,
303 skip_lt_gt = (skip & sgml_lt_gt) == 0,
304 skip_bsol = (skip & sgml_bsol) == 0,
305 skip_dollar = (skip & sgml_dollar) == 0,
306 skip_percnt = (skip & sgml_percnt) == 0,
307 skip_commat = (skip & sgml_commat) == 0,
308 skip_num = (skip & sgml_num) == 0,
309 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
310 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
311 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
312
313 size_t j = strnlen(dst, count_dst);
314 count_src = strnlen(src, count_src);
315 for (size_t i = 0; i < count_src;) {
316 if (src[i] == '&') {
317 auto end = sgmlend(src + i + 1, count_src - i - 1);
318 if (end) {
319 const wchar_t* entity_w;
320 wchar_t chr[3];
321 size_t n = end - src - i - 1;
322 if (n >= 2 && src[i + 1] == '#') {
323 utf32_t unicode;
324 if (src[i + 2] == 'x' || src[i + 2] == 'X')
325 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
326 else
327 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
328#ifdef _WIN32
329 if (unicode < 0x10000) {
330 chr[0] = (wchar_t)unicode;
331 chr[1] = 0;
332 }
333 else {
334 ucs4_to_surrogate_pair(chr, unicode);
335 chr[2] = 0;
336 }
337#else
338 chr[0] = (wchar_t)unicode;
339 chr[1] = 0;
340#endif
341 entity_w = chr;
342 }
343 else
344 entity_w = sgml2uni(src + i + 1, n);
345
346 if (entity_w &&
347 (skip_quot || (entity_w[0] != L'"')) &&
348 (skip_apos || (entity_w[0] != L'\'')) &&
349 (skip_amp || (entity_w[0] != L'&')) &&
350 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
351 (skip_bsol || (entity_w[0] != L'\\')) &&
352 (skip_dollar || (entity_w[0] != L'$')) &&
353 (skip_percnt || (entity_w[0] != L'%')) &&
354 (skip_commat || (entity_w[0] != L'@')) &&
355 (skip_num || (entity_w[0] != L'#')) &&
356 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
357 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
358 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
359 {
360 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
361 size_t m = wcslen(entity_w);
362 if (j + m >= count_dst)
363 throw buffer_overrun;
364 memcpy(dst + j, entity_w, m * sizeof(wchar_t)); j += m;
365 i = end - src + 1;
366 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
367 continue;
368 }
369 }
370 }
371 if (j + 1 >= count_dst)
372 throw buffer_overrun;
373 dst[j++] = src[i++];
374 }
375 if (j >= count_dst)
376 throw buffer_overrun;
377 dst[j] = 0;
378 return j;
379 }
380
391 template <class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>>
392 void sgml2strcpy(
393 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
394 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
395 _In_ int skip = 0,
396 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
397 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
398 {
399 dst.clear();
400 if (map)
401 map->clear();
402 sgml2strcat(dst, src, count_src, skip, offset, map);
403 }
404
414 template<class T_from, class TR_to = std::char_traits<wchar_t>, class AX_to = std::allocator<wchar_t>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
415 void sgml2strcpy(
416 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
417 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
418 _In_ int skip = 0,
419 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
420 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
421 {
422 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
423 }
424
438 template <class T_from>
439 size_t sgml2strcpy(
440 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
441 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
442 _In_ int skip = 0,
443 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
444 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
445 {
446 _Assume_(dst || !count_dst);
447 if (count_dst)
448 dst[0] = 0;
449 if (map)
450 map->clear();
451 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
452 }
453
465 template <class T_from>
466 std::wstring sgml2str(
467 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
468 _In_ int skip = 0,
469 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
470 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
471 {
472 std::wstring dst;
473 sgml2strcat(dst, src, count_src, skip, offset, map);
474 return dst;
475 }
476
487 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
488 std::wstring sgml2str(
489 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
490 _In_ int skip = 0,
491 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
492 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
493 {
494 return sgml2str(src.data(), src.size(), skip, offset, map);
495 }
496
498 inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
499 {
500 _Assume_(entity && count);
501
502 const wchar_t e2 = entity[0];
503 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
504 size_t m = (i + j) / 2;
505 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
506 if (e1 < e2)
507 i = m + 1;
508 else if (e1 > e2)
509 j = m;
510 else {
511 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
512 if (r < 0)
513 i = m + 1;
514 else if (r > 0)
515 j = m;
516 else {
517 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
518 return sgml_unicode[unicode_sgml[m]].sgml;
519 }
520 }
521 }
522 return nullptr;
523 }
525
534 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
535 inline void str2sgmlcat(
536 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
537 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
538 _In_ int what = 0)
539 {
540 _Assume_(src || !count_src);
541
542 const bool
543 do_ascii = (what & sgml_full) == 0,
544 do_quot = (what & sgml_quot) == 0,
545 do_apos = (what & sgml_apos) == 0,
546 do_lt_gt = (what & sgml_lt_gt) == 0,
547 do_bsol = (what & sgml_bsol) == 0,
548 do_dollar = (what & sgml_dollar) == 0,
549 do_percnt = (what & sgml_percnt) == 0,
550 do_commat = (what & sgml_commat) == 0,
551 do_num = (what & sgml_num) == 0,
552 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
553 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
554 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
555
556 count_src = strnlen(src, count_src);
557 dst.reserve(dst.size() + count_src);
558 for (size_t i = 0; i < count_src;) {
559 size_t n = glyphlen(src + i, count_src - i);
560 if (n == 1 &&
561 do_ascii && is7bit(src[i]) &&
562 src[i] != L'&' &&
563 (do_quot || (src[i] != L'"')) &&
564 (do_apos || (src[i] != L'\'')) &&
565 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
566 (do_bsol || (src[i] != L'\\')) &&
567 (do_dollar || (src[i] != L'$')) &&
568 (do_percnt || (src[i] != L'%')) &&
569 (do_commat || (src[i] != L'@')) &&
570 (do_num || (src[i] != L'#')) &&
571 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
572 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
573 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
574 {
575 // 7-bit ASCII and no desire to encode it as an SGML entity.
576 dst.append(1, static_cast<char>(src[i++]));
577 }
578 else {
579 const char* entity = chr2sgml(src + i, n);
580 if (entity) {
581 dst.append(1, '&');
582 dst.append(entity);
583 dst.append(1, ';');
584 i += n;
585 }
586 else if (n == 1) {
587 // Trivial character (1 code unit, 1 glyph), no entity available.
588 if (is7bit(src[i]))
589 dst.append(1, static_cast<char>(src[i++]));
590 else {
591 char tmp[3 + 8 + 1 + 1];
592 snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
593 dst.append(tmp);
594 }
595 }
596 else {
597 // Non-trivial character. Decompose.
598 const size_t end = i + n;
599 while (i < end) {
600 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
601 dst.append(1, '&');
602 dst.append(entity);
603 dst.append(1, ';');
604 i++;
605 }
606 else if (is7bit(src[i]))
607 dst.append(1, static_cast<char>(src[i++]));
608 else {
609 utf32_t unicode;
610#ifdef _WIN32
611 if (i + 1 < end && is_surrogate_pair(src + i)) {
612 unicode = surrogate_pair_to_ucs4(src + i);
613 i += 2;
614 }
615 else
616#endif
617 {
618 unicode = src[i++];
619 }
620 char tmp[3 + 8 + 1 + 1];
621 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(unicode));
622 dst.append(tmp);
623 }
624 }
625 }
626 }
627 }
628 }
629
637 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
638 void str2sgmlcat(
639 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
640 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
641 _In_ int what = 0)
642 {
643 str2sgmlcat(dst, src.data(), src.size(), what);
644 }
645
657 inline size_t str2sgmlcat(
658 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
659 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
660 _In_ int what = 0)
661 {
662 _Assume_(dst || !count_dst);
663 _Assume_(src || !count_src);
664
665 static const std::invalid_argument buffer_overrun("buffer overrun");
666 const bool
667 do_ascii = (what & sgml_full) == 0,
668 do_quot = (what & sgml_quot) == 0,
669 do_apos = (what & sgml_apos) == 0,
670 do_lt_gt = (what & sgml_lt_gt) == 0,
671 do_bsol = (what & sgml_bsol) == 0,
672 do_dollar = (what & sgml_dollar) == 0,
673 do_percnt = (what & sgml_percnt) == 0,
674 do_commat = (what & sgml_commat) == 0,
675 do_num = (what & sgml_num) == 0,
676 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
677 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
678 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
679
680 size_t j = strnlen(dst, count_dst);
681 count_src = strnlen(src, count_src);
682 for (size_t i = 0; i < count_src;) {
683 size_t n = glyphlen(src + i, count_src - i);
684 if (n == 1 &&
685 do_ascii && is7bit(src[i]) &&
686 src[i] != L'&' &&
687 (do_quot || (src[i] != L'"')) &&
688 (do_apos || (src[i] != L'\'')) &&
689 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
690 (do_bsol || (src[i] != L'\\')) &&
691 (do_dollar || (src[i] != L'$')) &&
692 (do_percnt || (src[i] != L'%')) &&
693 (do_commat || (src[i] != L'@')) &&
694 (do_num || (src[i] != L'#')) &&
695 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
696 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
697 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
698 {
699 // 7-bit ASCII and no desire to encode it as an SGML entity.
700 if (j + 1 >= count_dst)
701 throw buffer_overrun;
702 dst[j++] = static_cast<char>(src[i++]);
703 }
704 else {
705 const char* entity = chr2sgml(src + i, n);
706 if (entity) {
707 size_t m = strlen(entity);
708 if (j + m + 2 >= count_dst)
709 throw buffer_overrun;
710 dst[j++] = '&';
711 memcpy(dst + j, entity, m * sizeof(char)); j += m;
712 dst[j++] = ';';
713 i += n;
714 }
715 else if (n == 1) {
716 // Trivial character (1 code unit, 1 glyph), no entity available.
717 if (is7bit(src[i])) {
718 if (j + 1 >= count_dst)
719 throw buffer_overrun;
720 dst[j++] = static_cast<char>(src[i++]);
721 }
722 else {
723 char tmp[3 + 8 + 1 + 1];
724 int m = snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
725 _Assume_(m >= 0);
726 if (static_cast<size_t>(m) >= count_dst)
727 throw buffer_overrun;
728 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
729 j += static_cast<size_t>(m);
730 }
731 }
732 else {
733 // Non-trivial character. Decompose.
734 const size_t end = i + n;
735 while (i < end) {
736 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
737 size_t m = strlen(entity);
738 if (j + m + 2 >= count_dst)
739 throw buffer_overrun;
740 dst[j++] = '&';
741 memcpy(dst + j, entity, m * sizeof(char)); j += m;
742 dst[j++] = ';';
743 i++;
744 }
745 else if (is7bit(src[i])) {
746 if (j + 1 >= count_dst)
747 throw buffer_overrun;
748 dst[j++] = static_cast<char>(src[i++]);
749 }
750 else {
751 utf32_t unicode;
752#ifdef _WIN32
753 if (i + 1 < end && is_surrogate_pair(src + i)) {
754 unicode = surrogate_pair_to_ucs4(src + i);
755 i += 2;
756 }
757 else
758#endif
759 {
760 unicode = src[i++];
761 }
762 char tmp[3 + 8 + 1 + 1];
763 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(unicode));
764 _Assume_(m >= 0);
765 if (static_cast<size_t>(m) >= count_dst)
766 throw buffer_overrun;
767 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
768 j += static_cast<size_t>(m);
769 }
770 }
771 }
772 }
773 }
774 if (j >= count_dst)
775 throw buffer_overrun;
776 dst[j] = 0;
777 return j;
778 }
779
788 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
789 inline void str2sgmlcpy(
790 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
791 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
792 _In_ int what = 0)
793 {
794 dst.clear();
795 str2sgmlcat(dst, src, count_src, what);
796 }
797
805 template <class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
806 void str2sgmlcpy(
807 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
808 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
809 _In_ int what = 0)
810 {
811 str2sgmlcpy(dst, src.data(), src.size(), what);
812 }
813
825 inline size_t str2sgmlcpy(
826 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
827 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
828 _In_ int what = 0)
829 {
830 _Assume_(dst || !count_dst);
831 if (count_dst)
832 dst[0] = 0;
833 return str2sgmlcat(dst, count_dst, src, count_src, what);
834 }
835
845 inline std::string str2sgml(
846 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
847 _In_ int what = 0)
848 {
849 std::string dst;
850 str2sgmlcat(dst, src, count_src, what);
851 return dst;
852 }
853
862 inline std::string str2sgml(
863 _In_ const std::basic_string_view<wchar_t, std::char_traits<wchar_t>> src,
864 _In_ int what = 0)
865 {
866 return str2sgml(src.data(), src.size(), what);
867 }
868}
869
870#if defined(__GNUC__)
871#pragma GCC diagnostic pop
872#endif