stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <string.h>
13#include <exception>
14#include <string_view>
15#include <string>
16
17#if defined(__GNUC__)
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
20#endif
21
22namespace stdex
23{
25 template <class T>
26 const utf32_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count, utf32_t buf[2])
27 {
28 _Assume_(entity && count);
29
30 if (count < 2 || entity[0] != '#') {
31 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
33 if (sgml_unicode[m].sgml[0] < entity[0])
34 i = m + 1;
35 else if (sgml_unicode[m].sgml[0] > entity[0])
36 j = m;
37 else {
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
39 if (r < 0)
40 i = m + 1;
41 else if (r > 0)
42 j = m;
43 else {
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return reinterpret_cast<const utf32_t*>(sgml_unicode[m].unicode);
46 }
47 }
48 }
49 return nullptr;
50 }
51
52 buf[0] = entity[1] == 'x' || entity[1] == 'X' ?
53 static_cast<utf32_t>(strtou32(&entity[2], count - 2, nullptr, 16)) :
54 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
55 buf[1] = 0;
56 return buf;
57 }
58
59 inline const utf16_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf16_t* buf)
60 {
61 if (!str)
62 return nullptr;
63 for (size_t i = 0, j = 0;; ++i) {
64 if (!str[i]) {
65 buf[j] = 0;
66 return buf;
67 }
68 if (str[i] < 0x10000)
69 buf[j++] = static_cast<utf16_t>(str[i]);
70 else {
71 ucs4_to_surrogate_pair(&buf[j], str[i]);
72 j += 2;
73 }
74 }
75 }
76
77 inline const utf32_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf32_t* buf)
78 {
79 _Unreferenced_(buf);
80 return str;
81 }
82
83 template <class T>
84 const T* sgmlend(
85 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
86 {
87 _Assume_(str || !count);
88 for (size_t i = 0; i < count; i++) {
89 if (str[i] == ';')
90 return str + i;
91 if (!str[i] || str[i] == '&' || isspace(str[i]))
92 break;
93 }
94 return nullptr;
95 }
97
// Bit flags selecting character groups.
// - Passed as `skip` to sgml2strcat/sgml2strcpy/sgml2str: entities decoding to a flagged group are left encoded.
// - Passed as `what` to str2sgmlcat/str2sgmlcpy/str2sgml: characters of a flagged group are always written as entities.
constexpr int sgml_quot      = 1 << 0;  // "
constexpr int sgml_apos      = 1 << 1;  // '
constexpr int sgml_amp       = 1 << 2;  // &
constexpr int sgml_lt_gt     = 1 << 3;  // < and >
constexpr int sgml_bsol      = 1 << 4;  // backslash
constexpr int sgml_dollar    = 1 << 5;  // $
constexpr int sgml_percnt    = 1 << 6;  // %
constexpr int sgml_commat    = 1 << 7;  // @
constexpr int sgml_num       = 1 << 8;  // #
constexpr int sgml_lpar_rpar = 1 << 9;  // ( and )
constexpr int sgml_lcub_rcub = 1 << 10; // { and }
constexpr int sgml_lsqb_rsqb = 1 << 11; // [ and ]

// In str2sgml*: do not take the plain 7-bit ASCII shortcut; in sgmlerr: do not report non-ASCII characters.
constexpr int sgml_full      = 1 << 30;

// Ready-made combinations.
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
116
126 template <class T_from>
127 size_t sgmlerr(
128 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
129 _In_ int what = 0)
130 {
131 _Assume_(src || !count_src);
132
133 const bool
134 do_ascii = (what & sgml_full) == 0;
135
136 for (size_t i = 0; i < count_src && src[i];) {
137 if (src[i] == '&') {
138 auto end = sgmlend(&src[i + 1], count_src - i - 1);
139 if (end) {
140 utf32_t chr[2];
141 size_t n = end - src - i - 1;
142 auto entity_w = sgml2uni(&src[i + 1], n, chr);
143 if (entity_w) {
144 i = end - src + 1;
145 continue;
146 }
147
148 // Unknown entity.
149 return i;
150 }
151
152 // Unterminated entity.
153 return i;
154 }
155
156 if (do_ascii && !is7bit(src[i])) {
157 // Non-ASCII character
158 return i;
159 }
160 i++;
161 }
162
163 return npos;
164 }
165
176 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
177 void sgml2strcat(
178 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
179 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
180 _In_ int skip = 0,
181 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
182 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
183 {
184 _Assume_(src || !count_src);
185
186 const bool
187 skip_quot = (skip & sgml_quot) == 0,
188 skip_apos = (skip & sgml_apos) == 0,
189 skip_amp = (skip & sgml_amp) == 0,
190 skip_lt_gt = (skip & sgml_lt_gt) == 0,
191 skip_bsol = (skip & sgml_bsol) == 0,
192 skip_dollar = (skip & sgml_dollar) == 0,
193 skip_percnt = (skip & sgml_percnt) == 0,
194 skip_commat = (skip & sgml_commat) == 0,
195 skip_num = (skip & sgml_num) == 0,
196 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
197 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
198 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
199
200 count_src = strnlen(src, count_src);
201 dst.reserve(dst.size() + count_src);
202 for (size_t i = 0; i < count_src;) {
203 if (src[i] == '&') {
204 auto end = sgmlend(&src[i + 1], count_src - i - 1);
205 if (end) {
206 utf32_t chr32[2];
207 _Assume_(&src[i + 1] <= end);
208 size_t n = static_cast<size_t>(end - src) - i - 1;
209 T_to chr[5];
210 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
211 if (entity_w &&
212 (skip_quot || (entity_w[0] != '"')) &&
213 (skip_apos || (entity_w[0] != '\'')) &&
214 (skip_amp || (entity_w[0] != '&')) &&
215 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
216 (skip_bsol || (entity_w[0] != '\\')) &&
217 (skip_dollar || (entity_w[0] != '$')) &&
218 (skip_percnt || (entity_w[0] != '%')) &&
219 (skip_commat || (entity_w[0] != '@')) &&
220 (skip_num || (entity_w[0] != '#')) &&
221 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
222 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
223 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
224 {
225 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
226 dst.append(entity_w);
227 _Assume_(src <= end);
228 i = static_cast<size_t>(end - src) + 1;
229 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
230 continue;
231 }
232 }
233 }
234 dst.append(1, src[i++]);
235 }
236 }
237
247 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
248 void sgml2strcat(
249 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
250 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
251 _In_ int skip = 0,
252 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
253 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
254 {
255 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
256 }
257
271 template <class T_to = wchar_t, class T_from>
272 size_t sgml2strcat(
273 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
274 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
275 _In_ int skip = 0,
276 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
277 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
278 {
279 _Assume_(dst || !count_dst);
280 _Assume_(src || !count_src);
281
282 static const std::invalid_argument buffer_overrun("buffer overrun");
283 const bool
284 skip_quot = (skip & sgml_quot) == 0,
285 skip_apos = (skip & sgml_apos) == 0,
286 skip_amp = (skip & sgml_amp) == 0,
287 skip_lt_gt = (skip & sgml_lt_gt) == 0,
288 skip_bsol = (skip & sgml_bsol) == 0,
289 skip_dollar = (skip & sgml_dollar) == 0,
290 skip_percnt = (skip & sgml_percnt) == 0,
291 skip_commat = (skip & sgml_commat) == 0,
292 skip_num = (skip & sgml_num) == 0,
293 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
294 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
295 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
296
297 size_t j = strnlen(dst, count_dst);
298 count_src = strnlen(src, count_src);
299 for (size_t i = 0; i < count_src;) {
300 if (src[i] == '&') {
301 auto end = sgmlend(&src[i + 1], count_src - i - 1);
302 if (end) {
303 utf32_t chr32[2];
304 T_to chr[5];
305 size_t n = end - src - i - 1;
306 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
307 if (entity_w &&
308 (skip_quot || (entity_w[0] != '"')) &&
309 (skip_apos || (entity_w[0] != '\'')) &&
310 (skip_amp || (entity_w[0] != '&')) &&
311 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
312 (skip_bsol || (entity_w[0] != '\\')) &&
313 (skip_dollar || (entity_w[0] != '$')) &&
314 (skip_percnt || (entity_w[0] != '%')) &&
315 (skip_commat || (entity_w[0] != '@')) &&
316 (skip_num || (entity_w[0] != '#')) &&
317 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
318 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
319 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
320 {
321 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
322 size_t m = strlen(entity_w);
323 if (j + m >= count_dst)
324 throw buffer_overrun;
325 memcpy(dst + j, entity_w, m * sizeof(*entity_w)); j += m;
326 i = end - src + 1;
327 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
328 continue;
329 }
330 }
331 }
332 if (j + 1 >= count_dst)
333 throw buffer_overrun;
334 dst[j++] = src[i++];
335 }
336 if (j >= count_dst)
337 throw buffer_overrun;
338 dst[j] = 0;
339 return j;
340 }
341
352 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
353 void sgml2strcpy(
354 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
355 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
356 _In_ int skip = 0,
357 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
358 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
359 {
360 dst.clear();
361 if (map)
362 map->clear();
363 sgml2strcat(dst, src, count_src, skip, offset, map);
364 }
365
375 template<class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
376 void sgml2strcpy(
377 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
378 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
379 _In_ int skip = 0,
380 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
381 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
382 {
383 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
384 }
385
399 template <class T_to = wchar_t, class T_from>
400 size_t sgml2strcpy(
401 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
402 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
403 _In_ int skip = 0,
404 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
405 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
406 {
407 _Assume_(dst || !count_dst);
408 if (count_dst)
409 dst[0] = 0;
410 if (map)
411 map->clear();
412 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
413 }
414
426 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
427 std::basic_string<T_to, TR_to, AX_to> sgml2str(
428 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
429 _In_ int skip = 0,
430 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
431 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
432 {
433 std::basic_string<T_to, TR_to, AX_to> dst;
434 sgml2strcat(dst, src, count_src, skip, offset, map);
435 return dst;
436 }
437
448 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
449 std::basic_string<T_to, TR_to, AX_to> sgml2str(
450 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
451 _In_ int skip = 0,
452 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
453 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
454 {
455 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
456 }
457
459 inline const char* chr2sgml(_In_reads_or_z_(count) const utf16_t* entity, _In_ size_t count)
460 {
461 _Assume_(entity && count);
462
463 utf32_t e2;
464 size_t offset;
465 if (count < 2 || !is_surrogate_pair(entity)) {
466 e2 = static_cast<utf32_t>(entity[0]);
467 offset = 1;
468 }
469 else {
470 e2 = surrogate_pair_to_ucs4(entity);
471 offset = 2;
472 }
473 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
474 size_t m = (i + j) / 2;
475 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
476 if (e1 < e2)
477 i = m + 1;
478 else if (e1 > e2)
479 j = m;
480 else {
481 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
482 if (r < 0)
483 i = m + 1;
484 else if (r > 0)
485 j = m;
486 else {
487 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
488 return sgml_unicode[unicode_sgml[m]].sgml;
489 }
490 }
491 }
492 return nullptr;
493 }
494
495 inline const char* chr2sgml(_In_reads_or_z_(count) const utf32_t* entity, _In_ size_t count)
496 {
497 _Assume_(entity && count);
498
499 utf32_t e2 = entity[0];
500 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
501 size_t m = (i + j) / 2;
502 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
503 if (e1 < e2)
504 i = m + 1;
505 else if (e1 > e2)
506 j = m;
507 else {
508 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
509 if (r < 0)
510 i = m + 1;
511 else if (r > 0)
512 j = m;
513 else {
514 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
515 return sgml_unicode[unicode_sgml[m]].sgml;
516 }
517 }
518 }
519 return nullptr;
520 }
521
522 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf16_t* src, _Inout_ size_t& i, _In_ size_t end)
523 {
524 _Assume_(i < end);
525 if (i + 1 >= end || !is_surrogate_pair(src + i))
526 return src[i++];
527
528 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
529 i += 2;
530 return unicode;
531 }
532
533 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf32_t* src, _Inout_ size_t& i, _In_ size_t end)
534 {
535 _Unreferenced_(end);
536 _Assume_(i < end);
537 return src[i++];
538 }
540
549 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
550 void str2sgmlcat(
551 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
552 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
553 _In_ int what = 0)
554 {
555 _Assume_(src || !count_src);
556
557 const bool
558 do_ascii = (what & sgml_full) == 0,
559 do_quot = (what & sgml_quot) == 0,
560 do_apos = (what & sgml_apos) == 0,
561 do_lt_gt = (what & sgml_lt_gt) == 0,
562 do_bsol = (what & sgml_bsol) == 0,
563 do_dollar = (what & sgml_dollar) == 0,
564 do_percnt = (what & sgml_percnt) == 0,
565 do_commat = (what & sgml_commat) == 0,
566 do_num = (what & sgml_num) == 0,
567 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
568 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
569 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
570
571 count_src = strnlen(src, count_src);
572 dst.reserve(dst.size() + count_src);
573 for (size_t i = 0; i < count_src;) {
574 size_t n = glyphlen(src + i, count_src - i);
575 if (n == 1 &&
576 do_ascii && is7bit(src[i]) &&
577 src[i] != '&' &&
578 (do_quot || (src[i] != '"')) &&
579 (do_apos || (src[i] != '\'')) &&
580 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
581 (do_bsol || (src[i] != '\\')) &&
582 (do_dollar || (src[i] != '$')) &&
583 (do_percnt || (src[i] != '%')) &&
584 (do_commat || (src[i] != '@')) &&
585 (do_num || (src[i] != '#')) &&
586 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
587 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
588 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
589 {
590 // 7-bit ASCII and no desire to encode it as an SGML entity.
591 dst.append(1, static_cast<char>(src[i++]));
592 }
593 else {
594 const char* entity = chr2sgml(src + i, n);
595 if (entity) {
596 dst.append(1, '&');
597 dst.append(entity);
598 dst.append(1, ';');
599 i += n;
600 }
601 else if (n == 1) {
602 // Trivial character (1 code unit, 1 glyph), no entity available.
603 if (is7bit(src[i]))
604 dst.append(1, static_cast<char>(src[i++]));
605 else {
606 char tmp[3 + 8 + 1 + 1];
607 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
608 dst.append(tmp);
609 }
610 }
611 else {
612 // Non-trivial character. Decompose.
613 const size_t end = i + n;
614 while (i < end) {
615 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
616 dst.append(1, '&');
617 dst.append(entity);
618 dst.append(1, ';');
619 i++;
620 }
621 else if (is7bit(src[i]))
622 dst.append(1, static_cast<char>(src[i++]));
623 else {
624 char tmp[3 + 8 + 1 + 1];
625 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
626 dst.append(tmp);
627 }
628 }
629 }
630 }
631 }
632 }
633
641 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
642 void str2sgmlcat(
643 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
644 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
645 _In_ int what = 0)
646 {
647 str2sgmlcat(dst, src.data(), src.size(), what);
648 }
649
661 template <class T_from = wchar_t>
662 size_t str2sgmlcat(
663 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
664 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
665 _In_ int what = 0)
666 {
667 _Assume_(dst || !count_dst);
668 _Assume_(src || !count_src);
669
670 static const std::invalid_argument buffer_overrun("buffer overrun");
671 const bool
672 do_ascii = (what & sgml_full) == 0,
673 do_quot = (what & sgml_quot) == 0,
674 do_apos = (what & sgml_apos) == 0,
675 do_lt_gt = (what & sgml_lt_gt) == 0,
676 do_bsol = (what & sgml_bsol) == 0,
677 do_dollar = (what & sgml_dollar) == 0,
678 do_percnt = (what & sgml_percnt) == 0,
679 do_commat = (what & sgml_commat) == 0,
680 do_num = (what & sgml_num) == 0,
681 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
682 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
683 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
684
685 size_t j = strnlen(dst, count_dst);
686 count_src = strnlen(src, count_src);
687 for (size_t i = 0; i < count_src;) {
688 size_t n = glyphlen(src + i, count_src - i);
689 if (n == 1 &&
690 do_ascii && is7bit(src[i]) &&
691 src[i] != '&' &&
692 (do_quot || (src[i] != '"')) &&
693 (do_apos || (src[i] != '\'')) &&
694 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
695 (do_bsol || (src[i] != '\\')) &&
696 (do_dollar || (src[i] != '$')) &&
697 (do_percnt || (src[i] != '%')) &&
698 (do_commat || (src[i] != '@')) &&
699 (do_num || (src[i] != '#')) &&
700 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
701 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
702 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
703 {
704 // 7-bit ASCII and no desire to encode it as an SGML entity.
705 if (j + 1 >= count_dst)
706 throw buffer_overrun;
707 dst[j++] = static_cast<char>(src[i++]);
708 }
709 else {
710 const char* entity = chr2sgml(src + i, n);
711 if (entity) {
712 size_t m = strlen(entity);
713 if (j + m + 2 >= count_dst)
714 throw buffer_overrun;
715 dst[j++] = '&';
716 memcpy(dst + j, entity, m * sizeof(char)); j += m;
717 dst[j++] = ';';
718 i += n;
719 }
720 else if (n == 1) {
721 // Trivial character (1 code unit, 1 glyph), no entity available.
722 if (is7bit(src[i])) {
723 if (j + 1 >= count_dst)
724 throw buffer_overrun;
725 dst[j++] = static_cast<char>(src[i++]);
726 }
727 else {
728 char tmp[3 + 8 + 1 + 1];
729 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
730 _Assume_(m >= 0);
731 if (static_cast<size_t>(m) >= count_dst)
732 throw buffer_overrun;
733 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
734 j += static_cast<size_t>(m);
735 }
736 }
737 else {
738 // Non-trivial character. Decompose.
739 const size_t end = i + n;
740 while (i < end) {
741 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
742 size_t m = strlen(entity);
743 if (j + m + 2 >= count_dst)
744 throw buffer_overrun;
745 dst[j++] = '&';
746 memcpy(dst + j, entity, m * sizeof(char)); j += m;
747 dst[j++] = ';';
748 i++;
749 }
750 else if (is7bit(src[i])) {
751 if (j + 1 >= count_dst)
752 throw buffer_overrun;
753 dst[j++] = static_cast<char>(src[i++]);
754 }
755 else {
756 char tmp[3 + 8 + 1 + 1];
757 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
758 _Assume_(m >= 0);
759 if (static_cast<size_t>(m) >= count_dst)
760 throw buffer_overrun;
761 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
762 j += static_cast<size_t>(m);
763 }
764 }
765 }
766 }
767 }
768 if (j >= count_dst)
769 throw buffer_overrun;
770 dst[j] = 0;
771 return j;
772 }
773
782 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
783 void str2sgmlcpy(
784 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
785 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
786 _In_ int what = 0)
787 {
788 dst.clear();
789 str2sgmlcat(dst, src, count_src, what);
790 }
791
799 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
800 void str2sgmlcpy(
801 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
802 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
803 _In_ int what = 0)
804 {
805 str2sgmlcpy(dst, src.data(), src.size(), what);
806 }
807
819 template <class T_from = wchar_t>
820 size_t str2sgmlcpy(
821 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
822 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
823 _In_ int what = 0)
824 {
825 _Assume_(dst || !count_dst);
826 if (count_dst)
827 dst[0] = 0;
828 return str2sgmlcat(dst, count_dst, src, count_src, what);
829 }
830
840 template <class T_from = wchar_t>
841 std::string str2sgml(
842 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
843 _In_ int what = 0)
844 {
845 std::string dst;
846 str2sgmlcat(dst, src, count_src, what);
847 return dst;
848 }
849
858 template <class T_from = wchar_t>
859 std::string str2sgml(
860 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
861 _In_ int what = 0)
862 {
863 return str2sgml(src.data(), src.size(), what);
864 }
865}
866
867#if defined(__GNUC__)
868#pragma GCC diagnostic pop
869#endif