stdex
Additional custom algorithms, or algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <string.h>
13#include <exception>
14#include <string_view>
15#include <string>
16
17#if defined(__GNUC__)
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
20#endif
21
22namespace stdex
23{
25 template <class T>
26 const utf32_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count, utf32_t buf[2])
27 {
28 _Assume_(entity && count);
29
30 if (count < 2 || entity[0] != '#') {
31 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
33 if (sgml_unicode[m].sgml[0] < entity[0])
34 i = m + 1;
35 else if (sgml_unicode[m].sgml[0] > entity[0])
36 j = m;
37 else {
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
39 if (r < 0)
40 i = m + 1;
41 else if (r > 0)
42 j = m;
43 else {
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return sgml_unicode[m].unicode;
46 }
47 }
48 }
49 return nullptr;
50 }
51
52 buf[0] = entity[1] == 'x' || entity[1] == 'X' ?
53 static_cast<utf32_t>(strtou32(&entity[2], count - 2, nullptr, 16)) :
54 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
55 buf[1] = 0;
56 return buf;
57 }
58
59 inline const utf16_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf16_t* buf)
60 {
61 if (!str)
62 return nullptr;
63 for (size_t i = 0, j = 0;; ++i) {
64 if (!str[i]) {
65 buf[j] = 0;
66 return buf;
67 }
68 if (str[i] < 0x10000)
69 buf[j++] = static_cast<utf16_t>(str[i]);
70 else {
71 ucs4_to_surrogate_pair(&buf[j], str[i]);
72 j += 2;
73 }
74 }
75 }
76
77 inline const utf32_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf32_t* buf)
78 {
79 _Unreferenced_(buf);
80 return str;
81 }
82
83 template <class T>
84 const T* sgmlend(
85 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
86 {
87 _Assume_(str || !count);
88 for (size_t i = 0; i < count; i++) {
89 if (str[i] == ';')
90 return str + i;
91 if (!str[i] || str[i] == '&' || isspace(str[i]))
92 break;
93 }
94 return nullptr;
95 }
97
// Bit flags for the `skip` parameter of the sgml2str*() functions and the `what` parameter of
// sgmlerr() and the str2sgml*() functions.
//
// sgml2str*(): an entity that decodes to a flagged character is left in the text as is.
// str2sgml*(): a flagged character is always written as an entity.

// str2sgml*(): look up an entity for 7-bit ASCII characters too; sgmlerr(): do not report non-ASCII characters.
constexpr int sgml_full = 1 << 30;

// Single-character groups.
constexpr int sgml_quot      = 1 << 0;  // "
constexpr int sgml_apos      = 1 << 1;  // '
constexpr int sgml_amp       = 1 << 2;  // &
constexpr int sgml_lt_gt     = 1 << 3;  // < >
constexpr int sgml_bsol      = 1 << 4;  // backslash
constexpr int sgml_dollar    = 1 << 5;  // $
constexpr int sgml_percnt    = 1 << 6;  // %
constexpr int sgml_commat    = 1 << 7;  // @
constexpr int sgml_num       = 1 << 8;  // #
constexpr int sgml_lpar_rpar = 1 << 9;  // ( )
constexpr int sgml_lcub_rcub = 1 << 10; // { }
constexpr int sgml_lsqb_rsqb = 1 << 11; // [ ]

// Combinations.
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
116
126 template <class T_from>
127 size_t sgmlerr(
128 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
129 _In_ int what = 0)
130 {
131 _Assume_(src || !count_src);
132
133 const bool
134 do_ascii = (what & sgml_full) == 0;
135
136 for (size_t i = 0; i < count_src && src[i];) {
137 if (src[i] == '&') {
138 auto end = sgmlend(&src[i + 1], count_src - i - 1);
139 if (end) {
140 utf32_t chr[2];
141 size_t n = end - src - i - 1;
142 auto entity_w = sgml2uni(&src[i + 1], n, chr);
143 if (entity_w) {
144 i = end - src + 1;
145 continue;
146 }
147
148 // Unknown entity.
149 return i;
150 }
151
152 // Unterminated entity.
153 return i;
154 }
155
156 if (do_ascii && !is7bit(src[i])) {
157 // Non-ASCII character
158 return i;
159 }
160 i++;
161 }
162
163 return npos;
164 }
165
176 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
177 void sgml2strcat(
178 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
179 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
180 _In_ int skip = 0,
181 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
182 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
183 {
184 _Assume_(src || !count_src);
185
186 const bool
187 skip_quot = (skip & sgml_quot) == 0,
188 skip_apos = (skip & sgml_apos) == 0,
189 skip_amp = (skip & sgml_amp) == 0,
190 skip_lt_gt = (skip & sgml_lt_gt) == 0,
191 skip_bsol = (skip & sgml_bsol) == 0,
192 skip_dollar = (skip & sgml_dollar) == 0,
193 skip_percnt = (skip & sgml_percnt) == 0,
194 skip_commat = (skip & sgml_commat) == 0,
195 skip_num = (skip & sgml_num) == 0,
196 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
197 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
198 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
199
200 count_src = strnlen(src, count_src);
201 dst.reserve(dst.size() + count_src);
202 for (size_t i = 0; i < count_src;) {
203 if (src[i] == '&') {
204 auto end = sgmlend(&src[i + 1], count_src - i - 1);
205 if (end) {
206 utf32_t chr32[2];
207 _Assume_(&src[i + 1] <= end);
208 size_t n = static_cast<size_t>(end - src) - i - 1;
209 T_to chr[5];
210 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
211 if (entity_w &&
212 (skip_quot || (entity_w[0] != '"')) &&
213 (skip_apos || (entity_w[0] != '\'')) &&
214 (skip_amp || (entity_w[0] != '&')) &&
215 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
216 (skip_bsol || (entity_w[0] != '\\')) &&
217 (skip_dollar || (entity_w[0] != '$')) &&
218 (skip_percnt || (entity_w[0] != '%')) &&
219 (skip_commat || (entity_w[0] != '@')) &&
220 (skip_num || (entity_w[0] != '#')) &&
221 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
222 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
223 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
224 {
225 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
226 dst.append(entity_w);
227 _Assume_(src <= end);
228 i = static_cast<size_t>(end - src) + 1;
229 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
230 continue;
231 }
232 }
233 }
234 dst.append(1, src[i++]);
235 }
236 }
237
247 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
248 void sgml2strcat(
249 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
250 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
251 _In_ int skip = 0,
252 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
253 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
254 {
255 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
256 }
257
271 template <class T_to = wchar_t, class T_from>
272 size_t sgml2strcat(
273 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
274 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
275 _In_ int skip = 0,
276 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
277 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
278 {
279 _Assume_(dst || !count_dst);
280 _Assume_(src || !count_src);
281
282 static const std::invalid_argument buffer_overrun("buffer overrun");
283 const bool
284 skip_quot = (skip & sgml_quot) == 0,
285 skip_apos = (skip & sgml_apos) == 0,
286 skip_amp = (skip & sgml_amp) == 0,
287 skip_lt_gt = (skip & sgml_lt_gt) == 0,
288 skip_bsol = (skip & sgml_bsol) == 0,
289 skip_dollar = (skip & sgml_dollar) == 0,
290 skip_percnt = (skip & sgml_percnt) == 0,
291 skip_commat = (skip & sgml_commat) == 0,
292 skip_num = (skip & sgml_num) == 0,
293 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
294 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
295 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
296
297 size_t j = strnlen(dst, count_dst);
298 count_src = strnlen(src, count_src);
299 for (size_t i = 0; i < count_src;) {
300 if (src[i] == '&') {
301 auto end = sgmlend(&src[i + 1], count_src - i - 1);
302 if (end) {
303 utf32_t chr32[2];
304 T_to chr[5];
305 size_t n = end - src - i - 1;
306 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
307 if (entity_w &&
308 (skip_quot || (entity_w[0] != '"')) &&
309 (skip_apos || (entity_w[0] != '\'')) &&
310 (skip_amp || (entity_w[0] != '&')) &&
311 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
312 (skip_bsol || (entity_w[0] != '\\')) &&
313 (skip_dollar || (entity_w[0] != '$')) &&
314 (skip_percnt || (entity_w[0] != '%')) &&
315 (skip_commat || (entity_w[0] != '@')) &&
316 (skip_num || (entity_w[0] != '#')) &&
317 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
318 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
319 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
320 {
321 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
322 size_t m = strlen(entity_w);
323 if (j + m >= count_dst)
324 throw buffer_overrun;
325 memcpy(dst + j, entity_w, m * sizeof(*entity_w)); j += m;
326 i = end - src + 1;
327 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
328 continue;
329 }
330 }
331 }
332 if (j + 1 >= count_dst)
333 throw buffer_overrun;
334 dst[j++] = src[i++];
335 }
336 if (j >= count_dst)
337 throw buffer_overrun;
338 dst[j] = 0;
339 return j;
340 }
341
352 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
353 void sgml2strcpy(
354 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
355 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
356 _In_ int skip = 0,
357 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
358 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
359 {
360 dst.clear();
361 if (map)
362 map->clear();
363 sgml2strcat(dst, src, count_src, skip, offset, map);
364 }
365
375 template<class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
376 void sgml2strcpy(
377 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
378 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
379 _In_ int skip = 0,
380 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
381 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
382 {
383 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
384 }
385
399 template <class T_to = wchar_t, class T_from>
400 size_t sgml2strcpy(
401 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
402 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
403 _In_ int skip = 0,
404 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
405 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
406 {
407 _Assume_(dst || !count_dst);
408 if (count_dst)
409 dst[0] = 0;
410 if (map)
411 map->clear();
412 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
413 }
414
426 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
427 std::basic_string<T_to, TR_to, AX_to> sgml2str(
428 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
429 _In_ int skip = 0,
430 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
431 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
432 {
433 std::basic_string<T_to, TR_to, AX_to> dst;
434 sgml2strcat(dst, src, count_src, skip, offset, map);
435 return dst;
436 }
437
448 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
449 std::basic_string<T_to, TR_to, AX_to> sgml2str(
450 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
451 _In_ int skip = 0,
452 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
453 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
454 {
455 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
456 }
457
459 inline const char* chr2sgml(_In_reads_or_z_(count) const utf16_t* entity, _In_ size_t count)
460 {
461 _Assume_(entity && count);
462
463 utf32_t e2;
464 size_t offset;
465 if (count < 2 || !is_surrogate_pair(entity)) {
466 e2 = static_cast<utf32_t>(entity[0]);
467 offset = 1;
468 }
469 else {
470 e2 = surrogate_pair_to_ucs4(entity);
471 offset = 2;
472 }
473 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
474 size_t m = (i + j) / 2;
475 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
476 if (e1 < e2)
477 i = m + 1;
478 else if (e1 > e2)
479 j = m;
480 else {
481 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
482 if (r < 0)
483 i = m + 1;
484 else if (r > 0)
485 j = m;
486 else {
487 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
488 return sgml_unicode[unicode_sgml[m]].sgml;
489 }
490 }
491 }
492 return nullptr;
493 }
494
495 inline const char* chr2sgml(_In_reads_or_z_(count) const utf32_t* entity, _In_ size_t count)
496 {
497 _Assume_(entity && count);
498
499 utf32_t e2 = entity[0];
500 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
501 size_t m = (i + j) / 2;
502 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
503 if (e1 < e2)
504 i = m + 1;
505 else if (e1 > e2)
506 j = m;
507 else {
508 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
509 if (r < 0)
510 i = m + 1;
511 else if (r > 0)
512 j = m;
513 else {
514 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
515 return sgml_unicode[unicode_sgml[m]].sgml;
516 }
517 }
518 }
519 return nullptr;
520 }
521
522 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf16_t* src, _Inout_ size_t& i, _In_ size_t end)
523 {
524 _Assume_(i < end);
525 if (i + 1 >= end || !is_surrogate_pair(src + i))
526 return src[i++];
527
528 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
529 i += 2;
530 return unicode;
531 }
532
533 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf32_t* src, _Inout_ size_t& i, _In_ size_t end)
534 {
535 _Assume_(i < end);
536 return src[i++];
537 }
539
548 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
549 void str2sgmlcat(
550 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
551 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
552 _In_ int what = 0)
553 {
554 _Assume_(src || !count_src);
555
556 const bool
557 do_ascii = (what & sgml_full) == 0,
558 do_quot = (what & sgml_quot) == 0,
559 do_apos = (what & sgml_apos) == 0,
560 do_lt_gt = (what & sgml_lt_gt) == 0,
561 do_bsol = (what & sgml_bsol) == 0,
562 do_dollar = (what & sgml_dollar) == 0,
563 do_percnt = (what & sgml_percnt) == 0,
564 do_commat = (what & sgml_commat) == 0,
565 do_num = (what & sgml_num) == 0,
566 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
567 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
568 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
569
570 count_src = strnlen(src, count_src);
571 dst.reserve(dst.size() + count_src);
572 for (size_t i = 0; i < count_src;) {
573 size_t n = glyphlen(src + i, count_src - i);
574 if (n == 1 &&
575 do_ascii && is7bit(src[i]) &&
576 src[i] != '&' &&
577 (do_quot || (src[i] != '"')) &&
578 (do_apos || (src[i] != '\'')) &&
579 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
580 (do_bsol || (src[i] != '\\')) &&
581 (do_dollar || (src[i] != '$')) &&
582 (do_percnt || (src[i] != '%')) &&
583 (do_commat || (src[i] != '@')) &&
584 (do_num || (src[i] != '#')) &&
585 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
586 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
587 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
588 {
589 // 7-bit ASCII and no desire to encode it as an SGML entity.
590 dst.append(1, static_cast<char>(src[i++]));
591 }
592 else {
593 const char* entity = chr2sgml(src + i, n);
594 if (entity) {
595 dst.append(1, '&');
596 dst.append(entity);
597 dst.append(1, ';');
598 i += n;
599 }
600 else if (n == 1) {
601 // Trivial character (1 code unit, 1 glyph), no entity available.
602 if (is7bit(src[i]))
603 dst.append(1, static_cast<char>(src[i++]));
604 else {
605 char tmp[3 + 8 + 1 + 1];
606 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
607 dst.append(tmp);
608 }
609 }
610 else {
611 // Non-trivial character. Decompose.
612 const size_t end = i + n;
613 while (i < end) {
614 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
615 dst.append(1, '&');
616 dst.append(entity);
617 dst.append(1, ';');
618 i++;
619 }
620 else if (is7bit(src[i]))
621 dst.append(1, static_cast<char>(src[i++]));
622 else {
623 char tmp[3 + 8 + 1 + 1];
624 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
625 dst.append(tmp);
626 }
627 }
628 }
629 }
630 }
631 }
632
640 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
641 void str2sgmlcat(
642 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
643 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
644 _In_ int what = 0)
645 {
646 str2sgmlcat(dst, src.data(), src.size(), what);
647 }
648
660 template <class T_from = wchar_t>
661 size_t str2sgmlcat(
662 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
663 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
664 _In_ int what = 0)
665 {
666 _Assume_(dst || !count_dst);
667 _Assume_(src || !count_src);
668
669 static const std::invalid_argument buffer_overrun("buffer overrun");
670 const bool
671 do_ascii = (what & sgml_full) == 0,
672 do_quot = (what & sgml_quot) == 0,
673 do_apos = (what & sgml_apos) == 0,
674 do_lt_gt = (what & sgml_lt_gt) == 0,
675 do_bsol = (what & sgml_bsol) == 0,
676 do_dollar = (what & sgml_dollar) == 0,
677 do_percnt = (what & sgml_percnt) == 0,
678 do_commat = (what & sgml_commat) == 0,
679 do_num = (what & sgml_num) == 0,
680 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
681 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
682 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
683
684 size_t j = strnlen(dst, count_dst);
685 count_src = strnlen(src, count_src);
686 for (size_t i = 0; i < count_src;) {
687 size_t n = glyphlen(src + i, count_src - i);
688 if (n == 1 &&
689 do_ascii && is7bit(src[i]) &&
690 src[i] != '&' &&
691 (do_quot || (src[i] != '"')) &&
692 (do_apos || (src[i] != '\'')) &&
693 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
694 (do_bsol || (src[i] != '\\')) &&
695 (do_dollar || (src[i] != '$')) &&
696 (do_percnt || (src[i] != '%')) &&
697 (do_commat || (src[i] != '@')) &&
698 (do_num || (src[i] != '#')) &&
699 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
700 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
701 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
702 {
703 // 7-bit ASCII and no desire to encode it as an SGML entity.
704 if (j + 1 >= count_dst)
705 throw buffer_overrun;
706 dst[j++] = static_cast<char>(src[i++]);
707 }
708 else {
709 const char* entity = chr2sgml(src + i, n);
710 if (entity) {
711 size_t m = strlen(entity);
712 if (j + m + 2 >= count_dst)
713 throw buffer_overrun;
714 dst[j++] = '&';
715 memcpy(dst + j, entity, m * sizeof(char)); j += m;
716 dst[j++] = ';';
717 i += n;
718 }
719 else if (n == 1) {
720 // Trivial character (1 code unit, 1 glyph), no entity available.
721 if (is7bit(src[i])) {
722 if (j + 1 >= count_dst)
723 throw buffer_overrun;
724 dst[j++] = static_cast<char>(src[i++]);
725 }
726 else {
727 char tmp[3 + 8 + 1 + 1];
728 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
729 _Assume_(m >= 0);
730 if (static_cast<size_t>(m) >= count_dst)
731 throw buffer_overrun;
732 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
733 j += static_cast<size_t>(m);
734 }
735 }
736 else {
737 // Non-trivial character. Decompose.
738 const size_t end = i + n;
739 while (i < end) {
740 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
741 size_t m = strlen(entity);
742 if (j + m + 2 >= count_dst)
743 throw buffer_overrun;
744 dst[j++] = '&';
745 memcpy(dst + j, entity, m * sizeof(char)); j += m;
746 dst[j++] = ';';
747 i++;
748 }
749 else if (is7bit(src[i])) {
750 if (j + 1 >= count_dst)
751 throw buffer_overrun;
752 dst[j++] = static_cast<char>(src[i++]);
753 }
754 else {
755 char tmp[3 + 8 + 1 + 1];
756 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
757 _Assume_(m >= 0);
758 if (static_cast<size_t>(m) >= count_dst)
759 throw buffer_overrun;
760 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
761 j += static_cast<size_t>(m);
762 }
763 }
764 }
765 }
766 }
767 if (j >= count_dst)
768 throw buffer_overrun;
769 dst[j] = 0;
770 return j;
771 }
772
781 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
782 void str2sgmlcpy(
783 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
784 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
785 _In_ int what = 0)
786 {
787 dst.clear();
788 str2sgmlcat(dst, src, count_src, what);
789 }
790
798 template <class T_from = wchar_t, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
799 void str2sgmlcpy(
800 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
801 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
802 _In_ int what = 0)
803 {
804 str2sgmlcpy(dst, src.data(), src.size(), what);
805 }
806
818 template <class T_from = wchar_t>
819 size_t str2sgmlcpy(
820 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
821 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
822 _In_ int what = 0)
823 {
824 _Assume_(dst || !count_dst);
825 if (count_dst)
826 dst[0] = 0;
827 return str2sgmlcat(dst, count_dst, src, count_src, what);
828 }
829
839 template <class T_from = wchar_t>
840 std::string str2sgml(
841 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
842 _In_ int what = 0)
843 {
844 std::string dst;
845 str2sgmlcat(dst, src, count_src, what);
846 return dst;
847 }
848
857 template <class T_from = wchar_t>
858 std::string str2sgml(
859 _In_ const std::basic_string_view<T_from, std::char_traits<T_from>> src,
860 _In_ int what = 0)
861 {
862 return str2sgml(src.data(), src.size(), what);
863 }
864}
865
866#if defined(__GNUC__)
867#pragma GCC diagnostic pop
868#endif