stdex
Additional custom algorithms, or algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "mapping.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <string.h>
13#include <exception>
14#include <string>
15
16#if defined(__GNUC__)
17#pragma GCC diagnostic push
18#pragma GCC diagnostic ignored "-Wexit-time-destructors"
19#endif
20
21namespace stdex
22{
24 template <class T>
25 const utf32_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count, utf32_t buf[2])
26 {
27 _Assume_(entity && count);
28
29 if (count < 2 || entity[0] != '#') {
30 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
31 size_t m = (i + j) / 2;
32 if (sgml_unicode[m].sgml[0] < entity[0])
33 i = m + 1;
34 else if (sgml_unicode[m].sgml[0] > entity[0])
35 j = m;
36 else {
37 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
38 if (r < 0)
39 i = m + 1;
40 else if (r > 0)
41 j = m;
42 else {
43 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
44 return reinterpret_cast<const utf32_t*>(sgml_unicode[m].unicode);
45 }
46 }
47 }
48 return nullptr;
49 }
50
51 buf[0] = entity[1] == 'x' || entity[1] == 'X' ?
52 static_cast<utf32_t>(strtou32(&entity[2], count - 2, nullptr, 16)) :
53 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
54 buf[1] = 0;
55 return buf;
56 }
57
58 inline const utf16_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf16_t* buf)
59 {
60 if (!str)
61 return nullptr;
62 for (size_t i = 0, j = 0;; ++i) {
63 if (!str[i]) {
64 buf[j] = 0;
65 return buf;
66 }
67 if (str[i] < 0x10000)
68 buf[j++] = static_cast<utf16_t>(str[i]);
69 else {
70 ucs4_to_surrogate_pair(&buf[j], str[i]);
71 j += 2;
72 }
73 }
74 }
75
76 inline const utf32_t* utf32_to_wstr(_In_opt_z_ const utf32_t* str, utf32_t* buf)
77 {
78 _Unreferenced_(buf);
79 return str;
80 }
81
82 template <class T>
83 const T* sgmlend(
84 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
85 {
86 _Assume_(str || !count);
87 for (size_t i = 0; i < count; i++) {
88 if (str[i] == ';')
89 return str + i;
90 if (!str[i] || str[i] == '&' || isspace(str[i]))
91 break;
92 }
93 return nullptr;
94 }
96
// Bit flags shared by the `skip` argument of sgml2str*() and the `what` argument of
// str2sgml*() and sgmlerr().
//
// - sgml2str*(): an entity that resolves to a flagged character is left unexpanded.
// - str2sgml*(): a flagged character is no longer copied as-is; it goes through the
//   entity lookup instead.
constexpr int sgml_full      = 1 << 30; ///< str2sgml*(): no 7-bit character is copied as-is without an entity lookup; sgmlerr(): non-ASCII characters are not reported
constexpr int sgml_quot      = 1 << 0;  ///< Double quote
constexpr int sgml_apos      = 1 << 1;  ///< Apostrophe
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
constexpr int sgml_amp       = 1 << 2;  ///< Ampersand
constexpr int sgml_lt_gt     = 1 << 3;  ///< Less-than and greater-than signs
constexpr int sgml_bsol      = 1 << 4;  ///< Backslash
constexpr int sgml_dollar    = 1 << 5;  ///< Dollar sign
constexpr int sgml_percnt    = 1 << 6;  ///< Percent sign
constexpr int sgml_commat    = 1 << 7;  ///< Commercial at sign
constexpr int sgml_num       = 1 << 8;  ///< Number sign
constexpr int sgml_lpar_rpar = 1 << 9;  ///< Parentheses
constexpr int sgml_lcub_rcub = 1 << 10; ///< Curly brackets
constexpr int sgml_lsqb_rsqb = 1 << 11; ///< Square brackets

// Ready-made combinations.
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
115
125 template <class T_from>
126 size_t sgmlerr(
127 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
128 _In_ int what = 0)
129 {
130 _Assume_(src || !count_src);
131
132 const bool
133 do_ascii = (what & sgml_full) == 0;
134
135 for (size_t i = 0; i < count_src && src[i];) {
136 if (src[i] == '&') {
137 auto end = sgmlend(&src[i + 1], count_src - i - 1);
138 if (end) {
139 utf32_t chr[2];
140 size_t n = end - src - i - 1;
141 auto entity_w = sgml2uni(&src[i + 1], n, chr);
142 if (entity_w) {
143 i = end - src + 1;
144 continue;
145 }
146
147 // Unknown entity.
148 return i;
149 }
150
151 // Unterminated entity.
152 return i;
153 }
154
155 if (do_ascii && !is7bit(src[i])) {
156 // Non-ASCII character
157 return i;
158 }
159 i++;
160 }
161
162 return npos;
163 }
164
173 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
174 size_t sgmlerr(
175 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
176 _In_ int what = 0)
177 {
178 return sgmlerr(src.data(), src.size(), what);
179 }
180
191 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
192 void sgml2strcat(
193 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
194 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
195 _In_ int skip = 0,
196 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
197 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
198 {
199 _Assume_(src || !count_src);
200
201 const bool
202 skip_quot = (skip & sgml_quot) == 0,
203 skip_apos = (skip & sgml_apos) == 0,
204 skip_amp = (skip & sgml_amp) == 0,
205 skip_lt_gt = (skip & sgml_lt_gt) == 0,
206 skip_bsol = (skip & sgml_bsol) == 0,
207 skip_dollar = (skip & sgml_dollar) == 0,
208 skip_percnt = (skip & sgml_percnt) == 0,
209 skip_commat = (skip & sgml_commat) == 0,
210 skip_num = (skip & sgml_num) == 0,
211 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
212 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
213 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
214
215 count_src = strnlen(src, count_src);
216 dst.reserve(dst.size() + count_src);
217 for (size_t i = 0; i < count_src;) {
218 if (src[i] == '&') {
219 auto end = sgmlend(&src[i + 1], count_src - i - 1);
220 if (end) {
221 utf32_t chr32[2];
222 _Assume_(&src[i + 1] <= end);
223 size_t n = static_cast<size_t>(end - src) - i - 1;
224 T_to chr[5];
225 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
226 if (entity_w &&
227 (skip_quot || (entity_w[0] != '"')) &&
228 (skip_apos || (entity_w[0] != '\'')) &&
229 (skip_amp || (entity_w[0] != '&')) &&
230 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
231 (skip_bsol || (entity_w[0] != '\\')) &&
232 (skip_dollar || (entity_w[0] != '$')) &&
233 (skip_percnt || (entity_w[0] != '%')) &&
234 (skip_commat || (entity_w[0] != '@')) &&
235 (skip_num || (entity_w[0] != '#')) &&
236 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
237 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
238 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
239 {
240 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
241 dst.append(entity_w);
242 _Assume_(src <= end);
243 i = static_cast<size_t>(end - src) + 1;
244 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
245 continue;
246 }
247 }
248 }
249 dst.append(1, src[i++]);
250 }
251 }
252
262 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
263 void sgml2strcat(
264 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
265 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
266 _In_ int skip = 0,
267 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
268 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
269 {
270 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
271 }
272
286 template <class T_to, class T_from>
287 size_t sgml2strcat(
288 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
289 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
290 _In_ int skip = 0,
291 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
292 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
293 {
294 _Assume_(dst || !count_dst);
295 _Assume_(src || !count_src);
296
297 static const std::invalid_argument buffer_overrun("buffer overrun");
298 const bool
299 skip_quot = (skip & sgml_quot) == 0,
300 skip_apos = (skip & sgml_apos) == 0,
301 skip_amp = (skip & sgml_amp) == 0,
302 skip_lt_gt = (skip & sgml_lt_gt) == 0,
303 skip_bsol = (skip & sgml_bsol) == 0,
304 skip_dollar = (skip & sgml_dollar) == 0,
305 skip_percnt = (skip & sgml_percnt) == 0,
306 skip_commat = (skip & sgml_commat) == 0,
307 skip_num = (skip & sgml_num) == 0,
308 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
309 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
310 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
311
312 size_t j = strnlen(dst, count_dst);
313 count_src = strnlen(src, count_src);
314 for (size_t i = 0; i < count_src;) {
315 if (src[i] == '&') {
316 auto end = sgmlend(&src[i + 1], count_src - i - 1);
317 if (end) {
318 utf32_t chr32[2];
319 T_to chr[5];
320 size_t n = end - src - i - 1;
321 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
322 if (entity_w &&
323 (skip_quot || (entity_w[0] != '"')) &&
324 (skip_apos || (entity_w[0] != '\'')) &&
325 (skip_amp || (entity_w[0] != '&')) &&
326 (skip_lt_gt || (entity_w[0] != '<' && entity_w[0] != '>')) &&
327 (skip_bsol || (entity_w[0] != '\\')) &&
328 (skip_dollar || (entity_w[0] != '$')) &&
329 (skip_percnt || (entity_w[0] != '%')) &&
330 (skip_commat || (entity_w[0] != '@')) &&
331 (skip_num || (entity_w[0] != '#')) &&
332 (skip_lpar_rpar || (entity_w[0] != '(' && entity_w[0] != ')')) &&
333 (skip_lcub_rcub || (entity_w[0] != '{' && entity_w[0] != '}')) &&
334 (skip_lsqb_rsqb || (entity_w[0] != '[' && entity_w[0] != ']')))
335 {
336 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
337 size_t m = strlen(entity_w);
338 if (j + m >= count_dst)
339 throw buffer_overrun;
340 memcpy(dst + j, entity_w, m * sizeof(*entity_w)); j += m;
341 i = end - src + 1;
342 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
343 continue;
344 }
345 }
346 }
347 if (j + 1 >= count_dst)
348 throw buffer_overrun;
349 dst[j++] = src[i++];
350 }
351 if (j >= count_dst)
352 throw buffer_overrun;
353 dst[j] = 0;
354 return j;
355 }
356
367 template <class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
368 void sgml2strcpy(
369 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
370 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
371 _In_ int skip = 0,
372 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
373 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
374 {
375 dst.clear();
376 if (map)
377 map->clear();
378 sgml2strcat(dst, src, count_src, skip, offset, map);
379 }
380
390 template<class T_to, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
391 void sgml2strcpy(
392 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
393 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
394 _In_ int skip = 0,
395 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
396 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
397 {
398 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
399 }
400
414 template <class T_to, class T_from>
415 size_t sgml2strcpy(
416 _Inout_cap_(count_dst) T_to* dst, _In_ size_t count_dst,
417 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
418 _In_ int skip = 0,
419 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
420 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
421 {
422 _Assume_(dst || !count_dst);
423 if (count_dst)
424 dst[0] = 0;
425 if (map)
426 map->clear();
427 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
428 }
429
441 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>>
442 std::basic_string<T_to, TR_to, AX_to> sgml2str(
443 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
444 _In_ int skip = 0,
445 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
446 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
447 {
448 std::basic_string<T_to, TR_to, AX_to> dst;
449 sgml2strcat(dst, src, count_src, skip, offset, map);
450 return dst;
451 }
452
463 template <class T_to = wchar_t, class T_from, class TR_to = std::char_traits<T_to>, class AX_to = std::allocator<T_to>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
464 std::basic_string<T_to, TR_to, AX_to> sgml2str(
465 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
466 _In_ int skip = 0,
467 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
468 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
469 {
470 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
471 }
472
474 inline const char* chr2sgml(_In_reads_or_z_(count) const utf16_t* entity, _In_ size_t count)
475 {
476 _Assume_(entity && count);
477
478 utf32_t e2;
479 size_t offset;
480 if (count < 2 || !is_surrogate_pair(entity)) {
481 e2 = static_cast<utf32_t>(entity[0]);
482 offset = 1;
483 }
484 else {
485 e2 = surrogate_pair_to_ucs4(entity);
486 offset = 2;
487 }
488 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
489 size_t m = (i + j) / 2;
490 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
491 if (e1 < e2)
492 i = m + 1;
493 else if (e1 > e2)
494 j = m;
495 else {
496 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
497 if (r < 0)
498 i = m + 1;
499 else if (r > 0)
500 j = m;
501 else {
502 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
503 return sgml_unicode[unicode_sgml[m]].sgml;
504 }
505 }
506 }
507 return nullptr;
508 }
509
510 inline const char* chr2sgml(_In_reads_or_z_(count) const utf32_t* entity, _In_ size_t count)
511 {
512 _Assume_(entity && count);
513
514 utf32_t e2 = entity[0];
515 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
516 size_t m = (i + j) / 2;
517 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
518 if (e1 < e2)
519 i = m + 1;
520 else if (e1 > e2)
521 j = m;
522 else {
523 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
524 if (r < 0)
525 i = m + 1;
526 else if (r > 0)
527 j = m;
528 else {
529 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
530 return sgml_unicode[unicode_sgml[m]].sgml;
531 }
532 }
533 }
534 return nullptr;
535 }
536
537 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf16_t* src, _Inout_ size_t& i, _In_ size_t end)
538 {
539 _Assume_(i < end);
540 if (i + 1 >= end || !is_surrogate_pair(src + i))
541 return src[i++];
542
543 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
544 i += 2;
545 return unicode;
546 }
547
548 inline utf32_t wstr_to_utf32(_In_reads_(end) const utf32_t* src, _Inout_ size_t& i, _In_ size_t end)
549 {
550 _Unreferenced_(end);
551 _Assume_(i < end);
552 return src[i++];
553 }
555
564 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
565 void str2sgmlcat(
566 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
567 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
568 _In_ int what = 0)
569 {
570 _Assume_(src || !count_src);
571
572 const bool
573 do_ascii = (what & sgml_full) == 0,
574 do_quot = (what & sgml_quot) == 0,
575 do_apos = (what & sgml_apos) == 0,
576 do_lt_gt = (what & sgml_lt_gt) == 0,
577 do_bsol = (what & sgml_bsol) == 0,
578 do_dollar = (what & sgml_dollar) == 0,
579 do_percnt = (what & sgml_percnt) == 0,
580 do_commat = (what & sgml_commat) == 0,
581 do_num = (what & sgml_num) == 0,
582 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
583 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
584 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
585
586 count_src = strnlen(src, count_src);
587 dst.reserve(dst.size() + count_src);
588 for (size_t i = 0; i < count_src;) {
589 size_t n = glyphlen(src + i, count_src - i);
590 if (n == 1 &&
591 do_ascii && is7bit(src[i]) &&
592 src[i] != '&' &&
593 (do_quot || (src[i] != '"')) &&
594 (do_apos || (src[i] != '\'')) &&
595 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
596 (do_bsol || (src[i] != '\\')) &&
597 (do_dollar || (src[i] != '$')) &&
598 (do_percnt || (src[i] != '%')) &&
599 (do_commat || (src[i] != '@')) &&
600 (do_num || (src[i] != '#')) &&
601 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
602 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
603 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
604 {
605 // 7-bit ASCII and no desire to encode it as an SGML entity.
606 dst.append(1, static_cast<char>(src[i++]));
607 }
608 else {
609 const char* entity = chr2sgml(src + i, n);
610 if (entity) {
611 dst.append(1, '&');
612 dst.append(entity);
613 dst.append(1, ';');
614 i += n;
615 }
616 else if (n == 1) {
617 // Trivial character (1 code unit, 1 glyph), no entity available.
618 if (is7bit(src[i]))
619 dst.append(1, static_cast<char>(src[i++]));
620 else {
621 char tmp[3 + 8 + 1 + 1];
622 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
623 dst.append(tmp);
624 }
625 }
626 else {
627 // Non-trivial character. Decompose.
628 const size_t end = i + n;
629 while (i < end) {
630 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
631 dst.append(1, '&');
632 dst.append(entity);
633 dst.append(1, ';');
634 i++;
635 }
636 else if (is7bit(src[i]))
637 dst.append(1, static_cast<char>(src[i++]));
638 else {
639 char tmp[3 + 8 + 1 + 1];
640 snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
641 dst.append(tmp);
642 }
643 }
644 }
645 }
646 }
647 }
648
656 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
657 void str2sgmlcat(
658 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
659 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
660 _In_ int what = 0)
661 {
662 str2sgmlcat(dst, src.data(), src.size(), what);
663 }
664
676 template <class T_from>
677 size_t str2sgmlcat(
678 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
679 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
680 _In_ int what = 0)
681 {
682 _Assume_(dst || !count_dst);
683 _Assume_(src || !count_src);
684
685 static const std::invalid_argument buffer_overrun("buffer overrun");
686 const bool
687 do_ascii = (what & sgml_full) == 0,
688 do_quot = (what & sgml_quot) == 0,
689 do_apos = (what & sgml_apos) == 0,
690 do_lt_gt = (what & sgml_lt_gt) == 0,
691 do_bsol = (what & sgml_bsol) == 0,
692 do_dollar = (what & sgml_dollar) == 0,
693 do_percnt = (what & sgml_percnt) == 0,
694 do_commat = (what & sgml_commat) == 0,
695 do_num = (what & sgml_num) == 0,
696 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
697 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
698 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
699
700 size_t j = strnlen(dst, count_dst);
701 count_src = strnlen(src, count_src);
702 for (size_t i = 0; i < count_src;) {
703 size_t n = glyphlen(src + i, count_src - i);
704 if (n == 1 &&
705 do_ascii && is7bit(src[i]) &&
706 src[i] != '&' &&
707 (do_quot || (src[i] != '"')) &&
708 (do_apos || (src[i] != '\'')) &&
709 (do_lt_gt || (src[i] != '<' && src[i] != '>')) &&
710 (do_bsol || (src[i] != '\\')) &&
711 (do_dollar || (src[i] != '$')) &&
712 (do_percnt || (src[i] != '%')) &&
713 (do_commat || (src[i] != '@')) &&
714 (do_num || (src[i] != '#')) &&
715 (do_lpar_rpar || (src[i] != '(' && src[i] != ')')) &&
716 (do_lcub_rcub || (src[i] != '{' && src[i] != '}')) &&
717 (do_lsqb_rsqb || (src[i] != '[' && src[i] != ']')))
718 {
719 // 7-bit ASCII and no desire to encode it as an SGML entity.
720 if (j + 1 >= count_dst)
721 throw buffer_overrun;
722 dst[j++] = static_cast<char>(src[i++]);
723 }
724 else {
725 const char* entity = chr2sgml(src + i, n);
726 if (entity) {
727 size_t m = strlen(entity);
728 if (j + m + 2 >= count_dst)
729 throw buffer_overrun;
730 dst[j++] = '&';
731 memcpy(dst + j, entity, m * sizeof(char)); j += m;
732 dst[j++] = ';';
733 i += n;
734 }
735 else if (n == 1) {
736 // Trivial character (1 code unit, 1 glyph), no entity available.
737 if (is7bit(src[i])) {
738 if (j + 1 >= count_dst)
739 throw buffer_overrun;
740 dst[j++] = static_cast<char>(src[i++]);
741 }
742 else {
743 char tmp[3 + 8 + 1 + 1];
744 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(src[i++]));
745 _Assume_(m >= 0);
746 if (static_cast<size_t>(m) >= count_dst)
747 throw buffer_overrun;
748 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
749 j += static_cast<size_t>(m);
750 }
751 }
752 else {
753 // Non-trivial character. Decompose.
754 const size_t end = i + n;
755 while (i < end) {
756 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
757 size_t m = strlen(entity);
758 if (j + m + 2 >= count_dst)
759 throw buffer_overrun;
760 dst[j++] = '&';
761 memcpy(dst + j, entity, m * sizeof(char)); j += m;
762 dst[j++] = ';';
763 i++;
764 }
765 else if (is7bit(src[i])) {
766 if (j + 1 >= count_dst)
767 throw buffer_overrun;
768 dst[j++] = static_cast<char>(src[i++]);
769 }
770 else {
771 char tmp[3 + 8 + 1 + 1];
772 int m = snprintf(tmp, _countof(tmp), "&#x%x;", static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
773 _Assume_(m >= 0);
774 if (static_cast<size_t>(m) >= count_dst)
775 throw buffer_overrun;
776 memcpy(dst + j, tmp, static_cast<size_t>(m) * sizeof(char));
777 j += static_cast<size_t>(m);
778 }
779 }
780 }
781 }
782 }
783 if (j >= count_dst)
784 throw buffer_overrun;
785 dst[j] = 0;
786 return j;
787 }
788
797 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>>
798 void str2sgmlcpy(
799 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
800 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
801 _In_ int what = 0)
802 {
803 dst.clear();
804 str2sgmlcat(dst, src, count_src, what);
805 }
806
814 template <class T_from, class TR_to = std::char_traits<char>, class AX_to = std::allocator<char>, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
815 void str2sgmlcpy(
816 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
817 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
818 _In_ int what = 0)
819 {
820 str2sgmlcpy(dst, src.data(), src.size(), what);
821 }
822
834 template <class T_from>
835 size_t str2sgmlcpy(
836 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
837 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
838 _In_ int what = 0)
839 {
840 _Assume_(dst || !count_dst);
841 if (count_dst)
842 dst[0] = 0;
843 return str2sgmlcat(dst, count_dst, src, count_src, what);
844 }
845
855 template <class T_from>
856 std::string str2sgml(
857 _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src,
858 _In_ int what = 0)
859 {
860 std::string dst;
861 str2sgmlcat(dst, src, count_src, what);
862 return dst;
863 }
864
873 template <class T_from, class TR_from = std::char_traits<T_from>, class AX_from = std::allocator<T_from>>
874 std::string str2sgml(
875 _In_ const std::basic_string<T_from, TR_from, AX_from>& src,
876 _In_ int what = 0)
877 {
878 return str2sgml(src.data(), src.size(), what);
879 }
880}
881
882#if defined(__GNUC__)
883#pragma GCC diagnostic pop
884#endif