10#include "sgml_unicode.hpp"
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
/// Looks up an SGML entity name in the `sgml_unicode` table and returns its wide-character string.
///
/// \param[in] entity  Entity name to look up. Must not be null; when `count` is 2 or more it must not
///                    begin with '#' (both conditions are asserted below).
/// \param[in] count   Number of characters in `entity`. Must be non-zero.
///
/// \returns The `unicode` member of the matching table entry. The not-found path is on lines that are
///          not part of this excerpt.
26 const wchar_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count)
28 _Assume_(entity && count);
29 _Assume_(count < 2 || entity[0] !=
'#');
// Probe the middle element of the index range bounded by i and j.
31 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
// The first characters are compared directly; the string comparison below only covers the remainder.
33 if (sgml_unicode[m].sgml[0] < entity[0])
35 else if (sgml_unicode[m].sgml[0] > entity[0])
// First characters are equal: compare the rest of the table name against the rest of `entity`.
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
// Step back while the preceding entry (still above the lower bound i) also compares equal,
// so that the lowest-indexed equal entry is the one returned.
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return sgml_unicode[m].unicode;
/// Scans up to `count` characters of `str`, one character per iteration.
/// NOTE(review): the function name and return statements are not part of this excerpt; callers in this
/// file use `sgmlend(ptr, count)` with matching arguments to find the end of an entity — confirm.
///
/// \param[in] str    Text to scan. May be null only when `count` is zero (asserted below).
/// \param[in] count  Maximum number of characters to examine.
54 _In_reads_or_z_opt_(count)
const T* str, _In_
size_t count)
56 _Assume_(str || !count);
57 for (
size_t i = 0; i < count; i++) {
// NUL, '&' and whitespace are tested together; the action taken for them is on a line not shown
// (presumably they end the scan because they cannot appear inside an entity — TODO confirm).
60 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
/// \name SGML conversion flags
/// Bit flags that are OR-ed together into the `int` masks (`skip`, `what`) taken by the conversion
/// functions in this file. Declared `inline` so that every translation unit including this header
/// shares a single definition of each constant.
/// @{
inline constexpr int sgml_full      = 0x40000000; ///< When set in `what`, 7-bit characters are no longer passed through unescaped
inline constexpr int sgml_quot      = 0x00000001; ///< Selects the `"` character
inline constexpr int sgml_apos      = 0x00000002; ///< Selects the `'` character
inline constexpr int sgml_quot_apos = sgml_quot | sgml_apos; ///< Selects both `"` and `'`
inline constexpr int sgml_amp       = 0x00000004; ///< Selects the `&` character
inline constexpr int sgml_lt_gt     = 0x00000008; ///< Selects the `<` and `>` characters
inline constexpr int sgml_bsol      = 0x00000010; ///< Selects the `\` character
inline constexpr int sgml_dollar    = 0x00000020; ///< Selects the `$` character
inline constexpr int sgml_percnt    = 0x00000040; ///< Selects the `%` character
inline constexpr int sgml_commat    = 0x00000080; ///< Selects the `@` character
inline constexpr int sgml_num       = 0x00000100; ///< Selects the `#` character
inline constexpr int sgml_lpar_rpar = 0x00000200; ///< Selects the `(` and `)` characters
inline constexpr int sgml_lcub_rcub = 0x00000400; ///< Selects the `{` and `}` characters
inline constexpr int sgml_lsqb_rsqb = 0x00000800; ///< Selects the `[` and `]` characters
inline constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;                 ///< Combination: `&`, `<`, `>`
inline constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;             ///< Combination: `&`, `"`, `'`
inline constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos; ///< Combination: `&`, `\`, `"`, `'`
/// @}
/// Walks `src` and examines `&...;` entity references and, unless `sgml_full` is set in `what`,
/// characters that are not 7-bit.
/// NOTE(review): the function name, the `what` parameter declaration and all return statements are not
/// part of this excerpt; this looks like a validator that reports the first offending position — confirm.
///
/// \param[in] src        Text to examine. May be null only when `count_src` is zero (asserted below).
/// \param[in] count_src  Maximum number of characters of `src` to examine; the loop also stops at a NUL.
95 template <
class T_from>
97 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
100 _Assume_(src || !count_src);
// do_ascii is true when the sgml_full bit is clear in `what`.
103 do_ascii = (what & sgml_full) == 0;
105 for (
size_t i = 0; i < count_src && src[i];) {
// Search for the end of an entity in the text that follows src[i].
107 auto end = sgmlend(src + i + 1, count_src - i - 1);
109 const wchar_t* entity_w;
// n = number of characters strictly between src[i] and `end`.
111 size_t n = end - src - i - 1;
// Numeric character reference: "#" followed by digits, or "#x"/"#X" followed by hex digits.
112 if (n >= 2 && src[i + 1] ==
'#') {
114 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
115 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
117 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Code points below 0x10000 fit a single wchar_t; others go through ucs4_to_surrogate_pair().
// The second plain assignment further down presumably belongs to an alternate preprocessor
// branch whose directives are not shown — TODO confirm.
119 if (unicode < 0x10000) {
120 chr[0] = (wchar_t)unicode;
124 ucs4_to_surrogate_pair(chr, unicode);
128 chr[0] = (wchar_t)unicode;
// Named entity: look it up in the entity table.
134 entity_w = sgml2uni(src + i + 1, n);
// Non-7-bit characters are only of interest while do_ascii is true.
149 if (do_ascii && !is7bit(src[i])) {
/// Appends SGML text to a wide string, replacing `&...;` entity references with the characters they denote.
///
/// \param[in,out] dst        String the converted text is appended to.
/// \param[in]     src        SGML source text. May be null only when `count_src` is zero (asserted below).
/// \param[in]     count_src  Maximum number of characters of `src` to convert; reduced to strnlen(src, count_src).
/// \param[in]     skip       (declared on a line not shown) Bit mask of `sgml_*` flags, see the flag tests below.
/// \param[in]     offset     Base offsets added to the source (`from`) and destination (`to`) positions
///                           recorded in `map`.
/// \param[in,out] map        Optional vector receiving source/destination offset pairs around each
///                           converted entity; ignored when null.
169 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>>
171 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
172 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
174 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
175 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
177 _Assume_(src || !count_src);
// Each skip_xxx is true when the corresponding bit is clear in `skip`. In the entity test further
// down, a true flag lets entities that decode to that character through without a character check.
180 skip_quot = (skip & sgml_quot) == 0,
181 skip_apos = (skip & sgml_apos) == 0,
182 skip_amp = (skip & sgml_amp) == 0,
183 skip_lt_gt = (skip & sgml_lt_gt) == 0,
184 skip_bsol = (skip & sgml_bsol) == 0,
185 skip_dollar = (skip & sgml_dollar) == 0,
186 skip_percnt = (skip & sgml_percnt) == 0,
187 skip_commat = (skip & sgml_commat) == 0,
188 skip_num = (skip & sgml_num) == 0,
189 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
190 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
191 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// Work with the effective source length and grow the destination once up front.
193 count_src = strnlen(src, count_src);
194 dst.reserve(dst.size() + count_src);
195 for (
size_t i = 0; i < count_src;) {
// Search for the end of an entity in the text that follows src[i].
197 auto end = sgmlend(src + i + 1, count_src - i - 1);
199 const wchar_t* entity_w;
// n = number of characters strictly between src[i] and `end`.
201 _Assume_(src + i + 1 <= end);
202 size_t n =
static_cast<size_t>(end - src) - i - 1;
// Numeric character reference: "#" followed by digits, or "#x"/"#X" followed by hex digits.
203 if (n >= 2 && src[i + 1] ==
'#') {
205 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
206 unicode =
static_cast<utf32_t
>(strtou32(src + i + 3, n - 2,
nullptr, 16));
208 unicode =
static_cast<utf32_t
>(strtou32(src + i + 2, n - 1,
nullptr, 10));
// Code points below 0x10000 fit a single wchar_t; others go through ucs4_to_surrogate_pair().
// The second plain assignment further down presumably belongs to an alternate preprocessor
// branch whose directives are not shown — TODO confirm.
210 if (unicode < 0x10000) {
211 chr[0] = (wchar_t)unicode;
215 ucs4_to_surrogate_pair(chr, unicode);
219 chr[0] = (wchar_t)unicode;
// Named entity: look it up in the entity table.
225 entity_w = sgml2uni(src + i + 1, n);
// The entity is substituted only if, for every character class, either the class's skip flag is
// true or the decoded text does not start with a character of that class.
228 (skip_quot || (entity_w[0] != L
'"')) &&
229 (skip_apos || (entity_w[0] != L
'\'')) &&
230 (skip_amp || (entity_w[0] != L
'&')) &&
231 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
232 (skip_bsol || (entity_w[0] != L
'\\')) &&
233 (skip_dollar || (entity_w[0] != L
'$')) &&
234 (skip_percnt || (entity_w[0] != L
'%')) &&
235 (skip_commat || (entity_w[0] != L
'@')) &&
236 (skip_num || (entity_w[0] != L
'#')) &&
237 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
238 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
239 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record the offsets where the entity starts, append its decoded text, move the source index
// past the character at `end`, then record the offsets where conversion resumes.
241 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
242 dst.append(entity_w);
243 _Assume_(src <= end);
244 i =
static_cast<size_t>(end - src) + 1;
245 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
// Otherwise copy a single source character to the destination unchanged.
250 dst.append(1, src[i++]);
/// Overload taking the SGML source as a std::basic_string; forwards to the pointer + length form
/// using `src.data()` and `src.size()`.
///
/// \param[in,out] dst     String the converted text is appended to.
/// \param[in]     src     SGML source text.
/// \param[in]     skip    (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset  Forwarded unchanged.
/// \param[in,out] map     Forwarded unchanged.
263 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
265 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
266 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
268 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
269 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
271 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
/// Appends SGML text to a fixed-size wide-character buffer, replacing `&...;` entity references with
/// the characters they denote.
///
/// \param[in,out] dst        Destination buffer; new text is written after its current contents
///                           (the start offset is strnlen(dst, count_dst)).
/// \param[in]     count_dst  Capacity of `dst` in characters.
/// \param[in]     src        SGML source text. May be null only when `count_src` is zero (asserted below).
/// \param[in]     count_src  Maximum number of characters of `src` to convert; reduced to strnlen(src, count_src).
/// \param[in]     skip       (declared on a line not shown) Bit mask of `sgml_*` flags, see the flag tests below.
/// \param[in]     offset     Base offsets added to the positions recorded in `map`.
/// \param[in,out] map        Optional vector receiving source/destination offset pairs around each
///                           converted entity; ignored when null.
///
/// \throws std::invalid_argument ("buffer overrun") when the converted text does not fit in `dst`.
287 template <
class T_from>
289 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
290 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
292 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
293 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
295 _Assume_(dst || !count_dst);
296 _Assume_(src || !count_src);
// Function-local static exception object; every overflow path below throws a copy of it.
298 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each skip_xxx is true when the corresponding bit is clear in `skip`. In the entity test further
// down, a true flag lets entities that decode to that character through without a character check.
300 skip_quot = (skip & sgml_quot) == 0,
301 skip_apos = (skip & sgml_apos) == 0,
302 skip_amp = (skip & sgml_amp) == 0,
303 skip_lt_gt = (skip & sgml_lt_gt) == 0,
304 skip_bsol = (skip & sgml_bsol) == 0,
305 skip_dollar = (skip & sgml_dollar) == 0,
306 skip_percnt = (skip & sgml_percnt) == 0,
307 skip_commat = (skip & sgml_commat) == 0,
308 skip_num = (skip & sgml_num) == 0,
309 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
310 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
311 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// j is the write offset in dst; it starts at the end of the text already stored there.
313 size_t j = strnlen(dst, count_dst);
314 count_src = strnlen(src, count_src);
315 for (
size_t i = 0; i < count_src;) {
// Search for the end of an entity in the text that follows src[i].
317 auto end = sgmlend(src + i + 1, count_src - i - 1);
319 const wchar_t* entity_w;
// n = number of characters strictly between src[i] and `end`.
321 size_t n = end - src - i - 1;
// Numeric character reference: "#" followed by digits, or "#x"/"#X" followed by hex digits.
322 if (n >= 2 && src[i + 1] ==
'#') {
324 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
325 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
327 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Code points below 0x10000 fit a single wchar_t; others go through ucs4_to_surrogate_pair().
// The second plain assignment further down presumably belongs to an alternate preprocessor
// branch whose directives are not shown — TODO confirm.
329 if (unicode < 0x10000) {
330 chr[0] = (wchar_t)unicode;
334 ucs4_to_surrogate_pair(chr, unicode);
338 chr[0] = (wchar_t)unicode;
// Named entity: look it up in the entity table.
344 entity_w = sgml2uni(src + i + 1, n);
// The entity is substituted only if, for every character class, either the class's skip flag is
// true or the decoded text does not start with a character of that class.
347 (skip_quot || (entity_w[0] != L
'"')) &&
348 (skip_apos || (entity_w[0] != L
'\'')) &&
349 (skip_amp || (entity_w[0] != L
'&')) &&
350 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
351 (skip_bsol || (entity_w[0] != L
'\\')) &&
352 (skip_dollar || (entity_w[0] != L
'$')) &&
353 (skip_percnt || (entity_w[0] != L
'%')) &&
354 (skip_commat || (entity_w[0] != L
'@')) &&
355 (skip_num || (entity_w[0] != L
'#')) &&
356 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
357 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
358 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record the offsets where the entity starts, then copy the decoded text. The `>=` in the
// capacity check keeps at least one character of dst free beyond the copied text.
360 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
361 size_t m = wcslen(entity_w);
362 if (j + m >= count_dst)
363 throw buffer_overrun;
364 memcpy(dst + j, entity_w, m *
sizeof(
wchar_t)); j += m;
366 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
// Capacity check for a single character; the statement that writes it is on a line not shown.
371 if (j + 1 >= count_dst)
372 throw buffer_overrun;
// Final overflow path; its guarding condition is on a line not shown.
376 throw buffer_overrun;
/// Converts SGML text into `dst` by delegating to sgml2strcat().
/// NOTE(review): the function name and the statements between the signature and the call are not part
/// of this excerpt; presumably `dst` (and `map`) are cleared first so the result replaces the previous
/// contents — confirm.
///
/// \param[in,out] dst        Destination string.
/// \param[in]     src        SGML source text.
/// \param[in]     count_src  Maximum number of characters of `src` to convert.
/// \param[in]     skip       (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset     Forwarded unchanged.
/// \param[in,out] map        Forwarded unchanged.
391 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>>
393 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
394 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
396 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
397 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
402 sgml2strcat(dst, src, count_src, skip, offset, map);
/// Overload taking the SGML source as a std::basic_string; forwards to the pointer + length form of
/// sgml2strcpy() using `src.data()` and `src.size()`.
///
/// \param[in,out] dst     Destination string.
/// \param[in]     src     SGML source text.
/// \param[in]     skip    (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset  Forwarded unchanged.
/// \param[in,out] map     Forwarded unchanged.
414 template<
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
416 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
417 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
419 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
420 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
422 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
/// Converts SGML text into a fixed-size wide-character buffer by delegating to the buffer form of
/// sgml2strcat() and returning its result.
/// NOTE(review): the function name and the statements between the assertion and the return are not
/// part of this excerpt; presumably the buffer is reset to an empty string first — confirm.
///
/// \param[in,out] dst        Destination buffer. May be null only when `count_dst` is zero (asserted below).
/// \param[in]     count_dst  Capacity of `dst` in characters.
/// \param[in]     src        SGML source text.
/// \param[in]     count_src  Maximum number of characters of `src` to convert.
/// \param[in]     skip       (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset     Forwarded unchanged.
/// \param[in,out] map        Forwarded unchanged.
///
/// \returns Whatever sgml2strcat() returns for the same arguments.
438 template <
class T_from>
440 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
441 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
443 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
444 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
446 _Assume_(dst || !count_dst);
451 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
/// Converts SGML text to a std::wstring. The conversion itself is done by sgml2strcat() on a local
/// `dst` (its declaration and the return statement are on lines not shown).
///
/// \param[in]     src        SGML source text.
/// \param[in]     count_src  Maximum number of characters of `src` to convert.
/// \param[in]     skip       (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset     Forwarded unchanged.
/// \param[in,out] map        Forwarded unchanged.
465 template <
class T_from>
466 std::wstring sgml2str(
467 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
469 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
470 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
473 sgml2strcat(dst, src, count_src, skip, offset, map);
/// Overload taking the SGML source as a std::basic_string; returns the result of the pointer + length
/// form called with `src.data()` and `src.size()`.
///
/// \param[in]     src     SGML source text.
/// \param[in]     skip    (declared on a line not shown) Forwarded unchanged.
/// \param[in]     offset  Forwarded unchanged.
/// \param[in,out] map     Forwarded unchanged.
487 template <
class T_from,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
488 std::wstring sgml2str(
489 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
491 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
492 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
494 return sgml2str(src.data(), src.size(), skip, offset, map);
/// Looks up the SGML entity name for a wide-character sequence.
/// The search runs over `unicode_sgml`, whose elements are used as indices into `sgml_unicode`.
///
/// \param[in] entity  Wide-character sequence to look up. Must not be null (asserted below).
/// \param[in] count   Number of characters in `entity`. Must be non-zero (asserted below).
///
/// \returns The `sgml` member of the matching table entry. The not-found path is on lines that are not
///          part of this excerpt; callers in this file compare the result against nullptr.
498 inline const char* chr2sgml(_In_reads_or_z_(count)
const wchar_t* entity, _In_
size_t count)
500 _Assume_(entity && count);
// e2 = first character of the key; e1 = first character of the probed table entry.
502 const wchar_t e2 = entity[0];
503 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
504 size_t m = (i + j) / 2;
505 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// Compare the characters after the first one (the comparison of e1 with e2 is on lines not shown).
511 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
// Step back while the preceding entry (still above the lower bound i) also matches,
// so that the lowest-indexed matching entry is the one returned.
517 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
518 return sgml_unicode[unicode_sgml[m]].sgml;
/// Appends wide-character text to a narrow string, writing characters either unchanged, as named SGML
/// entities, or as hexadecimal numeric references of the form "&#x...;".
///
/// \param[in,out] dst        String the converted text is appended to.
/// \param[in]     src        Wide-character source text. May be null only when `count_src` is zero (asserted below).
/// \param[in]     count_src  Maximum number of characters of `src` to convert; reduced to strnlen(src, count_src).
/// \param[in]     what       (declared on a line not shown) Bit mask of `sgml_*` flags, see the flag tests below.
534 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
535 inline void str2sgmlcat(
536 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
537 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
540 _Assume_(src || !count_src);
// Each do_xxx is true when the corresponding bit is clear in `what`. do_ascii must be true for any
// character to take the pass-through test below; the other flags, when true, exempt their character
// class from the "must not be this character" part of that test.
543 do_ascii = (what & sgml_full) == 0,
544 do_quot = (what & sgml_quot) == 0,
545 do_apos = (what & sgml_apos) == 0,
546 do_lt_gt = (what & sgml_lt_gt) == 0,
547 do_bsol = (what & sgml_bsol) == 0,
548 do_dollar = (what & sgml_dollar) == 0,
549 do_percnt = (what & sgml_percnt) == 0,
550 do_commat = (what & sgml_commat) == 0,
551 do_num = (what & sgml_num) == 0,
552 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
553 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
554 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// Work with the effective source length and grow the destination once up front.
556 count_src = strnlen(src, count_src);
557 dst.reserve(dst.size() + count_src);
558 for (
size_t i = 0; i < count_src;) {
// n = result of glyphlen() for the text at src[i] (presumably the number of wchar_t units that
// form one glyph — confirm against glyphlen()).
559 size_t n = glyphlen(src + i, count_src - i);
// Pass-through test: a 7-bit character that is not excluded by any flag is copied as-is.
561 do_ascii && is7bit(src[i]) &&
563 (do_quot || (src[i] != L
'"')) &&
564 (do_apos || (src[i] != L
'\'')) &&
565 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
566 (do_bsol || (src[i] != L
'\\')) &&
567 (do_dollar || (src[i] != L
'$')) &&
568 (do_percnt || (src[i] != L
'%')) &&
569 (do_commat || (src[i] != L
'@')) &&
570 (do_num || (src[i] != L
'#')) &&
571 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
572 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
573 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
576 dst.append(1,
static_cast<char>(src[i++]));
// Try to find a named entity for the whole n-character glyph.
579 const char* entity = chr2sgml(src + i, n);
589 dst.append(1,
static_cast<char>(src[i++]));
// Buffer sized for "&#x" (3) + up to 8 hex digits + ";" (1) + terminating NUL (1).
// NOTE(review): src[i] is passed to "%x" as a wchar_t here, whereas the other snprintf call in this
// function casts its argument to unsigned int — confirm the argument type matches the format.
591 char tmp[3 + 8 + 1 + 1];
592 snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// Process the characters of the glyph, up to index `end`, one at a time: named entity if one
// exists for the single character, the raw character if it is 7-bit, otherwise a numeric reference.
598 const size_t end = i + n;
600 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
606 else if (is7bit(src[i]))
607 dst.append(1,
static_cast<char>(src[i++]));
// A surrogate pair that lies fully inside the glyph is combined into one code point first.
611 if (i + 1 < end && is_surrogate_pair(src + i)) {
612 unicode = surrogate_pair_to_ucs4(src + i);
620 char tmp[3 + 8 + 1 + 1];
621 snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(unicode));
/// Overload taking the source as a wide string view; forwards to the pointer + length form of
/// str2sgmlcat() using `src.data()` and `src.size()`.
///
/// \param[in,out] dst   String the converted text is appended to.
/// \param[in]     src   Wide-character source text.
/// \param[in]     what  (declared on a line not shown) Forwarded unchanged.
637 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
639 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
640 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
643 str2sgmlcat(dst, src.data(), src.size(), what);
/// Appends wide-character text to a fixed-size narrow buffer, writing characters either unchanged, as
/// named SGML entities, or as hexadecimal numeric references of the form "&#x...;".
///
/// \param[in,out] dst        Destination buffer; new text is written after its current contents
///                           (the start offset is strnlen(dst, count_dst)).
/// \param[in]     count_dst  Capacity of `dst` in characters.
/// \param[in]     src        Wide-character source text. May be null only when `count_src` is zero (asserted below).
/// \param[in]     count_src  Maximum number of characters of `src` to convert; reduced to strnlen(src, count_src).
/// \param[in]     what       (declared on a line not shown) Bit mask of `sgml_*` flags, see the flag tests below.
///
/// \returns A size_t; the return statement is on a line not shown.
///
/// \throws std::invalid_argument ("buffer overrun") when the converted text does not fit in `dst`.
657 inline size_t str2sgmlcat(
658 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
659 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
662 _Assume_(dst || !count_dst);
663 _Assume_(src || !count_src);
// Function-local static exception object; every overflow path below throws a copy of it.
665 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each do_xxx is true when the corresponding bit is clear in `what`. do_ascii must be true for any
// character to take the pass-through test below; the other flags, when true, exempt their character
// class from the "must not be this character" part of that test.
667 do_ascii = (what & sgml_full) == 0,
668 do_quot = (what & sgml_quot) == 0,
669 do_apos = (what & sgml_apos) == 0,
670 do_lt_gt = (what & sgml_lt_gt) == 0,
671 do_bsol = (what & sgml_bsol) == 0,
672 do_dollar = (what & sgml_dollar) == 0,
673 do_percnt = (what & sgml_percnt) == 0,
674 do_commat = (what & sgml_commat) == 0,
675 do_num = (what & sgml_num) == 0,
676 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
677 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
678 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// j is the write offset in dst; it starts at the end of the text already stored there.
680 size_t j = strnlen(dst, count_dst);
681 count_src = strnlen(src, count_src);
682 for (
size_t i = 0; i < count_src;) {
// n = result of glyphlen() for the text at src[i] (presumably the number of wchar_t units that
// form one glyph — confirm against glyphlen()).
683 size_t n = glyphlen(src + i, count_src - i);
// Pass-through test: a 7-bit character that is not excluded by any flag is copied as-is.
685 do_ascii && is7bit(src[i]) &&
687 (do_quot || (src[i] != L
'"')) &&
688 (do_apos || (src[i] != L
'\'')) &&
689 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
690 (do_bsol || (src[i] != L
'\\')) &&
691 (do_dollar || (src[i] != L
'$')) &&
692 (do_percnt || (src[i] != L
'%')) &&
693 (do_commat || (src[i] != L
'@')) &&
694 (do_num || (src[i] != L
'#')) &&
695 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
696 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
697 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
// The `>=` keeps at least one character of dst free beyond the one written here.
700 if (j + 1 >= count_dst)
701 throw buffer_overrun;
702 dst[j++] =
static_cast<char>(src[i++]);
// Try to find a named entity for the whole n-character glyph.
705 const char* entity = chr2sgml(src + i, n);
// The "+ 2" presumably reserves room for the '&' and ';' delimiters, which are written on lines
// not shown — confirm.
707 size_t m = strlen(entity);
708 if (j + m + 2 >= count_dst)
709 throw buffer_overrun;
711 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
717 if (is7bit(src[i])) {
718 if (j + 1 >= count_dst)
719 throw buffer_overrun;
720 dst[j++] =
static_cast<char>(src[i++]);
// Buffer sized for "&#x" (3) + up to 8 hex digits + ";" (1) + terminating NUL (1).
// NOTE(review): src[i] is passed to "%x" as a wchar_t here, whereas the other snprintf call in this
// function casts its argument to unsigned int — confirm the argument type matches the format.
723 char tmp[3 + 8 + 1 + 1];
724 int m = snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// NOTE(review): this check compares only the formatted length `m` with `count_dst`; the write offset
// `j` is not included, unlike every other capacity check in this function (`j + ... >= count_dst`).
// When j > 0 the memcpy below can therefore write past the end of `dst`. Likely intended:
// `j + static_cast<size_t>(m) >= count_dst` — confirm and fix.
726 if (
static_cast<size_t>(m) >= count_dst)
727 throw buffer_overrun;
728 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
729 j +=
static_cast<size_t>(m);
// Process the characters of the glyph, up to index `end`, one at a time: named entity if one
// exists for the single character, the raw character if it is 7-bit, otherwise a numeric reference.
734 const size_t end = i + n;
736 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
737 size_t m = strlen(entity);
738 if (j + m + 2 >= count_dst)
739 throw buffer_overrun;
741 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
745 else if (is7bit(src[i])) {
746 if (j + 1 >= count_dst)
747 throw buffer_overrun;
748 dst[j++] =
static_cast<char>(src[i++]);
// A surrogate pair that lies fully inside the glyph is combined into one code point first.
753 if (i + 1 < end && is_surrogate_pair(src + i)) {
754 unicode = surrogate_pair_to_ucs4(src + i);
762 char tmp[3 + 8 + 1 + 1];
763 int m = snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(unicode));
// NOTE(review): same problem as the earlier numeric-reference check — `j` is missing from the
// comparison, so the memcpy below is not actually bounded by the remaining space in `dst`.
765 if (
static_cast<size_t>(m) >= count_dst)
766 throw buffer_overrun;
767 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
768 j +=
static_cast<size_t>(m);
// Final overflow path; its guarding condition is on a line not shown.
775 throw buffer_overrun;
/// Converts wide-character text to SGML in `dst` by delegating to str2sgmlcat().
/// NOTE(review): the statements between the signature and the call are not part of this excerpt;
/// presumably `dst` is cleared first so the result replaces the previous contents — confirm.
///
/// \param[in,out] dst        Destination string.
/// \param[in]     src        Wide-character source text.
/// \param[in]     count_src  Maximum number of characters of `src` to convert.
/// \param[in]     what       (declared on a line not shown) Forwarded unchanged.
788 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
789 inline void str2sgmlcpy(
790 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
791 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
795 str2sgmlcat(dst, src, count_src, what);
/// Overload taking the source as a wide string view; forwards to the pointer + length form of
/// str2sgmlcpy() using `src.data()` and `src.size()`.
///
/// \param[in,out] dst   Destination string.
/// \param[in]     src   Wide-character source text.
/// \param[in]     what  (declared on a line not shown) Forwarded unchanged.
805 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
807 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
808 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
811 str2sgmlcpy(dst, src.data(), src.size(), what);
/// Converts wide-character text to SGML in a fixed-size narrow buffer by delegating to the buffer form
/// of str2sgmlcat() and returning its result.
/// NOTE(review): the statements between the assertion and the return are not part of this excerpt;
/// presumably the buffer is reset to an empty string first — confirm.
///
/// \param[in,out] dst        Destination buffer. May be null only when `count_dst` is zero (asserted below).
/// \param[in]     count_dst  Capacity of `dst` in characters.
/// \param[in]     src        Wide-character source text.
/// \param[in]     count_src  Maximum number of characters of `src` to convert.
/// \param[in]     what       (declared on a line not shown) Forwarded unchanged.
///
/// \returns Whatever str2sgmlcat() returns for the same arguments.
825 inline size_t str2sgmlcpy(
826 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
827 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
830 _Assume_(dst || !count_dst);
833 return str2sgmlcat(dst, count_dst, src, count_src, what);
/// Converts wide-character text to an SGML std::string. The conversion itself is done by
/// str2sgmlcat() on a local `dst` (its declaration and the return statement are on lines not shown).
///
/// \param[in] src        Wide-character source text.
/// \param[in] count_src  Maximum number of characters of `src` to convert.
/// \param[in] what       (declared on a line not shown) Forwarded unchanged.
845 inline std::string str2sgml(
846 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
850 str2sgmlcat(dst, src, count_src, what);
/// Overload taking the source as a wide string view; returns the result of the pointer + length form
/// called with `src.data()` and `src.size()`.
///
/// \param[in] src   Wide-character source text.
/// \param[in] what  (declared on a line not shown) Forwarded unchanged.
862 inline std::string str2sgml(
863 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
866 return str2sgml(src.data(), src.size(), what);
871#pragma GCC diagnostic pop