10#include "sgml_unicode.hpp"
// Maps a named SGML entity to the `unicode` string stored in its row of the sgml_unicode table.
// `entity` must be non-null and `count` non-zero (see the _Assume_ lines below).
// NOTE(review): this view omits several original lines (branch bodies, braces and whatever
// follows the final return), so the not-found behaviour cannot be documented from here.
21 const wchar_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count)
23 _Assume_(entity && count);
// A leading '#' (numeric reference) is ruled out for inputs of two or more characters.
24 _Assume_(count < 2 || entity[0] !=
'#');
// Bisection over sgml_unicode: i..j is the candidate range and m its midpoint.
// NOTE(review): the statements that narrow i/j are on elided lines - confirm.
26 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
27 size_t m = (i + j) / 2;
28 if (sgml_unicode[m].sgml[0] < entity[0])
30 else if (sgml_unicode[m].sgml[0] > entity[0])
// First characters match: compare the rest of the table name against the rest of `entity`.
33 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
// Move m back while strncmp reports 0 for the preceding row too, so the earliest such row is used.
39 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
40 return sgml_unicode[m].unicode;
// NOTE(review): the declaration line carrying this function's name and return type is not in this
// view; the parameter list matches the `sgmlend(ptr, count)` calls elsewhere in the file - confirm.
// Scans up to `count` characters of `str`; `str` may be null only when `count` is 0.
49 _In_reads_or_z_opt_(count)
const T* str, _In_
size_t count)
51 _Assume_(str || !count);
52 for (
size_t i = 0; i < count; i++) {
// A NUL, an '&' or a whitespace character is singled out here (the branch body is elided).
55 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
// Bit flags tested against the `skip` / `what` arguments of the conversion routines in this file.
// The character named next to each flag is the one the routines compare against when testing it.

// When this bit is clear the routines that test it compute do_ascii = true.
62 constexpr int sgml_full = 0x80000000;
// '"'
63 constexpr int sgml_quot = 0x00000001;
// '\''
64 constexpr int sgml_apos = 0x00000002;
// Both quote characters.
65 constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
// '&'
66 constexpr int sgml_amp = 0x00000004;
// '<' and '>'
67 constexpr int sgml_lt_gt = 0x00000008;
// '\\'
68 constexpr int sgml_bsol = 0x00000010;
// '$'
69 constexpr int sgml_dollar = 0x00000020;
// '%'
70 constexpr int sgml_percnt = 0x00000040;
// '@'
71 constexpr int sgml_commat = 0x00000080;
// '#'
72 constexpr int sgml_num = 0x00000100;
// '(' and ')'
73 constexpr int sgml_lpar_rpar = 0x00000200;
// '{' and '}'
74 constexpr int sgml_lcub_rcub = 0x00000400;
// '[' and ']'
75 constexpr int sgml_lsqb_rsqb = 0x00000800;
// Predefined combinations of the flags above.
76 constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
77 constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
78 constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
// NOTE(review): the line declaring this function's name and return type, its `what` parameter and
// most branch bodies are not visible here; the comments cover only what the visible lines show.
// Walks `src` until `count_src` characters are consumed or a NUL is met, resolving entity
// references through sgmlend / strtou32 / sgml2uni.
90 template <
class T_from>
92 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
// `src` may be null only when `count_src` is 0.
95 _Assume_(src || !count_src);
// do_ascii is set when the sgml_full bit is absent from `what`.
98 do_ascii = (what & sgml_full) == 0;
100 for (
size_t i = 0; i < count_src && src[i];) {
// Search for the end of the reference, starting one character past position i.
102 auto end = sgmlend(src + i + 1, count_src - i - 1);
104 const wchar_t* entity_w;
// n = number of characters between src + i + 1 and `end`.
106 size_t n = end - src - i - 1;
// Numeric reference: '#' followed by at least one more character.
107 if (n >= 2 && src[i + 1] ==
'#') {
// "#x" / "#X" selects base 16 with digits starting at src + i + 3; otherwise base 10 from src + i + 2.
109 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
110 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
112 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Values below 0x10000 are stored as a single wchar_t; larger ones go through ucs4_to_surrogate_pair.
114 if (unicode < 0x10000) {
115 chr[0] = (wchar_t)unicode;
119 ucs4_to_surrogate_pair(chr, unicode);
// NOTE(review): this second single-unit store presumably sits in an alternative preprocessor branch - confirm.
123 chr[0] = (wchar_t)unicode;
// Named reference: look the name up in the entity table.
129 entity_w = sgml2uni(src + i + 1, n);
// In do_ascii mode a character that is not 7-bit is handled separately (body elided).
144 if (do_ascii && !is7bit(src[i])) {
// Appends the wide-character rendering of SGML text `src` to `dst`, expanding entity references.
// NOTE(review): the declaration line (name/return type), the `skip` parameter line and many body
// lines are missing from this view; the name sgml2strcat is inferred from the forwarding calls
// elsewhere in the file - confirm.
// dst       - string that receives the appended text
// src       - source characters; may be null only when count_src is 0
// count_src - upper bound on characters read (clamped with strnlen below)
// offset    - base added to the source/destination positions recorded in `map`
// map       - optional list that receives a position pair before and after an entity is appended
164 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>>
166 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
167 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
169 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
170 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
172 _Assume_(src || !count_src);
// Each skip_xxx is true when the matching bit is NOT set in `skip`; with the bit set, the test
// further down rejects an entity whose first decoded character is the one named by the flag.
175 skip_quot = (skip & sgml_quot) == 0,
176 skip_apos = (skip & sgml_apos) == 0,
177 skip_amp = (skip & sgml_amp) == 0,
178 skip_lt_gt = (skip & sgml_lt_gt) == 0,
179 skip_bsol = (skip & sgml_bsol) == 0,
180 skip_dollar = (skip & sgml_dollar) == 0,
181 skip_percnt = (skip & sgml_percnt) == 0,
182 skip_commat = (skip & sgml_commat) == 0,
183 skip_num = (skip & sgml_num) == 0,
184 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
185 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
186 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// Clamp the length with strnlen, then reserve room for one output unit per input character.
188 count_src = strnlen(src, count_src);
189 dst.reserve(dst.size() + count_src);
190 for (
size_t i = 0; i < count_src;) {
// Search for the end of the reference, starting one character past position i.
192 auto end = sgmlend(src + i + 1, count_src - i - 1);
194 const wchar_t* entity_w;
// n = number of characters between src + i + 1 and `end`.
196 size_t n = end - src - i - 1;
// Numeric reference: '#' followed by at least one more character.
197 if (n >= 2 && src[i + 1] ==
'#') {
// "#x" / "#X" selects base 16 with digits starting at src + i + 3; otherwise base 10 from src + i + 2.
199 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
200 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
202 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Values below 0x10000 are stored as a single wchar_t; larger ones go through ucs4_to_surrogate_pair.
204 if (unicode < 0x10000) {
205 chr[0] = (wchar_t)unicode;
209 ucs4_to_surrogate_pair(chr, unicode);
// NOTE(review): this second single-unit store presumably sits in an alternative preprocessor branch - confirm.
213 chr[0] = (wchar_t)unicode;
// Named reference: look the name up in the entity table.
219 entity_w = sgml2uni(src + i + 1, n);
// Accept the decoded entity unless its first character belongs to a class selected in `skip`.
// NOTE(review): the opening of this condition is elided; it appears to guard the append below - confirm.
222 (skip_quot || (entity_w[0] != L
'"')) &&
223 (skip_apos || (entity_w[0] != L
'\'')) &&
224 (skip_amp || (entity_w[0] != L
'&')) &&
225 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
226 (skip_bsol || (entity_w[0] != L
'\\')) &&
227 (skip_dollar || (entity_w[0] != L
'$')) &&
228 (skip_percnt || (entity_w[0] != L
'%')) &&
229 (skip_commat || (entity_w[0] != L
'@')) &&
230 (skip_num || (entity_w[0] != L
'#')) &&
231 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
232 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
233 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record the source/destination positions, append the decoded text, then record the positions again.
235 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
236 dst.append(entity_w);
238 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
// Plain character: copy it through and advance.
243 dst.append(1, src[i++]);
// Convenience overload taking the source as a std::basic_string: forwards its data()/size() to the
// pointer-and-count sgml2strcat together with `skip`, `offset` and `map`.
// NOTE(review): the declaration line and the `skip` parameter line are not in this view.
256 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
258 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
259 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
261 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
262 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
264 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
// Fixed-buffer variant: appends the wide-character rendering of SGML text `src` after the string
// already held in `dst`, throwing std::invalid_argument("buffer overrun") when `dst` is too small.
// NOTE(review): the declaration line (name/return type), the `skip` parameter line and many body
// lines are missing from this view; the name is inferred from the forwarding call elsewhere - confirm.
// dst / count_dst - destination buffer and its capacity in wchar_t units
// src / count_src - source characters and the upper bound on how many are read
// offset          - base added to the source/destination positions recorded in `map`
// map             - optional list that receives a position pair before and after an entity is copied
280 template <
class T_from>
282 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
283 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
285 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
286 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
// Either pointer may be null only when its count is 0.
288 _Assume_(dst || !count_dst);
289 _Assume_(src || !count_src);
// One shared exception object is thrown from every capacity check below.
291 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each skip_xxx is true when the matching bit is NOT set in `skip`; with the bit set, the test
// further down rejects an entity whose first decoded character is the one named by the flag.
293 skip_quot = (skip & sgml_quot) == 0,
294 skip_apos = (skip & sgml_apos) == 0,
295 skip_amp = (skip & sgml_amp) == 0,
296 skip_lt_gt = (skip & sgml_lt_gt) == 0,
297 skip_bsol = (skip & sgml_bsol) == 0,
298 skip_dollar = (skip & sgml_dollar) == 0,
299 skip_percnt = (skip & sgml_percnt) == 0,
300 skip_commat = (skip & sgml_commat) == 0,
301 skip_num = (skip & sgml_num) == 0,
302 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
303 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
304 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// j is the write position: it starts at the length of the text already in dst.
306 size_t j = strnlen(dst, count_dst);
307 count_src = strnlen(src, count_src);
308 for (
size_t i = 0; i < count_src;) {
// Search for the end of the reference, starting one character past position i.
310 auto end = sgmlend(src + i + 1, count_src - i - 1);
312 const wchar_t* entity_w;
// n = number of characters between src + i + 1 and `end`.
314 size_t n = end - src - i - 1;
// Numeric reference: '#' followed by at least one more character.
315 if (n >= 2 && src[i + 1] ==
'#') {
// "#x" / "#X" selects base 16 with digits starting at src + i + 3; otherwise base 10 from src + i + 2.
317 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
318 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
320 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Values below 0x10000 are stored as a single wchar_t; larger ones go through ucs4_to_surrogate_pair.
322 if (unicode < 0x10000) {
323 chr[0] = (wchar_t)unicode;
327 ucs4_to_surrogate_pair(chr, unicode);
// NOTE(review): this second single-unit store presumably sits in an alternative preprocessor branch - confirm.
331 chr[0] = (wchar_t)unicode;
// Named reference: look the name up in the entity table.
337 entity_w = sgml2uni(src + i + 1, n);
// Accept the decoded entity unless its first character belongs to a class selected in `skip`.
// NOTE(review): the opening of this condition is elided; it appears to guard the copy below - confirm.
340 (skip_quot || (entity_w[0] != L
'"')) &&
341 (skip_apos || (entity_w[0] != L
'\'')) &&
342 (skip_amp || (entity_w[0] != L
'&')) &&
343 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
344 (skip_bsol || (entity_w[0] != L
'\\')) &&
345 (skip_dollar || (entity_w[0] != L
'$')) &&
346 (skip_percnt || (entity_w[0] != L
'%')) &&
347 (skip_commat || (entity_w[0] != L
'@')) &&
348 (skip_num || (entity_w[0] != L
'#')) &&
349 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
350 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
351 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record positions, make sure the m decoded units fit with one slot to spare, copy them, record again.
353 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
354 size_t m = wcslen(entity_w);
355 if (j + m >= count_dst)
356 throw buffer_overrun;
357 memcpy(dst + j, entity_w, m *
sizeof(
wchar_t)); j += m;
359 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
// Plain character path: one unit plus one spare slot must fit.
364 if (j + 1 >= count_dst)
365 throw buffer_overrun;
// NOTE(review): the condition guarding this last throw is elided - presumably the terminator check.
369 throw buffer_overrun;
// String-destination overload that ends by delegating to sgml2strcat with the same arguments.
// NOTE(review): the declaration line, the `skip` parameter line and the statements between the
// signature and the call are not in this view; the name sgml2strcpy is inferred from the forwarding
// call elsewhere in the file, and the elided statements presumably reset `dst` first - confirm.
384 template <
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>>
386 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
387 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
389 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
390 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
395 sgml2strcat(dst, src, count_src, skip, offset, map);
// Convenience overload taking the source as a std::basic_string: forwards its data()/size() to the
// pointer-and-count sgml2strcpy together with `skip`, `offset` and `map`.
// NOTE(review): the declaration line and the `skip` parameter line are not in this view.
407 template<
class T_from,
class TR_to = std::
char_traits<
wchar_t>,
class AX_to = std::allocator<
wchar_t>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
409 _Inout_ std::basic_string<wchar_t, TR_to, AX_to>& dst,
410 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
412 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
413 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
415 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
// Fixed-buffer overload that returns whatever the fixed-buffer sgml2strcat returns for the same arguments.
// NOTE(review): the declaration line, the `skip` parameter line and the statements between the
// _Assume_ and the return are not in this view; they presumably reset `dst` before appending - confirm.
431 template <
class T_from>
433 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
434 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
436 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
437 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
// `dst` may be null only when `count_dst` is 0.
439 _Assume_(dst || !count_dst);
444 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
// Converts SGML text given as pointer + count into a std::wstring by calling sgml2strcat on a local `dst`.
// NOTE(review): the `skip` parameter line, the declaration of `dst` and the return statement are not in this view.
458 template <
class T_from>
459 std::wstring sgml2str(
460 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
462 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
463 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
466 sgml2strcat(dst, src, count_src, skip, offset, map);
// Convenience overload taking the source as a std::basic_string: returns the result of the
// pointer-and-count sgml2str called with src.data() / src.size() and the same `skip`, `offset`, `map`.
// NOTE(review): the `skip` parameter line is not in this view.
480 template <
class T_from,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
481 std::wstring sgml2str(
482 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
484 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
485 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
487 return sgml2str(src.data(), src.size(), skip, offset, map);
// Reverse lookup: finds the row of sgml_unicode whose `unicode` string matches `entity` (count
// wide characters) and returns that row's `sgml` name. Rows are reached through the unicode_sgml
// index array rather than directly.
// `entity` must be non-null and `count` non-zero.
// NOTE(review): the branch bodies and the code after the final return are elided, so the
// not-found result cannot be documented from here.
491 inline const char* chr2sgml(_In_reads_or_z_(count)
const wchar_t* entity, _In_
size_t count)
493 _Assume_(entity && count);
// e2 = first character being searched for.
495 const wchar_t e2 = entity[0];
// Bisection over unicode_sgml: i..j is the candidate range and m its midpoint.
// NOTE(review): the statements that narrow i/j are on elided lines - confirm.
496 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
497 size_t m = (i + j) / 2;
// e1 = first character of the candidate row's unicode string.
498 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// Compare the remaining characters of the candidate against the rest of `entity`.
504 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
// Move m back while the preceding index entry also matches (first character and remainder).
510 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
511 return sgml_unicode[unicode_sgml[m]].sgml;
// Appends the SGML rendering of wide string `src` to `dst`: characters that pass the test below are
// copied as-is, others are looked up with chr2sgml or written as "&#x...;" numeric references.
// NOTE(review): the `what` parameter line and many body lines (braces, else branches, the code that
// writes '&' name ';') are not in this view.
// dst       - string that receives the appended text
// src       - source characters; may be null only when count_src is 0
// count_src - upper bound on characters read (clamped with strnlen below)
527 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
528 inline void str2sgmlcat(
529 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
530 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
533 _Assume_(src || !count_src);
// do_ascii is true when sgml_full is NOT set. Every other do_xxx is true when its bit is NOT set
// in `what`; with the bit set, the matching character fails the verbatim-copy test below.
536 do_ascii = (what & sgml_full) == 0,
537 do_quot = (what & sgml_quot) == 0,
538 do_apos = (what & sgml_apos) == 0,
539 do_lt_gt = (what & sgml_lt_gt) == 0,
540 do_bsol = (what & sgml_bsol) == 0,
541 do_dollar = (what & sgml_dollar) == 0,
542 do_percnt = (what & sgml_percnt) == 0,
543 do_commat = (what & sgml_commat) == 0,
544 do_num = (what & sgml_num) == 0,
545 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
546 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
547 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// Clamp the length with strnlen, then reserve room for one output char per input character.
549 count_src = strnlen(src, count_src);
550 dst.reserve(dst.size() + count_src);
551 for (
size_t i = 0; i < count_src;) {
// n = length reported by glyphlen for the text starting at position i.
552 size_t n = glyphlen(src + i, count_src - i);
// Verbatim-copy test: ASCII mode, 7-bit character, and not in any class selected by `what`.
// NOTE(review): the opening of this condition and original line 555 are elided.
554 do_ascii && is7bit(src[i]) &&
556 (do_quot || (src[i] != L
'"')) &&
557 (do_apos || (src[i] != L
'\'')) &&
558 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
559 (do_bsol || (src[i] != L
'\\')) &&
560 (do_dollar || (src[i] != L
'$')) &&
561 (do_percnt || (src[i] != L
'%')) &&
562 (do_commat || (src[i] != L
'@')) &&
563 (do_num || (src[i] != L
'#')) &&
564 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
565 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
566 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
569 dst.append(1,
static_cast<char>(src[i++]));
// Otherwise look up an entity name for the n characters starting at position i.
572 const char* entity = chr2sgml(src + i, n);
582 dst.append(1,
static_cast<char>(src[i++]));
// Numeric fallback buffer: "&#x" (3) + up to 8 hex digits + ';' + terminating NUL.
584 char tmp[3 + 8 + 1 + 1];
585 snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// No entity found for the n characters as a unit: process them one at a time up to `end`.
591 const size_t end = i + n;
593 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
599 else if (is7bit(src[i]))
600 dst.append(1,
static_cast<char>(src[i++]));
// A surrogate pair that lies inside the range is combined into one code point before formatting.
604 if (i + 1 < end && is_surrogate_pair(src + i)) {
605 unicode = surrogate_pair_to_ucs4(src + i);
613 char tmp[3 + 8 + 1 + 1];
614 snprintf(tmp, _countof(tmp),
"&#x%x;", unicode);
// Convenience overload taking the source as a std::basic_string_view<wchar_t>: forwards
// src.data() / src.size() and `what` to the pointer-and-count str2sgmlcat.
// NOTE(review): the declaration line and the `what` parameter line are not in this view.
630 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
632 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
633 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
636 str2sgmlcat(dst, src.data(), src.size(), what);
// Fixed-buffer variant: appends the SGML rendering of wide string `src` after the text already held
// in `dst`. Characters that pass the verbatim test are copied as-is, others are looked up with
// chr2sgml or written as "&#x...;" numeric references. Throws std::invalid_argument("buffer overrun")
// when `dst` cannot hold the result.
// NOTE(review): the `what` parameter line and many body lines (braces, else branches, the writes of
// '&' and ';' around entity names, the final return) are not in this view.
// dst / count_dst - destination buffer and its capacity in chars
// src / count_src - source characters and the upper bound on how many are read
650 inline size_t str2sgmlcat(
651 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
652 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
// Either pointer may be null only when its count is 0.
655 _Assume_(dst || !count_dst);
656 _Assume_(src || !count_src);
// One shared exception object is thrown from every capacity check below.
658 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// do_ascii is true when sgml_full is NOT set. Every other do_xxx is true when its bit is NOT set
// in `what`; with the bit set, the matching character fails the verbatim-copy test below.
660 do_ascii = (what & sgml_full) == 0,
661 do_quot = (what & sgml_quot) == 0,
662 do_apos = (what & sgml_apos) == 0,
663 do_lt_gt = (what & sgml_lt_gt) == 0,
664 do_bsol = (what & sgml_bsol) == 0,
665 do_dollar = (what & sgml_dollar) == 0,
666 do_percnt = (what & sgml_percnt) == 0,
667 do_commat = (what & sgml_commat) == 0,
668 do_num = (what & sgml_num) == 0,
669 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
670 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
671 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// j is the write position: it starts at the length of the text already in dst.
673 size_t j = strnlen(dst, count_dst);
674 count_src = strnlen(src, count_src);
675 for (
size_t i = 0; i < count_src;) {
// n = length reported by glyphlen for the text starting at position i.
676 size_t n = glyphlen(src + i, count_src - i);
// Verbatim-copy test: ASCII mode, 7-bit character, and not in any class selected by `what`.
// NOTE(review): the opening of this condition and original line 679 are elided.
678 do_ascii && is7bit(src[i]) &&
680 (do_quot || (src[i] != L
'"')) &&
681 (do_apos || (src[i] != L
'\'')) &&
682 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
683 (do_bsol || (src[i] != L
'\\')) &&
684 (do_dollar || (src[i] != L
'$')) &&
685 (do_percnt || (src[i] != L
'%')) &&
686 (do_commat || (src[i] != L
'@')) &&
687 (do_num || (src[i] != L
'#')) &&
688 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
689 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
690 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
// One char plus one spare slot must fit.
693 if (j + 1 >= count_dst)
694 throw buffer_overrun;
695 dst[j++] =
static_cast<char>(src[i++]);
// Otherwise look up an entity name for the n characters starting at position i.
698 const char* entity = chr2sgml(src + i, n);
// Name length plus two extra chars (presumably the surrounding '&' and ';') plus one spare slot.
700 size_t m = strlen(entity);
701 if (j + m + 2 >= count_dst)
702 throw buffer_overrun;
704 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
710 if (is7bit(src[i])) {
711 if (j + 1 >= count_dst)
712 throw buffer_overrun;
713 dst[j++] =
static_cast<char>(src[i++]);
// Numeric fallback buffer: "&#x" (3) + up to 8 hex digits + ';' + terminating NUL.
716 char tmp[3 + 8 + 1 + 1];
717 int m = snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// The m formatted chars land at dst + j, so the check must include the j chars already written;
// comparing m alone against count_dst let memcpy run past the buffer once dst was partly filled.
719 if (
j + static_cast<size_t>(m) >= count_dst)
720 throw buffer_overrun;
721 memcpy(dst + j, tmp, m *
sizeof(
char)); j += m;
// No entity found for the n characters as a unit: process them one at a time up to `end`.
726 const size_t end = i + n;
728 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
729 size_t m = strlen(entity);
730 if (j + m + 2 >= count_dst)
731 throw buffer_overrun;
733 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
737 else if (is7bit(src[i])) {
738 if (j + 1 >= count_dst)
739 throw buffer_overrun;
740 dst[j++] =
static_cast<char>(src[i++]);
// A surrogate pair that lies inside the range is combined into one code point before formatting.
745 if (i + 1 < end && is_surrogate_pair(src + i)) {
746 unicode = surrogate_pair_to_ucs4(src + i);
754 char tmp[3 + 8 + 1 + 1];
755 int m = snprintf(tmp, _countof(tmp),
"&#x%x;", unicode);
// Same capacity rule as above: measure from the current write position j.
757 if (
j + static_cast<size_t>(m) >= count_dst)
758 throw buffer_overrun;
759 memcpy(dst + j, tmp, m *
sizeof(
char)); j += m;
// NOTE(review): the condition guarding this last throw is elided - presumably the terminator check.
766 throw buffer_overrun;
// String-destination overload that ends by delegating to str2sgmlcat with the same arguments.
// NOTE(review): the `what` parameter line and the statements between the signature and the call
// are not in this view; they presumably reset `dst` before appending - confirm.
779 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
780 inline void str2sgmlcpy(
781 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
782 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
786 str2sgmlcat(dst, src, count_src, what);
// Convenience overload taking the source as a std::basic_string_view<wchar_t>: forwards
// src.data() / src.size() and `what` to the pointer-and-count str2sgmlcpy.
// NOTE(review): the declaration line and the `what` parameter line are not in this view.
796 template <
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
798 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
799 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
802 str2sgmlcpy(dst, src.data(), src.size(), what);
// Fixed-buffer overload that returns whatever the fixed-buffer str2sgmlcat returns for the same arguments.
// NOTE(review): the `what` parameter line and the statements between the _Assume_ and the return
// are not in this view; they presumably reset `dst` before appending - confirm.
816 inline size_t str2sgmlcpy(
817 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
818 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
// `dst` may be null only when `count_dst` is 0.
821 _Assume_(dst || !count_dst);
824 return str2sgmlcat(dst, count_dst, src, count_src, what);
// Converts a wide string given as pointer + count into a std::string by calling str2sgmlcat on a local `dst`.
// NOTE(review): the `what` parameter line, the declaration of `dst` and the return statement are not in this view.
836 inline std::string str2sgml(
837 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
841 str2sgmlcat(dst, src, count_src, what);
// Convenience overload taking the source as a std::basic_string_view<wchar_t>: returns the result
// of the pointer-and-count str2sgml called with src.data() / src.size() and the same `what`.
// NOTE(review): the `what` parameter line is not in this view.
853 inline std::string str2sgml(
854 _In_
const std::basic_string_view<
wchar_t, std::char_traits<wchar_t>> src,
857 return str2sgml(src.data(), src.size(), what);