10#include "sgml_unicode.hpp"
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
// Looks up an SGML entity name and returns a pointer to its Unicode (UTF-32) text.
//   entity, count - entity text and its length; must be non-null and non-empty.
//   buf           - two-element scratch buffer; buf[0] receives the parsed code point
//                   of a numeric entity.
// NOTE(review): several lines of this definition are not visible in this excerpt;
// the comments below describe only the statements that are shown.
26 const utf32_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count, utf32_t buf[2])
28 _Assume_(entity && count);
// Anything that is not "#..." is treated as a named entity and searched in the
// sgml_unicode table by repeatedly probing the midpoint of [i, j).
30 if (count < 2 || entity[0] !=
'#') {
31 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
// First characters are compared directly; the remainder is compared with strncmp.
33 if (sgml_unicode[m].sgml[0] < entity[0])
35 else if (sgml_unicode[m].sgml[0] > entity[0])
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
// Walk m back over preceding table entries that also compare equal to the entity,
// so the earliest equal entry is the one returned.
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return reinterpret_cast<const utf32_t*
>(sgml_unicode[m].unicode);
// Numeric entity (presumably reached only when entity[0] == '#'): an 'x'/'X' after
// the '#' selects base 16 and skips two characters, otherwise base 10 skipping one.
52 buf[0] = entity[1] ==
'x' || entity[1] ==
'X' ?
53 static_cast<utf32_t
>(strtou32(&entity[2], count - 2,
nullptr, 16)) :
54 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
// Converts the UTF-32 string `str` into UTF-16 code units stored in `buf`.
// `i` indexes the source, `j` the destination.
// NOTE(review): the conditions choosing between the two stores below, the loop exit
// and the return statement are not visible in this excerpt.
59 inline const utf16_t* utf32_to_wstr(_In_opt_z_
const utf32_t* str, utf16_t* buf)
63 for (
size_t i = 0, j = 0;; ++i) {
// Single code unit: the source value is narrowed to utf16_t as-is.
69 buf[j++] =
static_cast<utf16_t
>(str[i]);
// Otherwise the code point is written as a surrogate pair starting at buf[j].
71 ucs4_to_surrogate_pair(&buf[j], str[i]);
// Overload of utf32_to_wstr for a UTF-32 destination buffer (source and destination
// use the same character type). Only the signature is visible in this excerpt.
77 inline const utf32_t* utf32_to_wstr(_In_opt_z_
const utf32_t* str, utf32_t* buf)
// Scans at most `count` characters of `str` for a character that terminates an
// entity candidate.
// NOTE(review): the function-name line is not visible here; presumably this is the
// `sgmlend` helper called by the conversion routines - confirm.
85 _In_reads_or_z_opt_(count)
const T* str, _In_
size_t count)
// A null `str` is only acceptable together with a zero count.
87 _Assume_(str || !count);
88 for (
size_t i = 0; i < count; i++) {
// A zero terminator, another '&' or any whitespace is tested here; the action taken
// on a match is not visible in this excerpt.
91 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
// Bit flags passed as the `skip` / `what` arguments of the conversion routines.
// Each single-bit flag selects one character (or character pair); the routines test
// the bits with `(flags & sgml_xxx) == 0`.
// Tested as `(what & sgml_full) == 0` to derive the `do_ascii` switch.
98 constexpr int sgml_full = 0x40000000;
// '"'
99 constexpr int sgml_quot = 0x00000001;
// '\''
100 constexpr int sgml_apos = 0x00000002;
// Both quote characters.
101 constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
// '&'
102 constexpr int sgml_amp = 0x00000004;
// '<' and '>'
103 constexpr int sgml_lt_gt = 0x00000008;
// '\\'
104 constexpr int sgml_bsol = 0x00000010;
// '$'
105 constexpr int sgml_dollar = 0x00000020;
// '%'
106 constexpr int sgml_percnt = 0x00000040;
// '@'
107 constexpr int sgml_commat = 0x00000080;
// '#'
108 constexpr int sgml_num = 0x00000100;
// '(' and ')'
109 constexpr int sgml_lpar_rpar = 0x00000200;
// '{' and '}'
110 constexpr int sgml_lcub_rcub = 0x00000400;
// '[' and ']'
111 constexpr int sgml_lsqb_rsqb = 0x00000800;
// Presets combining the flags above (presumably for SGML text, markup attribute
// values and C-style string contexts respectively - confirm intended use).
112 constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
113 constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
114 constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
// Walks an SGML string and inspects each entity / character.
// NOTE(review): the function-name line, return type, the `what` parameter declaration
// and the branch bodies are not visible in this excerpt; presumably this is a
// validation routine - confirm.
126 template <
class T_from>
128 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
131 _Assume_(src || !count_src);
// do_ascii is true when sgml_full is NOT set in `what`.
134 do_ascii = (what & sgml_full) == 0;
// The loop stops at count_src or at the first zero character, whichever comes first.
136 for (
size_t i = 0; i < count_src && src[i];) {
// Locate the end of the entity candidate that starts right after src[i].
138 auto end = sgmlend(&src[i + 1], count_src - i - 1);
// n = number of characters between src[i] and `end`, i.e. the entity-name length.
141 size_t n = end - src - i - 1;
142 auto entity_w = sgml2uni(&src[i + 1], n, chr);
// With do_ascii in effect, a character outside the 7-bit range is singled out here.
156 if (do_ascii && !is7bit(src[i])) {
// Decodes SGML entities found in `src` and appends the resulting text to `dst`.
//   dst            - destination string; output is appended to it.
//   src, count_src - source text; conversion stops at count_src or at the first zero
//                    character (see the strnlen call below).
//   skip           - sgml_* bit mask; an entity whose decoded first character is
//                    selected by a set bit is excluded by the condition below.
//   offset         - base values added to the source/destination positions recorded
//                    in `map`.
//   map            - optional vector receiving source-to-destination position pairs.
// NOTE(review): the function-name line, the `skip` parameter declaration and several
// control-flow lines are not visible in this excerpt.
176 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
178 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
179 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
181 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
182 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
184 _Assume_(src || !count_src);
// Despite the names, each skip_* local is true when the corresponding bit is NOT set
// in `skip`; it then short-circuits the matching character test further down.
187 skip_quot = (skip & sgml_quot) == 0,
188 skip_apos = (skip & sgml_apos) == 0,
189 skip_amp = (skip & sgml_amp) == 0,
190 skip_lt_gt = (skip & sgml_lt_gt) == 0,
191 skip_bsol = (skip & sgml_bsol) == 0,
192 skip_dollar = (skip & sgml_dollar) == 0,
193 skip_percnt = (skip & sgml_percnt) == 0,
194 skip_commat = (skip & sgml_commat) == 0,
195 skip_num = (skip & sgml_num) == 0,
196 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
197 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
198 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// Clamp the count to the zero terminator and pre-allocate room for that many characters.
200 count_src = strnlen(src, count_src);
201 dst.reserve(dst.size() + count_src);
202 for (
size_t i = 0; i < count_src;) {
// Find where the entity candidate following src[i] ends.
204 auto end = sgmlend(&src[i + 1], count_src - i - 1);
207 _Assume_(&src[i + 1] <= end);
// n = length of the entity name between src[i] and `end`.
208 size_t n =
static_cast<size_t>(end - src) - i - 1;
// Resolve the name to UTF-32 and convert it to the destination character type.
210 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
// Every line below passes when its flag is unset or the decoded first character is
// not the one that flag selects.
212 (skip_quot || (entity_w[0] !=
'"')) &&
213 (skip_apos || (entity_w[0] !=
'\'')) &&
214 (skip_amp || (entity_w[0] !=
'&')) &&
215 (skip_lt_gt || (entity_w[0] !=
'<' && entity_w[0] !=
'>')) &&
216 (skip_bsol || (entity_w[0] !=
'\\')) &&
217 (skip_dollar || (entity_w[0] !=
'$')) &&
218 (skip_percnt || (entity_w[0] !=
'%')) &&
219 (skip_commat || (entity_w[0] !=
'@')) &&
220 (skip_num || (entity_w[0] !=
'#')) &&
221 (skip_lpar_rpar || (entity_w[0] !=
'(' && entity_w[0] !=
')')) &&
222 (skip_lcub_rcub || (entity_w[0] !=
'{' && entity_w[0] !=
'}')) &&
223 (skip_lsqb_rsqb || (entity_w[0] !=
'[' && entity_w[0] !=
']')))
// Record the position pair before the replacement, append the decoded text, move the
// source index just past `end`, then record the position pair after the replacement.
225 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
226 dst.append(entity_w);
227 _Assume_(src <= end);
228 i =
static_cast<size_t>(end - src) + 1;
229 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
// Copy path: one source character is appended unchanged.
234 dst.append(1, src[i++]);
// Convenience form taking the source as a std::basic_string: forwards src.data() and
// src.size() together with `skip`, `offset` and `map` to the pointer-and-count
// sgml2strcat.
// NOTE(review): the function-name line and the `skip` parameter declaration are not
// visible in this excerpt.
247 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
249 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
250 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
252 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
253 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
255 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
// Fixed-buffer variant: decodes SGML entities from `src` and appends the result to the
// zero-terminated text already held in `dst`.
//   dst, count_dst - destination buffer and its capacity in characters.
//   src, count_src - source text and its character limit.
//   skip           - sgml_* bit mask; see the condition below.
//   offset, map    - optional source-to-destination position recording.
// Throws std::invalid_argument("buffer overrun") when the output would not fit.
// NOTE(review): the function-name line, return type, the `skip` parameter declaration
// and several control-flow lines are not visible in this excerpt.
271 template <
class T_to =
wchar_t,
class T_from>
273 _Inout_cap_(count_dst) T_to* dst, _In_
size_t count_dst,
274 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
276 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
277 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
279 _Assume_(dst || !count_dst);
280 _Assume_(src || !count_src);
// One shared exception object, constructed on first use and thrown by copy below.
282 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each skip_* local is true when the corresponding bit is NOT set in `skip`.
284 skip_quot = (skip & sgml_quot) == 0,
285 skip_apos = (skip & sgml_apos) == 0,
286 skip_amp = (skip & sgml_amp) == 0,
287 skip_lt_gt = (skip & sgml_lt_gt) == 0,
288 skip_bsol = (skip & sgml_bsol) == 0,
289 skip_dollar = (skip & sgml_dollar) == 0,
290 skip_percnt = (skip & sgml_percnt) == 0,
291 skip_commat = (skip & sgml_commat) == 0,
292 skip_num = (skip & sgml_num) == 0,
293 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
294 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
295 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// j starts at the end of the text already present in dst; count_src is clamped to
// the source's zero terminator.
297 size_t j = strnlen(dst, count_dst);
298 count_src = strnlen(src, count_src);
299 for (
size_t i = 0; i < count_src;) {
// Find where the entity candidate following src[i] ends; n is the entity-name length.
301 auto end = sgmlend(&src[i + 1], count_src - i - 1);
305 size_t n = end - src - i - 1;
306 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
// Every line below passes when its flag is unset or the decoded first character is
// not the one that flag selects.
308 (skip_quot || (entity_w[0] !=
'"')) &&
309 (skip_apos || (entity_w[0] !=
'\'')) &&
310 (skip_amp || (entity_w[0] !=
'&')) &&
311 (skip_lt_gt || (entity_w[0] !=
'<' && entity_w[0] !=
'>')) &&
312 (skip_bsol || (entity_w[0] !=
'\\')) &&
313 (skip_dollar || (entity_w[0] !=
'$')) &&
314 (skip_percnt || (entity_w[0] !=
'%')) &&
315 (skip_commat || (entity_w[0] !=
'@')) &&
316 (skip_num || (entity_w[0] !=
'#')) &&
317 (skip_lpar_rpar || (entity_w[0] !=
'(' && entity_w[0] !=
')')) &&
318 (skip_lcub_rcub || (entity_w[0] !=
'{' && entity_w[0] !=
'}')) &&
319 (skip_lsqb_rsqb || (entity_w[0] !=
'[' && entity_w[0] !=
']')))
321 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
// `>=` rather than `>` keeps at least one element of dst free beyond the copied text
// (presumably for the zero terminator - the terminating store is not visible here).
322 size_t m = strlen(entity_w);
323 if (j + m >= count_dst)
324 throw buffer_overrun;
325 memcpy(dst + j, entity_w, m *
sizeof(*entity_w)); j += m;
327 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
// Same capacity rule for the single-character path.
332 if (j + 1 >= count_dst)
333 throw buffer_overrun;
337 throw buffer_overrun;
// Decodes `src` into the string `dst` by delegating to sgml2strcat with the same
// arguments.
// NOTE(review): the function-name line, the `skip` parameter declaration and any
// statement preceding the call (such as emptying `dst` first) are not visible in this
// excerpt; presumably this is sgml2strcpy - confirm.
352 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
354 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
355 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
357 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
358 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
363 sgml2strcat(dst, src, count_src, skip, offset, map);
// Convenience form taking the source as a std::basic_string: forwards src.data() and
// src.size() together with `skip`, `offset` and `map` to the pointer-and-count
// sgml2strcpy.
// NOTE(review): the function-name line and the `skip` parameter declaration are not
// visible in this excerpt.
375 template<
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
377 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
378 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
380 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
381 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
383 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
// Fixed-buffer form: decodes `src` into `dst` (capacity count_dst) and returns whatever
// the fixed-buffer sgml2strcat returns.
// NOTE(review): the function-name line, return type, the `skip` parameter declaration
// and the statements between the assumption and the return (presumably resetting
// `dst` to an empty string) are not visible in this excerpt.
399 template <
class T_to =
wchar_t,
class T_from>
401 _Inout_cap_(count_dst) T_to* dst, _In_
size_t count_dst,
402 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
404 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
405 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
// A null `dst` is only acceptable together with a zero capacity.
407 _Assume_(dst || !count_dst);
412 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
// Decodes SGML entities in `src` and returns the result as a new std::basic_string.
//   src, count_src - source text and its character limit.
//   offset, map    - optional source-to-destination position recording, passed through
//                    to sgml2strcat.
// NOTE(review): the `skip` parameter declaration and the return statement are not
// visible in this excerpt.
426 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
427 std::basic_string<T_to, TR_to, AX_to> sgml2str(
428 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
430 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
431 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
// Start from an empty string and let the appending routine do the conversion.
433 std::basic_string<T_to, TR_to, AX_to> dst;
434 sgml2strcat(dst, src, count_src, skip, offset, map);
// Overload of sgml2str taking the source as a std::basic_string; forwards src.data()
// and src.size() to the pointer-and-count overload.
// NOTE(review): the `skip` parameter declaration is not visible in this excerpt.
448 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
449 std::basic_string<T_to, TR_to, AX_to> sgml2str(
450 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
452 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
453 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
// Template arguments are spelled out so the result type matches this overload's.
455 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
// Reverse lookup: finds the SGML entity name for the UTF-16 text `entity` of length
// `count`. The search runs over `unicode_sgml`, whose elements are used as indexes
// into the `sgml_unicode` table.
// NOTE(review): the declarations of `e2` and `offset`, the comparison branches and the
// not-found result are not visible in this excerpt.
459 inline const char* chr2sgml(_In_reads_or_z_(count)
const utf16_t* entity, _In_
size_t count)
461 _Assume_(entity && count);
// e2 = first code point: a lone code unit is widened, a surrogate pair is combined.
465 if (count < 2 || !is_surrogate_pair(entity)) {
466 e2 =
static_cast<utf32_t
>(entity[0]);
470 e2 = surrogate_pair_to_ucs4(entity);
// Midpoint probing over [i, j); e1 is the first code point of the probed table entry.
473 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
474 size_t m = (i + j) / 2;
475 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// Compare the remaining code points of the table entry with the rest of the input.
481 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
// Walk m back over preceding entries that match as well, so the earliest one wins.
487 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
488 return sgml_unicode[unicode_sgml[m]].sgml;
// Reverse lookup for UTF-32 input: finds the SGML entity name for the text `entity`
// of length `count`, searching `unicode_sgml` (indexes into `sgml_unicode`).
// NOTE(review): the comparison branches and the not-found result are not visible in
// this excerpt.
495 inline const char* chr2sgml(_In_reads_or_z_(count)
const utf32_t* entity, _In_
size_t count)
497 _Assume_(entity && count);
// e2 = first code point of the input; e1 = first code point of the probed entry.
499 utf32_t e2 = entity[0];
500 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
501 size_t m = (i + j) / 2;
502 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// Compare the code points after the first one.
508 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
// Walk m back over preceding entries that match as well, so the earliest one wins.
514 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
515 return sgml_unicode[unicode_sgml[m]].sgml;
// Reads one code point from the UTF-16 text `src` at index `i`; `end` is the index
// limit and `i` is passed by reference.
// NOTE(review): the return statements and the updates of `i` are not visible in this
// excerpt.
522 inline utf32_t wstr_to_utf32(_In_reads_(end)
const utf16_t* src, _Inout_
size_t& i, _In_
size_t end)
// Not enough room for a pair, or no surrogate pair at src + i: single-unit case.
525 if (i + 1 >= end || !is_surrogate_pair(src + i))
// Otherwise the pair at src + i is combined into one code point.
528 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
// Overload of wstr_to_utf32 for UTF-32 input. Only the signature is visible in this
// excerpt.
533 inline utf32_t wstr_to_utf32(_In_reads_(end)
const utf32_t* src, _Inout_
size_t& i, _In_
size_t end)
// Encodes the text `src` as SGML and appends the result to the narrow string `dst`.
// Characters are either copied as-is, replaced by a named entity found by chr2sgml, or
// written as a hexadecimal numeric entity "&#x...;".
//   dst            - destination string; output is appended to it.
//   src, count_src - source text and its character limit.
//   what           - sgml_* bit mask (parameter declaration not visible here).
// NOTE(review): the function-name line and several control-flow lines are not visible
// in this excerpt.
549 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
551 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
552 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
555 _Assume_(src || !count_src);
// Each do_* local is true when the corresponding bit is NOT set in `what`.
558 do_ascii = (what & sgml_full) == 0,
559 do_quot = (what & sgml_quot) == 0,
560 do_apos = (what & sgml_apos) == 0,
561 do_lt_gt = (what & sgml_lt_gt) == 0,
562 do_bsol = (what & sgml_bsol) == 0,
563 do_dollar = (what & sgml_dollar) == 0,
564 do_percnt = (what & sgml_percnt) == 0,
565 do_commat = (what & sgml_commat) == 0,
566 do_num = (what & sgml_num) == 0,
567 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
568 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
569 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// Clamp the count to the zero terminator and pre-allocate at least that many characters.
571 count_src = strnlen(src, count_src);
572 dst.reserve(dst.size() + count_src);
573 for (
size_t i = 0; i < count_src;) {
// n = number of source characters forming the glyph that starts at src[i].
574 size_t n = glyphlen(src + i, count_src - i);
// Pass-through test: 7-bit character that no set flag in `what` selects.
576 do_ascii && is7bit(src[i]) &&
578 (do_quot || (src[i] !=
'"')) &&
579 (do_apos || (src[i] !=
'\'')) &&
580 (do_lt_gt || (src[i] !=
'<' && src[i] !=
'>')) &&
581 (do_bsol || (src[i] !=
'\\')) &&
582 (do_dollar || (src[i] !=
'$')) &&
583 (do_percnt || (src[i] !=
'%')) &&
584 (do_commat || (src[i] !=
'@')) &&
585 (do_num || (src[i] !=
'#')) &&
586 (do_lpar_rpar || (src[i] !=
'(' && src[i] !=
')')) &&
587 (do_lcub_rcub || (src[i] !=
'{' && src[i] !=
'}')) &&
588 (do_lsqb_rsqb || (src[i] !=
'[' && src[i] !=
']')))
591 dst.append(1,
static_cast<char>(src[i++]));
// Try to find a named entity for the whole glyph.
594 const char* entity = chr2sgml(src + i, n);
604 dst.append(1,
static_cast<char>(src[i++]));
// Buffer sized for "&#x" (3) + up to 8 hex digits + ';' (1) + terminator (1).
606 char tmp[3 + 8 + 1 + 1];
607 snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(src[i++]));
// Fallback for a glyph without an entity of its own: handle its characters one at a
// time up to `end`.
613 const size_t end = i + n;
615 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
621 else if (is7bit(src[i]))
622 dst.append(1,
static_cast<char>(src[i++]));
624 char tmp[3 + 8 + 1 + 1];
// wstr_to_utf32 takes `i` by reference while reading the code point.
625 snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
// Convenience form taking the source as a std::basic_string_view: forwards src.data()
// and src.size() together with `what` to the pointer-and-count str2sgmlcat.
// NOTE(review): the function-name line and the `what` parameter declaration are not
// visible in this excerpt.
641 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
643 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
644 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
647 str2sgmlcat(dst, src.data(), src.size(), what);
// Fixed-buffer variant: encodes the text `src` as SGML and appends the result to the
// zero-terminated text already held in `dst`.
//   dst, count_dst - destination buffer and its capacity in characters.
//   src, count_src - source text and its character limit.
//   what           - sgml_* bit mask (parameter declaration not visible here).
// Throws std::invalid_argument("buffer overrun") when the output would not fit.
// NOTE(review): the function-name line, return type and several control-flow lines are
// not visible in this excerpt.
661 template <
class T_from =
wchar_t>
663 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
664 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
667 _Assume_(dst || !count_dst);
668 _Assume_(src || !count_src);
// One shared exception object, constructed on first use and thrown by copy below.
670 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each do_* local is true when the corresponding bit is NOT set in `what`.
672 do_ascii = (what & sgml_full) == 0,
673 do_quot = (what & sgml_quot) == 0,
674 do_apos = (what & sgml_apos) == 0,
675 do_lt_gt = (what & sgml_lt_gt) == 0,
676 do_bsol = (what & sgml_bsol) == 0,
677 do_dollar = (what & sgml_dollar) == 0,
678 do_percnt = (what & sgml_percnt) == 0,
679 do_commat = (what & sgml_commat) == 0,
680 do_num = (what & sgml_num) == 0,
681 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
682 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
683 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// j starts at the end of the text already present in dst; count_src is clamped to
// the source's zero terminator.
685 size_t j = strnlen(dst, count_dst);
686 count_src = strnlen(src, count_src);
687 for (
size_t i = 0; i < count_src;) {
// n = number of source characters forming the glyph that starts at src[i].
688 size_t n = glyphlen(src + i, count_src - i);
// Pass-through test: 7-bit character that no set flag in `what` selects.
690 do_ascii && is7bit(src[i]) &&
692 (do_quot || (src[i] !=
'"')) &&
693 (do_apos || (src[i] !=
'\'')) &&
694 (do_lt_gt || (src[i] !=
'<' && src[i] !=
'>')) &&
695 (do_bsol || (src[i] !=
'\\')) &&
696 (do_dollar || (src[i] !=
'$')) &&
697 (do_percnt || (src[i] !=
'%')) &&
698 (do_commat || (src[i] !=
'@')) &&
699 (do_num || (src[i] !=
'#')) &&
700 (do_lpar_rpar || (src[i] !=
'(' && src[i] !=
')')) &&
701 (do_lcub_rcub || (src[i] !=
'{' && src[i] !=
'}')) &&
702 (do_lsqb_rsqb || (src[i] !=
'[' && src[i] !=
']')))
// `>=` keeps one element of dst free beyond the written text.
705 if (j + 1 >= count_dst)
706 throw buffer_overrun;
707 dst[j++] =
static_cast<char>(src[i++]);
// Try to find a named entity for the whole glyph.
710 const char* entity = chr2sgml(src + i, n);
// The extra 2 presumably accounts for the '&' and ';' written around the name (those
// stores are not visible in this excerpt).
712 size_t m = strlen(entity);
713 if (j + m + 2 >= count_dst)
714 throw buffer_overrun;
716 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
722 if (is7bit(src[i])) {
723 if (j + 1 >= count_dst)
724 throw buffer_overrun;
725 dst[j++] =
static_cast<char>(src[i++]);
// Buffer sized for "&#x" (3) + up to 8 hex digits + ';' (1) + terminator (1).
728 char tmp[3 + 8 + 1 + 1];
729 int m = snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(src[i++]));
// NOTE(review): this check compares only m against count_dst and ignores the current
// write position j, unlike every other bounds check in this function (j + 1, j + m + 2).
// Looks like it should be `j + m >= count_dst`; as written the memcpy below can write
// past the end of dst when j > 0 - confirm and fix.
731 if (
static_cast<size_t>(m) >= count_dst)
732 throw buffer_overrun;
733 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
734 j +=
static_cast<size_t>(m);
// Fallback for a glyph without an entity of its own: handle its characters one at a
// time up to `end`.
739 const size_t end = i + n;
741 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
742 size_t m = strlen(entity);
743 if (j + m + 2 >= count_dst)
744 throw buffer_overrun;
746 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
750 else if (is7bit(src[i])) {
751 if (j + 1 >= count_dst)
752 throw buffer_overrun;
753 dst[j++] =
static_cast<char>(src[i++]);
756 char tmp[3 + 8 + 1 + 1];
// wstr_to_utf32 takes `i` by reference while reading the code point.
757 int m = snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
// NOTE(review): same concern as the earlier numeric-entity check - j is not part of
// the comparison; presumably `j + m >= count_dst` was intended - confirm.
759 if (
static_cast<size_t>(m) >= count_dst)
760 throw buffer_overrun;
761 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
762 j +=
static_cast<size_t>(m);
769 throw buffer_overrun;
// Encodes `src` as SGML into the string `dst` by delegating to str2sgmlcat with the
// same arguments.
// NOTE(review): the function-name line, the `what` parameter declaration and any
// statement preceding the call (such as emptying `dst` first) are not visible in this
// excerpt; presumably this is str2sgmlcpy - confirm.
782 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
784 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
785 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
789 str2sgmlcat(dst, src, count_src, what);
// Convenience form taking the source as a std::basic_string_view: forwards src.data()
// and src.size() together with `what` to the pointer-and-count str2sgmlcpy.
// NOTE(review): the function-name line and the `what` parameter declaration are not
// visible in this excerpt.
799 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
801 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
802 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
805 str2sgmlcpy(dst, src.data(), src.size(), what);
// Fixed-buffer form: encodes `src` as SGML into `dst` (capacity count_dst) and returns
// whatever the fixed-buffer str2sgmlcat returns.
// NOTE(review): the function-name line, return type, the `what` parameter declaration
// and the statements between the assumption and the return (presumably resetting
// `dst` to an empty string) are not visible in this excerpt.
819 template <
class T_from =
wchar_t>
821 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
822 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
// A null `dst` is only acceptable together with a zero capacity.
825 _Assume_(dst || !count_dst);
828 return str2sgmlcat(dst, count_dst, src, count_src, what);
// Encodes the text `src` (at most count_src characters) as SGML and returns it as a
// std::string; the conversion itself is done by str2sgmlcat.
// NOTE(review): the `what` parameter declaration, the declaration of `dst` and the
// return statement are not visible in this excerpt.
840 template <
class T_from =
wchar_t>
841 std::string str2sgml(
842 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
846 str2sgmlcat(dst, src, count_src, what);
// Overload of str2sgml taking the source as a std::basic_string_view; forwards
// src.data() and src.size() together with `what` to the pointer-and-count overload.
// NOTE(review): the `what` parameter declaration is not visible in this excerpt.
858 template <
class T_from =
wchar_t>
859 std::string str2sgml(
860 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
863 return str2sgml(src.data(), src.size(), what);
868#pragma GCC diagnostic pop