10#include "sgml_unicode.hpp"
18#pragma GCC diagnostic push
19#pragma GCC diagnostic ignored "-Wexit-time-destructors"
26 const utf32_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count, utf32_t buf[2])
28 _Assume_(entity && count);
30 if (count < 2 || entity[0] !=
'#') {
31 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
32 size_t m = (i + j) / 2;
33 if (sgml_unicode[m].sgml[0] < entity[0])
35 else if (sgml_unicode[m].sgml[0] > entity[0])
38 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
44 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
45 return reinterpret_cast<const utf32_t*
>(sgml_unicode[m].unicode);
52 buf[0] = entity[1] ==
'x' || entity[1] ==
'X' ?
53 static_cast<utf32_t
>(strtou32(&entity[2], count - 2,
nullptr, 16)) :
54 static_cast<utf32_t>(strtou32(&entity[1], count - 1, nullptr, 10));
59 inline const utf16_t* utf32_to_wstr(_In_opt_z_
const utf32_t* str, utf16_t* buf)
63 for (
size_t i = 0, j = 0;; ++i) {
69 buf[j++] =
static_cast<utf16_t
>(str[i]);
71 ucs4_to_surrogate_pair(&buf[j], str[i]);
77 inline const utf32_t* utf32_to_wstr(_In_opt_z_
const utf32_t* str, utf32_t* buf)
85 _In_reads_or_z_opt_(count)
const T* str, _In_
size_t count)
87 _Assume_(str || !count);
88 for (
size_t i = 0; i < count; i++) {
91 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
// Bit flags selecting character classes for the SGML conversion routines in
// this file. Flags are OR-ed together and passed as the `skip` / `what`
// argument; the routines test each one with `(arg & flag) == 0`.
//
// Declared `inline constexpr` so that every translation unit including this
// header refers to a single definition of each constant.

// When set, the str-to-SGML routines clear their `do_ascii` shortcut, i.e.
// 7-bit characters are no longer copied through unconditionally.
inline constexpr int sgml_full = 0x40000000;

// Single-class flags. Each one selects the character(s) named in its comment.
inline constexpr int sgml_quot = 0x00000001;      // "
inline constexpr int sgml_apos = 0x00000002;      // '
inline constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
inline constexpr int sgml_amp = 0x00000004;       // &
inline constexpr int sgml_lt_gt = 0x00000008;     // < and >
inline constexpr int sgml_bsol = 0x00000010;      // backslash
inline constexpr int sgml_dollar = 0x00000020;    // $
inline constexpr int sgml_percnt = 0x00000040;    // %
inline constexpr int sgml_commat = 0x00000080;    // @
inline constexpr int sgml_num = 0x00000100;       // #
inline constexpr int sgml_lpar_rpar = 0x00000200; // ( and )
inline constexpr int sgml_lcub_rcub = 0x00000400; // { and }
inline constexpr int sgml_lsqb_rsqb = 0x00000800; // [ and ]

// Ready-made combinations of the flags above.
inline constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;                // & < >
inline constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;       // & " '
inline constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;   // & \ " '
126 template <
class T_from>
128 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
131 _Assume_(src || !count_src);
134 do_ascii = (what & sgml_full) == 0;
136 for (
size_t i = 0; i < count_src && src[i];) {
138 auto end = sgmlend(&src[i + 1], count_src - i - 1);
141 size_t n = end - src - i - 1;
142 auto entity_w = sgml2uni(&src[i + 1], n, chr);
156 if (do_ascii && !is7bit(src[i])) {
176 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
178 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
179 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
181 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
182 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
184 _Assume_(src || !count_src);
187 skip_quot = (skip & sgml_quot) == 0,
188 skip_apos = (skip & sgml_apos) == 0,
189 skip_amp = (skip & sgml_amp) == 0,
190 skip_lt_gt = (skip & sgml_lt_gt) == 0,
191 skip_bsol = (skip & sgml_bsol) == 0,
192 skip_dollar = (skip & sgml_dollar) == 0,
193 skip_percnt = (skip & sgml_percnt) == 0,
194 skip_commat = (skip & sgml_commat) == 0,
195 skip_num = (skip & sgml_num) == 0,
196 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
197 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
198 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
200 count_src = strnlen(src, count_src);
201 dst.reserve(dst.size() + count_src);
202 for (
size_t i = 0; i < count_src;) {
204 auto end = sgmlend(&src[i + 1], count_src - i - 1);
207 _Assume_(&src[i + 1] <= end);
208 size_t n =
static_cast<size_t>(end - src) - i - 1;
210 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
212 (skip_quot || (entity_w[0] !=
'"')) &&
213 (skip_apos || (entity_w[0] !=
'\'')) &&
214 (skip_amp || (entity_w[0] !=
'&')) &&
215 (skip_lt_gt || (entity_w[0] !=
'<' && entity_w[0] !=
'>')) &&
216 (skip_bsol || (entity_w[0] !=
'\\')) &&
217 (skip_dollar || (entity_w[0] !=
'$')) &&
218 (skip_percnt || (entity_w[0] !=
'%')) &&
219 (skip_commat || (entity_w[0] !=
'@')) &&
220 (skip_num || (entity_w[0] !=
'#')) &&
221 (skip_lpar_rpar || (entity_w[0] !=
'(' && entity_w[0] !=
')')) &&
222 (skip_lcub_rcub || (entity_w[0] !=
'{' && entity_w[0] !=
'}')) &&
223 (skip_lsqb_rsqb || (entity_w[0] !=
'[' && entity_w[0] !=
']')))
225 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
226 dst.append(entity_w);
227 _Assume_(src <= end);
228 i =
static_cast<size_t>(end - src) + 1;
229 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
234 dst.append(1, src[i++]);
247 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
249 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
250 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
252 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
253 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
255 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
271 template <
class T_to =
wchar_t,
class T_from>
273 _Inout_cap_(count_dst) T_to* dst, _In_
size_t count_dst,
274 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
276 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
277 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
279 _Assume_(dst || !count_dst);
280 _Assume_(src || !count_src);
282 static const std::invalid_argument buffer_overrun(
"buffer overrun");
284 skip_quot = (skip & sgml_quot) == 0,
285 skip_apos = (skip & sgml_apos) == 0,
286 skip_amp = (skip & sgml_amp) == 0,
287 skip_lt_gt = (skip & sgml_lt_gt) == 0,
288 skip_bsol = (skip & sgml_bsol) == 0,
289 skip_dollar = (skip & sgml_dollar) == 0,
290 skip_percnt = (skip & sgml_percnt) == 0,
291 skip_commat = (skip & sgml_commat) == 0,
292 skip_num = (skip & sgml_num) == 0,
293 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
294 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
295 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
297 size_t j = strnlen(dst, count_dst);
298 count_src = strnlen(src, count_src);
299 for (
size_t i = 0; i < count_src;) {
301 auto end = sgmlend(&src[i + 1], count_src - i - 1);
305 size_t n = end - src - i - 1;
306 auto entity_w = utf32_to_wstr(sgml2uni(&src[i + 1], n, chr32), chr);
308 (skip_quot || (entity_w[0] !=
'"')) &&
309 (skip_apos || (entity_w[0] !=
'\'')) &&
310 (skip_amp || (entity_w[0] !=
'&')) &&
311 (skip_lt_gt || (entity_w[0] !=
'<' && entity_w[0] !=
'>')) &&
312 (skip_bsol || (entity_w[0] !=
'\\')) &&
313 (skip_dollar || (entity_w[0] !=
'$')) &&
314 (skip_percnt || (entity_w[0] !=
'%')) &&
315 (skip_commat || (entity_w[0] !=
'@')) &&
316 (skip_num || (entity_w[0] !=
'#')) &&
317 (skip_lpar_rpar || (entity_w[0] !=
'(' && entity_w[0] !=
')')) &&
318 (skip_lcub_rcub || (entity_w[0] !=
'{' && entity_w[0] !=
'}')) &&
319 (skip_lsqb_rsqb || (entity_w[0] !=
'[' && entity_w[0] !=
']')))
321 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
322 size_t m = strlen(entity_w);
323 if (j + m >= count_dst)
324 throw buffer_overrun;
325 memcpy(dst + j, entity_w, m *
sizeof(*entity_w)); j += m;
327 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
332 if (j + 1 >= count_dst)
333 throw buffer_overrun;
337 throw buffer_overrun;
352 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
354 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
355 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
357 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
358 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
363 sgml2strcat(dst, src, count_src, skip, offset, map);
375 template<
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
377 _Inout_ std::basic_string<T_to, TR_to, AX_to>& dst,
378 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
380 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
381 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
383 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
399 template <
class T_to =
wchar_t,
class T_from>
401 _Inout_cap_(count_dst) T_to* dst, _In_
size_t count_dst,
402 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
404 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
405 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
407 _Assume_(dst || !count_dst);
412 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
426 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>>
427 std::basic_string<T_to, TR_to, AX_to> sgml2str(
428 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
430 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
431 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
433 std::basic_string<T_to, TR_to, AX_to> dst;
434 sgml2strcat(dst, src, count_src, skip, offset, map);
448 template <
class T_to =
wchar_t,
class T_from,
class TR_to = std::
char_traits<T_to>,
class AX_to = std::allocator<T_to>,
class TR_from = std::
char_traits<T_from>,
class AX_from = std::allocator<T_from>>
449 std::basic_string<T_to, TR_to, AX_to> sgml2str(
450 _In_
const std::basic_string<T_from, TR_from, AX_from>& src,
452 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
453 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
455 return sgml2str<T_to, T_from, TR_to, AX_to>(src.data(), src.size(), skip, offset, map);
459 inline const char* chr2sgml(_In_reads_or_z_(count)
const utf16_t* entity, _In_
size_t count)
461 _Assume_(entity && count);
465 if (count < 2 || !is_surrogate_pair(entity)) {
466 e2 =
static_cast<utf32_t
>(entity[0]);
470 e2 = surrogate_pair_to_ucs4(entity);
473 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
474 size_t m = (i + j) / 2;
475 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
481 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset);
487 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + offset, count - offset) == 0; m--);
488 return sgml_unicode[unicode_sgml[m]].sgml;
495 inline const char* chr2sgml(_In_reads_or_z_(count)
const utf32_t* entity, _In_
size_t count)
497 _Assume_(entity && count);
499 utf32_t e2 = entity[0];
500 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
501 size_t m = (i + j) / 2;
502 auto e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
508 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
514 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
515 return sgml_unicode[unicode_sgml[m]].sgml;
522 inline utf32_t wstr_to_utf32(_In_reads_(end)
const utf16_t* src, _Inout_
size_t& i, _In_
size_t end)
525 if (i + 1 >= end || !is_surrogate_pair(src + i))
528 utf32_t unicode = surrogate_pair_to_ucs4(src + i);
533 inline utf32_t wstr_to_utf32(_In_reads_(end)
const utf32_t* src, _Inout_
size_t& i, _In_
size_t end)
548 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
550 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
551 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
554 _Assume_(src || !count_src);
557 do_ascii = (what & sgml_full) == 0,
558 do_quot = (what & sgml_quot) == 0,
559 do_apos = (what & sgml_apos) == 0,
560 do_lt_gt = (what & sgml_lt_gt) == 0,
561 do_bsol = (what & sgml_bsol) == 0,
562 do_dollar = (what & sgml_dollar) == 0,
563 do_percnt = (what & sgml_percnt) == 0,
564 do_commat = (what & sgml_commat) == 0,
565 do_num = (what & sgml_num) == 0,
566 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
567 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
568 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
570 count_src = strnlen(src, count_src);
571 dst.reserve(dst.size() + count_src);
572 for (
size_t i = 0; i < count_src;) {
573 size_t n = glyphlen(src + i, count_src - i);
575 do_ascii && is7bit(src[i]) &&
577 (do_quot || (src[i] !=
'"')) &&
578 (do_apos || (src[i] !=
'\'')) &&
579 (do_lt_gt || (src[i] !=
'<' && src[i] !=
'>')) &&
580 (do_bsol || (src[i] !=
'\\')) &&
581 (do_dollar || (src[i] !=
'$')) &&
582 (do_percnt || (src[i] !=
'%')) &&
583 (do_commat || (src[i] !=
'@')) &&
584 (do_num || (src[i] !=
'#')) &&
585 (do_lpar_rpar || (src[i] !=
'(' && src[i] !=
')')) &&
586 (do_lcub_rcub || (src[i] !=
'{' && src[i] !=
'}')) &&
587 (do_lsqb_rsqb || (src[i] !=
'[' && src[i] !=
']')))
590 dst.append(1,
static_cast<char>(src[i++]));
593 const char* entity = chr2sgml(src + i, n);
603 dst.append(1,
static_cast<char>(src[i++]));
605 char tmp[3 + 8 + 1 + 1];
606 snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(src[i++]));
612 const size_t end = i + n;
614 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
620 else if (is7bit(src[i]))
621 dst.append(1,
static_cast<char>(src[i++]));
623 char tmp[3 + 8 + 1 + 1];
624 snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
640 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
642 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
643 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
646 str2sgmlcat(dst, src.data(), src.size(), what);
660 template <
class T_from =
wchar_t>
662 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
663 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
666 _Assume_(dst || !count_dst);
667 _Assume_(src || !count_src);
669 static const std::invalid_argument buffer_overrun(
"buffer overrun");
671 do_ascii = (what & sgml_full) == 0,
672 do_quot = (what & sgml_quot) == 0,
673 do_apos = (what & sgml_apos) == 0,
674 do_lt_gt = (what & sgml_lt_gt) == 0,
675 do_bsol = (what & sgml_bsol) == 0,
676 do_dollar = (what & sgml_dollar) == 0,
677 do_percnt = (what & sgml_percnt) == 0,
678 do_commat = (what & sgml_commat) == 0,
679 do_num = (what & sgml_num) == 0,
680 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
681 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
682 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
684 size_t j = strnlen(dst, count_dst);
685 count_src = strnlen(src, count_src);
686 for (
size_t i = 0; i < count_src;) {
687 size_t n = glyphlen(src + i, count_src - i);
689 do_ascii && is7bit(src[i]) &&
691 (do_quot || (src[i] !=
'"')) &&
692 (do_apos || (src[i] !=
'\'')) &&
693 (do_lt_gt || (src[i] !=
'<' && src[i] !=
'>')) &&
694 (do_bsol || (src[i] !=
'\\')) &&
695 (do_dollar || (src[i] !=
'$')) &&
696 (do_percnt || (src[i] !=
'%')) &&
697 (do_commat || (src[i] !=
'@')) &&
698 (do_num || (src[i] !=
'#')) &&
699 (do_lpar_rpar || (src[i] !=
'(' && src[i] !=
')')) &&
700 (do_lcub_rcub || (src[i] !=
'{' && src[i] !=
'}')) &&
701 (do_lsqb_rsqb || (src[i] !=
'[' && src[i] !=
']')))
704 if (j + 1 >= count_dst)
705 throw buffer_overrun;
706 dst[j++] =
static_cast<char>(src[i++]);
709 const char* entity = chr2sgml(src + i, n);
711 size_t m = strlen(entity);
712 if (j + m + 2 >= count_dst)
713 throw buffer_overrun;
715 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
721 if (is7bit(src[i])) {
722 if (j + 1 >= count_dst)
723 throw buffer_overrun;
724 dst[j++] =
static_cast<char>(src[i++]);
727 char tmp[3 + 8 + 1 + 1];
728 int m = snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(src[i++]));
730 if (
static_cast<size_t>(m) >= count_dst)
731 throw buffer_overrun;
732 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
733 j +=
static_cast<size_t>(m);
738 const size_t end = i + n;
740 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
741 size_t m = strlen(entity);
742 if (j + m + 2 >= count_dst)
743 throw buffer_overrun;
745 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
749 else if (is7bit(src[i])) {
750 if (j + 1 >= count_dst)
751 throw buffer_overrun;
752 dst[j++] =
static_cast<char>(src[i++]);
755 char tmp[3 + 8 + 1 + 1];
756 int m = snprintf(tmp, _countof(tmp),
"&#x%x;",
static_cast<unsigned int>(wstr_to_utf32(src, i, end)));
758 if (
static_cast<size_t>(m) >= count_dst)
759 throw buffer_overrun;
760 memcpy(dst + j, tmp,
static_cast<size_t>(m) *
sizeof(
char));
761 j +=
static_cast<size_t>(m);
768 throw buffer_overrun;
781 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
783 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
784 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
788 str2sgmlcat(dst, src, count_src, what);
798 template <
class T_from =
wchar_t,
class TR_to = std::
char_traits<
char>,
class AX_to = std::allocator<
char>>
800 _Inout_ std::basic_string<char, TR_to, AX_to>& dst,
801 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
804 str2sgmlcpy(dst, src.data(), src.size(), what);
818 template <
class T_from =
wchar_t>
820 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
821 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
824 _Assume_(dst || !count_dst);
827 return str2sgmlcat(dst, count_dst, src, count_src, what);
839 template <
class T_from =
wchar_t>
840 std::string str2sgml(
841 _In_reads_or_z_opt_(count_src)
const T_from* src, _In_
size_t count_src,
845 str2sgmlcat(dst, src, count_src, what);
857 template <
class T_from =
wchar_t>
858 std::string str2sgml(
859 _In_
const std::basic_string_view<T_from, std::char_traits<T_from>> src,
862 return str2sgml(src.data(), src.size(), what);
867#pragma GCC diagnostic pop