10#include "sgml_unicode.hpp"
/// Looks up a named SGML entity in the sgml_unicode table and returns the
/// wide-character string stored with it.
///
/// @param[in] entity  Entity name. Must not be null. Numeric references
///                    (hash sign first, two or more characters) are excluded
///                    by the assertion below.
/// @param[in] count   Maximum number of characters of entity to compare; must be non-zero.
///
/// @returns The unicode member of the matching table entry. The no-match path
///          is on lines not shown in this excerpt (presumably nullptr - TODO confirm).
20 inline const wchar_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count)
22 _Assume_(entity && count);
23 _Assume_(count < 2 || entity[0] !=
'#');
// Binary search over the half-open index range [i, j). This presumes
// sgml_unicode is ordered by its sgml member - TODO confirm against the table.
25 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
26 size_t m = (i + j) / 2;
// Compare the first character directly before paying for a full string
// compare. The statements adjusting i and j are on lines not shown here.
27 if (sgml_unicode[m].sgml[0] < entity[0])
29 else if (sgml_unicode[m].sgml[0] > entity[0])
// First characters are equal: compare the remainder of both names.
32 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
// On a match, step back while the preceding entries carry the same name so
// that the first entry of a run of equal names is the one returned.
38 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
39 return sgml_unicode[m].unicode;
/// Scans forward through text that follows an ampersand, looking for the end
/// of the entity.
///
/// @param[in] str    Text to scan; may be null only when count is zero (asserted below).
/// @param[in] count  Maximum number of characters of str to examine.
///
/// @returns A pointer of type const T*. The return statements are on lines not
///          shown in this excerpt. Callers in this file subtract src from the
///          result, so a successful result points into str - presumably at
///          the terminating semicolon, TODO confirm. The value returned when
///          no end is found is likewise not visible.
47 inline const T* sgmlend(
48 _In_reads_or_z_opt_(count)
const T* str, _In_
size_t count)
50 _Assume_(str || !count);
51 for (
size_t i = 0; i < count; i++) {
// A NUL, another ampersand or whitespace is tested for here. The action taken
// is on a line not shown (presumably the scan is abandoned).
54 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
// Bit flags shared by the decoders (skip argument) and the encoders (what
// argument) in this file. Each single-bit flag names one character, or one
// pair of characters, by its SGML entity name.
//
// In the decoders a set flag keeps the matching entity from being replaced.
// In the encoders a set flag removes the matching character from the verbatim
// fast path.
//
// NOTE(review): 0x80000000 does not fit a 32-bit signed int, so where int is
// 32 bits sgml_full holds the most negative int value. The encoders take their
// what argument as size_t, where that value sign-extends on 64-bit targets -
// confirm this is intended.

// Encoders: when set, no 7-bit character takes the verbatim fast path (see do_ascii).
61 constexpr int sgml_full = 0x80000000;
// Double quotation mark.
62 constexpr int sgml_quot = 0x00000001;
// Apostrophe.
63 constexpr int sgml_apos = 0x00000002;
// Both kinds of quotation mark.
64 constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
// Ampersand.
65 constexpr int sgml_amp = 0x00000004;
// Less-than and greater-than signs.
66 constexpr int sgml_lt_gt = 0x00000008;
// Backslash.
67 constexpr int sgml_bsol = 0x00000010;
// Dollar sign.
68 constexpr int sgml_dollar = 0x00000020;
// Percent sign.
69 constexpr int sgml_percnt = 0x00000040;
// Commercial at sign.
70 constexpr int sgml_commat = 0x00000080;
// Hash (number) sign.
71 constexpr int sgml_num = 0x00000100;
// Left and right parentheses.
72 constexpr int sgml_lpar_rpar = 0x00000200;
// Left and right curly brackets.
73 constexpr int sgml_lcub_rcub = 0x00000400;
// Left and right square brackets.
74 constexpr int sgml_lsqb_rsqb = 0x00000800;
// Composite: ampersand plus the angle brackets.
75 constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
// Composite: ampersand plus both quotation marks.
76 constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
// Composite: ampersand, backslash and both quotation marks.
77 constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
/// Appends SGML text to a wide string, replacing numeric and named entities
/// with the characters they stand for.
///
/// @param[in,out] dst        String to append to.
/// @param[in]     src        SGML text; may be null only when count_src is zero.
/// @param[in]     count_src  Maximum number of characters of src to read. Reading also stops at the first NUL.
/// @param[in]     skip       Bitwise combination of sgml_* flags; its declaration is on a line not shown here.
///                           An entity decoding to a character whose flag is set is not replaced.
/// @param[in]     offset     Added to the source (from) and destination (to) positions stored in map.
/// @param[in,out] map        Optional vector that receives a position pair before and after each replaced entity.
93 inline void sgml2strcat(
94 _Inout_ std::wstring& dst,
95 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
97 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
98 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
100 _Assume_(src || !count_src);
// Each skip_x is true when the matching flag is NOT set in skip, that is when
// an entity decoding to that character may be replaced.
103 skip_quot = (skip & sgml_quot) == 0,
104 skip_apos = (skip & sgml_apos) == 0,
105 skip_amp = (skip & sgml_amp) == 0,
106 skip_lt_gt = (skip & sgml_lt_gt) == 0,
107 skip_bsol = (skip & sgml_bsol) == 0,
108 skip_dollar = (skip & sgml_dollar) == 0,
109 skip_percnt = (skip & sgml_percnt) == 0,
110 skip_commat = (skip & sgml_commat) == 0,
111 skip_num = (skip & sgml_num) == 0,
112 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
113 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
114 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// Never read past an embedded NUL.
116 count_src = strnlen(src, count_src);
// Reserve room up front to limit reallocations while appending.
117 dst.reserve(dst.size() + count_src);
118 for (
size_t i = 0; i < count_src;) {
// Look for the end of an entity in the text after position i. The test that
// src[i] is an ampersand is presumably on a line not shown.
120 auto end = sgmlend(src + i + 1, count_src - i - 1);
122 const wchar_t* entity_w;
// n = number of characters in [src + i + 1, end), the entity body.
124 size_t n = end - src - i - 1;
// Numeric character reference: a hash sign followed by digits.
125 if (n >= 2 && src[i + 1] ==
'#') {
// Hexadecimal form: digits start after the hash sign and the x.
127 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
128 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
// Decimal form: digits start right after the hash sign.
130 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Code points below 0x10000 fit one wchar_t; larger ones are split by
// ucs4_to_surrogate_pair. The second plain assignment below presumably belongs
// to a build variant with a wider wchar_t (preprocessor lines not shown).
132 if (unicode < 0x10000) {
133 chr[0] = (wchar_t)unicode;
137 ucs4_to_surrogate_pair(chr, unicode);
141 chr[0] = (wchar_t)unicode;
// Named entity: table lookup.
147 entity_w = sgml2uni(src + i + 1, n);
// Replace the entity only when its first decoded character is not one the
// caller asked to leave escaped.
150 (skip_quot || (entity_w[0] != L
'"')) &&
151 (skip_apos || (entity_w[0] != L
'\'')) &&
152 (skip_amp || (entity_w[0] != L
'&')) &&
153 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
154 (skip_bsol || (entity_w[0] != L
'\\')) &&
155 (skip_dollar || (entity_w[0] != L
'$')) &&
156 (skip_percnt || (entity_w[0] != L
'%')) &&
157 (skip_commat || (entity_w[0] != L
'@')) &&
158 (skip_num || (entity_w[0] != L
'#')) &&
159 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
160 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
161 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record where the entity starts in source and destination...
163 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
164 dst.append(entity_w);
// ...and where both continue after it (i is presumably advanced past the
// entity on a line not shown between these two statements).
166 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
// Anything not replaced above is copied through one character at a time.
171 dst.append(1, src[i++]);
/// Convenience overload taking the SGML text as a std::basic_string.
/// Forwards the string data and size, together with skip, offset and map, to
/// the pointer-and-count overload. The skip parameter is declared on a line
/// not shown in this excerpt.
185 inline void sgml2strcat(
186 _Inout_ std::wstring& dst,
187 _In_
const std::basic_string<T>& src,
189 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
190 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
192 sgml2strcat(dst, src.data(), src.size(), skip, offset, map);
/// Appends SGML text to a zero-terminated wide-character buffer, replacing
/// numeric and named entities with the characters they stand for.
///
/// @param[in,out] dst        Buffer holding a zero-terminated string to extend; may be null only when count_dst is zero.
/// @param[in]     count_dst  Capacity of dst in characters.
/// @param[in]     src        SGML text; may be null only when count_src is zero.
/// @param[in]     count_src  Maximum number of characters of src to read. Reading also stops at the first NUL.
/// @param[in]     skip       Bitwise combination of sgml_* flags; its declaration is on a line not shown here.
///                           An entity decoding to a character whose flag is set is not replaced.
/// @param[in]     offset     Added to the source (from) and destination (to) positions stored in map.
/// @param[in,out] map        Optional vector that receives a position pair before and after each replaced entity.
///
/// @returns size_t; the return statement is on a line not shown (presumably the resulting length of dst).
///
/// @throws std::invalid_argument (buffer overrun) when the output does not fit in count_dst characters.
209 inline size_t sgml2strcat(
210 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
211 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
213 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
214 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
216 _Assume_(dst || !count_dst);
217 _Assume_(src || !count_src);
// One shared exception object, thrown from every overflow check below.
219 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each skip_x is true when the matching flag is NOT set in skip, that is when
// an entity decoding to that character may be replaced.
221 skip_quot = (skip & sgml_quot) == 0,
222 skip_apos = (skip & sgml_apos) == 0,
223 skip_amp = (skip & sgml_amp) == 0,
224 skip_lt_gt = (skip & sgml_lt_gt) == 0,
225 skip_bsol = (skip & sgml_bsol) == 0,
226 skip_dollar = (skip & sgml_dollar) == 0,
227 skip_percnt = (skip & sgml_percnt) == 0,
228 skip_commat = (skip & sgml_commat) == 0,
229 skip_num = (skip & sgml_num) == 0,
230 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
231 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
232 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// j is the write position: start at the end of the text already in dst.
234 size_t j = wcsnlen(dst, count_dst);
// Never read past an embedded NUL.
235 count_src = strnlen(src, count_src);
236 for (
size_t i = 0; i < count_src;) {
// Look for the end of an entity in the text after position i. The test that
// src[i] is an ampersand is presumably on a line not shown.
238 auto end = sgmlend(src + i + 1, count_src - i - 1);
240 const wchar_t* entity_w;
// n = number of characters in [src + i + 1, end), the entity body.
242 size_t n = end - src - i - 1;
// Numeric character reference: a hash sign followed by digits.
243 if (n >= 2 && src[i + 1] ==
'#') {
// Hexadecimal form: digits start after the hash sign and the x.
245 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
246 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
// Decimal form: digits start right after the hash sign.
248 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Code points below 0x10000 fit one wchar_t; larger ones are split by
// ucs4_to_surrogate_pair. The second plain assignment below presumably belongs
// to a build variant with a wider wchar_t (preprocessor lines not shown).
250 if (unicode < 0x10000) {
251 chr[0] = (wchar_t)unicode;
255 ucs4_to_surrogate_pair(chr, unicode);
259 chr[0] = (wchar_t)unicode;
// Named entity: table lookup.
265 entity_w = sgml2uni(src + i + 1, n);
// Replace the entity only when its first decoded character is not one the
// caller asked to leave escaped.
268 (skip_quot || (entity_w[0] != L
'"')) &&
269 (skip_apos || (entity_w[0] != L
'\'')) &&
270 (skip_amp || (entity_w[0] != L
'&')) &&
271 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
272 (skip_bsol || (entity_w[0] != L
'\\')) &&
273 (skip_dollar || (entity_w[0] != L
'$')) &&
274 (skip_percnt || (entity_w[0] != L
'%')) &&
275 (skip_commat || (entity_w[0] != L
'@')) &&
276 (skip_num || (entity_w[0] != L
'#')) &&
277 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
278 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
279 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record where the entity starts in source and destination.
281 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
282 size_t m = wcslen(entity_w);
// Using >= rather than > keeps one element free, presumably for the
// terminating zero written on a line not shown.
283 if (j + m >= count_dst)
284 throw buffer_overrun;
285 memcpy(dst + j, entity_w, m *
sizeof(
wchar_t)); j += m;
// Record where source and destination continue after the entity (i is
// presumably advanced on a line not shown).
287 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
// Space check for the single character copied through when nothing was replaced.
292 if (j + 1 >= count_dst)
293 throw buffer_overrun;
// Final check; its condition is on a line not shown (presumably guards the
// write of the terminating zero).
297 throw buffer_overrun;
/// Converts SGML text to a wide string, storing the result in dst.
///
/// Parameters have the same meaning as for sgml2strcat, to which the work is
/// delegated. The skip parameter is declared on a line not shown here.
///
/// NOTE(review): the lines between the signature and the call are not visible
/// in this excerpt; given the cpy name they presumably empty dst (and map)
/// first - TODO confirm.
313 inline void sgml2strcpy(
314 _Inout_ std::wstring& dst,
315 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
317 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
318 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
323 sgml2strcat(dst, src, count_src, skip, offset, map);
/// Convenience overload taking the SGML text as a std::basic_string with
/// arbitrary element, traits and allocator types.
/// Forwards the string data and size, together with skip, offset and map, to
/// the pointer-and-count sgml2strcpy overload. The skip parameter is declared
/// on a line not shown in this excerpt.
335 template<
class _Elem,
class _Traits,
class _Ax>
336 inline void sgml2strcpy(
337 _Inout_ std::wstring& dst,
338 _In_
const std::basic_string<_Elem, _Traits, _Ax>& src,
340 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
341 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
343 sgml2strcpy(dst, src.data(), src.size(), skip, offset, map);
/// Converts SGML text into a caller-supplied wide-character buffer.
///
/// Parameters have the same meaning as for the buffer overload of
/// sgml2strcat, whose result is returned unchanged. The skip parameter is
/// declared on a line not shown here.
///
/// NOTE(review): the lines between the assertion and the call are not visible
/// in this excerpt; given the cpy name they presumably reset dst to an empty
/// string (and clear map) first - TODO confirm.
360 inline size_t sgml2strcpy(
361 _Inout_cap_(count_dst)
wchar_t* dst, _In_
size_t count_dst,
362 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
364 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
365 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
367 _Assume_(dst || !count_dst);
372 return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map);
/// Converts SGML text to a wide string and returns it by value.
///
/// Parameters have the same meaning as for sgml2strcat, which does the work.
/// The declaration of the local dst and the return statement are on lines not
/// shown in this excerpt, as is the skip parameter.
387 inline std::wstring sgml2str(
388 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
390 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
391 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
394 sgml2strcat(dst, src, count_src, skip, offset, map);
/// Convenience overload taking the SGML text as a std::basic_string.
/// Forwards the string pointer and size, together with skip, offset and map,
/// to the pointer-and-count sgml2str overload and returns its result. The
/// skip parameter is declared on a line not shown in this excerpt.
409 inline std::wstring sgml2str(
410 _In_
const std::basic_string<T>& src,
412 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
413 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
415 return sgml2str(src.c_str(), src.size(), skip, offset, map);
/// Reverse lookup: finds the SGML entity name for a wide-character sequence.
///
/// @param[in] entity  Character sequence to look up; must not be null.
/// @param[in] count   Maximum number of characters of entity to compare; must be non-zero.
///
/// @returns The sgml member of the matching sgml_unicode entry. The no-match
///          path is on lines not shown; callers in this file compare the
///          result with nullptr, so that is presumably what it yields.
419 inline const char* chr2sgml(_In_reads_or_z_(count)
const wchar_t* entity, _In_
size_t count)
421 _Assume_(entity && count);
423 const wchar_t e2 = entity[0];
// Binary search over unicode_sgml, an array of indices into sgml_unicode.
// This presumes the indices are ordered by the unicode member - TODO confirm.
424 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
425 size_t m = (i + j) / 2;
// First character of the candidate, compared against e2 on lines not shown.
426 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// First characters are equal: compare the remainder of both sequences.
432 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
// On a match, step back while the preceding entries compare equal as well so
// that the first entry of a run of equal sequences is the one returned.
438 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
439 return sgml_unicode[unicode_sgml[m]].sgml;
/// Appends the SGML encoding of a wide string to a std::string.
///
/// @param[in,out] dst        String to append to.
/// @param[in]     src        Text to encode; may be null only when count_src is zero.
/// @param[in]     count_src  Maximum number of characters of src to read. Reading also stops at the first NUL.
/// @param[in]     what       Bitwise combination of sgml_* flags. A 7-bit character whose flag is set
///                           is kept off the verbatim path below. sgml_full does that for all of them.
455 inline void str2sgmlcat(
456 _Inout_ std::string& dst,
457 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
458 _In_
size_t what = 0)
460 _Assume_(src || !count_src);
// Each do_x is true when the matching flag is NOT set in what, that is when
// the character may be written verbatim.
463 do_ascii = (what & sgml_full) == 0,
464 do_quot = (what & sgml_quot) == 0,
465 do_apos = (what & sgml_apos) == 0,
466 do_lt_gt = (what & sgml_lt_gt) == 0,
467 do_bsol = (what & sgml_bsol) == 0,
468 do_dollar = (what & sgml_dollar) == 0,
469 do_percnt = (what & sgml_percnt) == 0,
470 do_commat = (what & sgml_commat) == 0,
471 do_num = (what & sgml_num) == 0,
472 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
473 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
474 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// Never read past an embedded NUL.
476 count_src = wcsnlen(src, count_src);
// Reserve room up front to limit reallocations while appending.
477 dst.reserve(dst.size() + count_src);
478 for (
size_t i = 0; i < count_src;) {
// n = length, in wchar_t units, of the glyph starting at i as reported by glyphlen.
479 size_t n = glyphlen(src + i, count_src - i);
// Fast path: a 7-bit character the caller did not ask to escape is copied
// verbatim. One more term of this condition sits on a line not shown.
481 do_ascii && (
unsigned int)src[i] < 128 &&
483 (do_quot || (src[i] != L
'"')) &&
484 (do_apos || (src[i] != L
'\'')) &&
485 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
486 (do_bsol || (src[i] != L
'\\')) &&
487 (do_dollar || (src[i] != L
'$')) &&
488 (do_percnt || (src[i] != L
'%')) &&
489 (do_commat || (src[i] != L
'@')) &&
490 (do_num || (src[i] != L
'#')) &&
491 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
492 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
493 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
496 dst.append(1,
static_cast<char>(src[i++]));
// Otherwise try to find a named entity for the whole glyph. The code that
// writes a found entity is on lines not shown.
499 const char* entity = chr2sgml(src + i, n);
// Fallback for a glyph without a named entity (presumably the single-unit
// case - the enclosing tests are not shown): 7-bit characters go out as they
// are, everything else as a hexadecimal numeric reference.
508 if ((
unsigned int)src[i] < 128)
509 dst.append(1,
static_cast<char>(src[i++]));
// Room for the three-character prefix, up to 8 hex digits, the semicolon and
// the terminating NUL.
511 char tmp[3 + 8 + 1 + 1];
512 snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// Remaining case (presumably a glyph made of several units): process its
// units one by one up to end.
518 const size_t end = i + n;
520 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
526 else if ((
unsigned int)src[i] < 128)
527 dst.append(1,
static_cast<char>(src[i++]));
// Combine a surrogate pair into one code point before formatting it.
531 if (i + 1 < end && is_surrogate_pair(src + i)) {
532 unicode = surrogate_pair_to_ucs4(src + i);
540 char tmp[3 + 8 + 1 + 1];
541 snprintf(tmp, _countof(tmp),
"&#x%x;", unicode);
/// Convenience overload taking the text to encode as a std::wstring.
/// Forwards the string pointer and size, together with what, to the
/// pointer-and-count overload.
557 inline void str2sgmlcat(
558 _Inout_ std::string& dst,
559 _In_
const std::wstring& src,
560 _In_
size_t what = 0)
562 str2sgmlcat(dst, src.c_str(), src.size(), what);
/// Appends the SGML encoding of a wide string to a zero-terminated char buffer.
///
/// @param[in,out] dst        Buffer holding a zero-terminated string to extend; may be null only when count_dst is zero.
/// @param[in]     count_dst  Capacity of dst in characters.
/// @param[in]     src        Text to encode; may be null only when count_src is zero.
/// @param[in]     count_src  Maximum number of characters of src to read. Reading also stops at the first NUL.
/// @param[in]     what       Bitwise combination of sgml_* flags. A 7-bit character whose flag is set
///                           is kept off the verbatim path below. sgml_full does that for all of them.
///
/// @returns size_t; the return statement is on a line not shown (presumably the resulting length of dst).
///
/// @throws std::invalid_argument (buffer overrun) when the output does not fit in count_dst characters.
576 inline size_t str2sgmlcat(
577 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
578 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
579 _In_
size_t what = 0)
581 _Assume_(dst || !count_dst);
582 _Assume_(src || !count_src);
// One shared exception object, thrown from every overflow check below.
584 static const std::invalid_argument buffer_overrun(
"buffer overrun");
// Each do_x is true when the matching flag is NOT set in what, that is when
// the character may be written verbatim.
586 do_ascii = (what & sgml_full) == 0,
587 do_quot = (what & sgml_quot) == 0,
588 do_apos = (what & sgml_apos) == 0,
589 do_lt_gt = (what & sgml_lt_gt) == 0,
590 do_bsol = (what & sgml_bsol) == 0,
591 do_dollar = (what & sgml_dollar) == 0,
592 do_percnt = (what & sgml_percnt) == 0,
593 do_commat = (what & sgml_commat) == 0,
594 do_num = (what & sgml_num) == 0,
595 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
596 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
597 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// j is the write position: start at the end of the text already in dst.
599 size_t j = strnlen(dst, count_dst);
// Never read past an embedded NUL.
600 count_src = wcsnlen(src, count_src);
601 for (
size_t i = 0; i < count_src;) {
// n = length, in wchar_t units, of the glyph starting at i as reported by glyphlen.
602 size_t n = glyphlen(src + i, count_src - i);
// Fast path: a 7-bit character the caller did not ask to escape is copied
// verbatim. One more term of this condition sits on a line not shown.
604 do_ascii && (
unsigned int)src[i] < 128 &&
606 (do_quot || (src[i] != L
'"')) &&
607 (do_apos || (src[i] != L
'\'')) &&
608 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
609 (do_bsol || (src[i] != L
'\\')) &&
610 (do_dollar || (src[i] != L
'$')) &&
611 (do_percnt || (src[i] != L
'%')) &&
612 (do_commat || (src[i] != L
'@')) &&
613 (do_num || (src[i] != L
'#')) &&
614 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
615 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
616 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
// Using >= rather than > keeps one element free, presumably for the
// terminating zero written on a line not shown.
619 if (j + 1 >= count_dst)
620 throw buffer_overrun;
621 dst[j++] =
static_cast<char>(src[i++]);
// Otherwise try to find a named entity for the whole glyph.
624 const char* entity = chr2sgml(src + i, n);
626 size_t m = strlen(entity);
// The entity name is framed by two extra characters (presumably the ampersand
// and the semicolon, written on lines not shown), hence m + 2.
627 if (j + m + 2 >= count_dst)
628 throw buffer_overrun;
630 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
// Fallback for a glyph without a named entity (presumably the single-unit
// case - the enclosing tests are not shown): 7-bit characters go out as they
// are, everything else as a hexadecimal numeric reference.
636 if ((
unsigned int)src[i] < 128) {
637 if (j + 1 >= count_dst)
638 throw buffer_overrun;
639 dst[j++] =
static_cast<char>(src[i++]);
// Room for the three-character prefix, up to 8 hex digits, the semicolon and
// the terminating NUL.
642 char tmp[3 + 8 + 1 + 1];
643 int m = snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// The space check has to count the j characters already in dst, like every
// other check in this function. Comparing m alone let memcpy write past the
// end of dst once j was non-zero.
645 if (
j + static_cast<size_t>(m) >= count_dst)
646 throw buffer_overrun;
647 memcpy(dst + j, tmp, m *
sizeof(
char)); j += m;
// Remaining case (presumably a glyph made of several units): process its
// units one by one up to end.
652 const size_t end = i + n;
654 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
655 size_t m = strlen(entity);
656 if (j + m + 2 >= count_dst)
657 throw buffer_overrun;
659 memcpy(dst + j, entity, m *
sizeof(
char)); j += m;
663 else if ((
unsigned int)src[i] < 128) {
664 if (j + 1 >= count_dst)
665 throw buffer_overrun;
666 dst[j++] =
static_cast<char>(src[i++]);
// Combine a surrogate pair into one code point before formatting it.
671 if (i + 1 < end && is_surrogate_pair(src + i)) {
672 unicode = surrogate_pair_to_ucs4(src + i);
680 char tmp[3 + 8 + 1 + 1];
681 int m = snprintf(tmp, _countof(tmp),
"&#x%x;", unicode);
// Same correction as above: include the current write position j.
683 if (
j + static_cast<size_t>(m) >= count_dst)
684 throw buffer_overrun;
685 memcpy(dst + j, tmp, m *
sizeof(
char)); j += m;
// Final check; its condition is on a line not shown (presumably guards the
// write of the terminating zero).
692 throw buffer_overrun;
/// Encodes a wide string as SGML, storing the result in dst.
///
/// Parameters have the same meaning as for str2sgmlcat, to which the work is
/// delegated.
///
/// NOTE(review): the lines between the signature and the call are not visible
/// in this excerpt; given the cpy name they presumably empty dst first - TODO
/// confirm.
705 inline void str2sgmlcpy(
706 _Inout_ std::string& dst,
707 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
708 _In_
size_t what = 0)
711 str2sgmlcat(dst, src, count_src, what);
/// Convenience overload taking the text to encode as a std::wstring.
/// Forwards the string data and size, together with what, to the
/// pointer-and-count str2sgmlcpy overload.
721 inline void str2sgmlcpy(
722 _Inout_ std::string& dst,
723 _In_
const std::wstring& src,
724 _In_
size_t what = 0)
726 str2sgmlcpy(dst, src.data(), src.size(), what);
/// Encodes a wide string as SGML into a caller-supplied char buffer.
///
/// Parameters have the same meaning as for the buffer overload of
/// str2sgmlcat, whose result is returned unchanged.
///
/// NOTE(review): the lines between the assertion and the call are not visible
/// in this excerpt; given the cpy name they presumably reset dst to an empty
/// string first - TODO confirm.
740 inline size_t str2sgmlcpy(
741 _Inout_cap_(count_dst)
char* dst, _In_
size_t count_dst,
742 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
743 _In_
size_t what = 0)
745 _Assume_(dst || !count_dst);
748 return str2sgmlcat(dst, count_dst, src, count_src, what);
/// Encodes a wide string as SGML and returns the result by value.
///
/// Parameters have the same meaning as for str2sgmlcat, which does the work.
/// The declaration of the local dst and the return statement are on lines not
/// shown in this excerpt.
760 inline std::string str2sgml(
761 _In_reads_or_z_opt_(count_src)
const wchar_t* src, _In_
size_t count_src,
762 _In_
size_t what = 0)
765 str2sgmlcat(dst, src, count_src, what);
/// Convenience overload taking the text to encode as a std::wstring.
/// Forwards the string pointer and size, together with what, to the
/// pointer-and-count str2sgml overload and returns its result.
777 inline std::string str2sgml(
778 _In_
const std::wstring& src,
779 _In_
size_t what = 0)
781 return str2sgml(src.c_str(), src.size(), what);