10#include "sgml_unicode.hpp"
/// Looks up the Unicode text of a named SGML entity in the sgml_unicode table.
///
/// \param[in] entity  Entity name. Asserted non-null and, when at least two characters long, asserted not to start with '#'.
/// \param[in] count   Number of characters in entity. Asserted non-zero.
///
/// \returns On a hit, the `unicode` member of the first sgml_unicode entry whose `sgml` name compares equal to entity.
///
/// NOTE(review): the updates of i/j inside the bisection and the value returned when nothing matches are not visible in this excerpt - confirm against the full file.
19 inline const wchar_t* sgml2uni(_In_reads_or_z_(count)
const T* entity, _In_
size_t count)
21 assert(entity && count);
22 assert(count < 2 || entity[0] !=
'#');
// Bisect sgml_unicode over [i, j); the first character of the name is compared before anything else.
24 for (
size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
25 size_t m = (i + j) / 2;
26 if (sgml_unicode[m].sgml[0] < entity[0])
28 else if (sgml_unicode[m].sgml[0] > entity[0])
// Compare the remainder of both names (everything after the first character).
31 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
// Step m back while the preceding entry also compares equal, so the first equal entry is the one returned.
37 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
38 return sgml_unicode[m].unicode;
/// Scans at most count characters of str, testing each one for a zero terminator, '&' or whitespace.
///
/// \param[in] str  Text to scan. May be null only when count is zero (asserted).
///
/// NOTE(review): the declaration of `count`, the action taken when the test below succeeds, and every return statement are not visible in this excerpt.
/// Judging by the name, this presumably locates the end of an SGML entity - confirm against the full file.
46 inline const T* sgmlend(
47 _In_reads_or_z_opt_(count)
const T* str,
50 assert(str || !count);
51 for (
size_t i = 0; i < count; i++) {
// Per-character test: zero terminator, start of another entity ('&'), or whitespace.
54 if (!str[i] || str[i] ==
'&' || isspace(str[i]))
// Bit flags naming groups of characters for the SGML conversion routines.
// Every single-bit flag is written as a shift so that its bit position is explicit.

// Single-character groups.
constexpr int sgml_quot      = 1 << 0;   // '"'
constexpr int sgml_apos      = 1 << 1;   // '\''
constexpr int sgml_amp       = 1 << 2;   // '&'
constexpr int sgml_lt_gt     = 1 << 3;   // '<' and '>'
constexpr int sgml_bsol      = 1 << 4;   // '\\'
constexpr int sgml_dollar    = 1 << 5;   // '$'
constexpr int sgml_percnt    = 1 << 6;   // '%'
constexpr int sgml_commat    = 1 << 7;   // '@'
constexpr int sgml_num       = 1 << 8;   // '#'
constexpr int sgml_lpar_rpar = 1 << 9;   // '(' and ')'
constexpr int sgml_lcub_rcub = 1 << 10;  // '{' and '}'
constexpr int sgml_lsqb_rsqb = 1 << 11;  // '[' and ']'

// Top bit (0x80000000); wstr2sgml() tests it to derive its do_ascii switch.
constexpr int sgml_full      = static_cast<int>(1u << 31);

// Combinations of the flags above.
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
/// Appends the decoded form of SGML text to dst, replacing numeric and named entities with wide characters.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        SGML text. May be null only when count_src is zero (asserted).
/// \param[in]     count_src  Maximum number of characters of src to read; shortened with strnlen() so reading stops at a zero terminator.
/// \param[in]     offset     Its `from` and `to` members are added to the source index and destination size recorded in map.
/// \param[in,out] map        When non-null, receives a mapping entry before and after each appended entity replacement.
///
/// NOTE(review): the body reads `skip`, whose declaration is not visible in this excerpt - presumably a parameter combining sgml_* flags; confirm.
/// The branch structure (if/else nesting, braces) and the advance of i past an entity are likewise not visible here.
95 inline void sgml2wstr(
96 _Inout_ std::wstring& dst,
97 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
99 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
100 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
102 assert(src || !count_src);
// Each skip_* switch is true when the corresponding sgml_* bit is NOT set in skip.
105 skip_quot = (skip & sgml_quot) == 0,
106 skip_apos = (skip & sgml_apos) == 0,
107 skip_amp = (skip & sgml_amp) == 0,
108 skip_lt_gt = (skip & sgml_lt_gt) == 0,
109 skip_bsol = (skip & sgml_bsol) == 0,
110 skip_dollar = (skip & sgml_dollar) == 0,
111 skip_percnt = (skip & sgml_percnt) == 0,
112 skip_commat = (skip & sgml_commat) == 0,
113 skip_num = (skip & sgml_num) == 0,
114 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
115 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
116 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
// Clamp the length to the first zero terminator and reserve room in dst for count_src more characters.
118 count_src = strnlen(src, count_src);
119 dst.reserve(dst.size() + count_src);
120 for (
size_t i = 0; i < count_src;) {
// Examine the text following src[i]; sgmlend() presumably yields the entity terminator position - confirm.
122 auto end = sgmlend(src + i + 1, count_src - i - 1);
124 const wchar_t* entity_w;
// n: number of characters strictly between src[i] and *end.
126 size_t n = end - src - i - 1;
// Numeric character reference: '#' followed by hexadecimal digits (after x/X) or decimal digits.
127 if (n >= 2 && src[i + 1] ==
'#') {
129 if (src[i + 2] ==
'x' || src[i + 2] ==
'X')
130 unicode = strtou32(src + i + 3, n - 2,
nullptr, 16);
132 unicode = strtou32(src + i + 2, n - 1,
nullptr, 10);
// Code points below 0x10000 are stored as one wchar_t; ucs4_to_surrogate_pair() handles the others.
// The second direct store further down presumably belongs to a different preprocessor branch - not visible here.
134 if (unicode < 0x10000) {
135 chr[0] = (wchar_t)unicode;
139 ucs4_to_surrogate_pair(chr, unicode);
143 chr[0] = (wchar_t)unicode;
// Named entity: resolve it through the lookup table.
149 entity_w = sgml2uni(src + i + 1, n);
// For every group whose bit is set in skip, this test fails when the replacement's first character belongs to that group.
152 (skip_quot || (entity_w[0] != L
'"')) &&
153 (skip_apos || (entity_w[0] != L
'\'')) &&
154 (skip_amp || (entity_w[0] != L
'&')) &&
155 (skip_lt_gt || (entity_w[0] != L
'<' && entity_w[0] != L
'>')) &&
156 (skip_bsol || (entity_w[0] != L
'\\')) &&
157 (skip_dollar || (entity_w[0] != L
'$')) &&
158 (skip_percnt || (entity_w[0] != L
'%')) &&
159 (skip_commat || (entity_w[0] != L
'@')) &&
160 (skip_num || (entity_w[0] != L
'#')) &&
161 (skip_lpar_rpar || (entity_w[0] != L
'(' && entity_w[0] != L
')')) &&
162 (skip_lcub_rcub || (entity_w[0] != L
'{' && entity_w[0] != L
'}')) &&
163 (skip_lsqb_rsqb || (entity_w[0] != L
'[' && entity_w[0] != L
']')))
// Record source/destination positions before and after appending the replacement text.
165 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
166 dst.append(entity_w);
168 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
// Copy one source character to dst unchanged and advance.
173 dst.append(1, src[i++]);
/// Appends the decoded form of an SGML string object to dst.
///
/// Forwards src.data() and src.size(), together with skip, offset and map, to the pointer/count overload of sgml2wstr().
///
/// \param[in,out] dst     String to append to
/// \param[in]     src     SGML text
/// \param[in]     offset  Forwarded unchanged
/// \param[in,out] map     Forwarded unchanged
///
/// NOTE(review): the declaration of `skip` is not visible in this excerpt - presumably a parameter; confirm.
189 inline void sgml2wstr(
190 _Inout_ std::wstring& dst,
191 _In_
const std::basic_string<T>& src,
193 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
194 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
196 sgml2wstr(dst, src.data(), src.size(), skip, offset, map);
/// Decodes SGML text given as pointer and length, with a std::wstring return type.
///
/// Forwards all arguments to the overload of sgml2wstr() that appends to a destination string.
///
/// \param[in]     src        SGML text
/// \param[in]     count_src  Maximum number of characters of src to read
/// \param[in]     offset     Forwarded unchanged
/// \param[in,out] map        Forwarded unchanged
///
/// NOTE(review): the declarations of `skip` and `dst`, and the return statement, are not visible in this excerpt - presumably dst is a local that is returned; confirm.
211 inline std::wstring sgml2wstr(
212 _In_reads_or_z_opt_(count_src)
const T* src, _In_
size_t count_src,
214 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
215 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
218 sgml2wstr(dst, src, count_src, skip, offset, map);
/// Decodes an SGML string object and returns the result.
///
/// Forwards src.c_str() and src.size(), together with skip, offset and map, to the pointer/count overload and returns its result.
///
/// \param[in]     src     SGML text
/// \param[in]     offset  Forwarded unchanged
/// \param[in,out] map     Forwarded unchanged
///
/// NOTE(review): the declaration of `skip` is not visible in this excerpt - presumably a parameter; confirm.
233 inline std::wstring sgml2wstr(
234 _In_
const std::basic_string<T>& src,
236 _In_
const mapping<size_t>& offset = mapping<size_t>(0, 0),
237 _Inout_opt_ mapping_vector<size_t>* map =
nullptr)
239 return sgml2wstr(src.c_str(), src.size(), skip, offset, map);
/// Looks up the SGML entity name for a Unicode character sequence.
///
/// \param[in] entity  Unicode text to look up. Asserted non-null.
/// \param[in] count   Number of characters in entity. Asserted non-zero.
///
/// \returns On a hit, the `sgml` member of the first entry (in unicode_sgml order) whose `unicode` text compares equal to entity.
///
/// NOTE(review): the updates of i/j inside the bisection and the value returned when nothing matches are not visible in this excerpt - confirm against the full file.
243 inline const char* chr2sgml(_In_reads_or_z_(count)
const wchar_t* entity, _In_
size_t count)
245 assert(entity && count);
247 const wchar_t e2 = entity[0];
// Bisect unicode_sgml over [i, j); its elements are used as indices into sgml_unicode.
248 for (
size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
249 size_t m = (i + j) / 2;
// e1: first character of the candidate's Unicode text, to be compared with e2.
250 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
// Compare the remainder of both texts (everything after the first character).
256 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
// Step m back while the preceding entry also compares equal, so the first equal entry is the one returned.
262 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
263 return sgml_unicode[unicode_sgml[m]].sgml;
/// Appends the SGML-encoded form of wide text to dst.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Wide text. May be null only when count_src is zero (asserted).
/// \param[in]     count_src  Maximum number of characters of src to read; shortened with wcsnlen() so reading stops at a zero terminator.
/// \param[in]     what       Combination of sgml_* flags. Every do_* switch in the body is true when its bit is clear in what.
///
/// NOTE(review): the branch structure (if/else nesting, braces) and several statements are not visible in this excerpt; the inline notes describe only the visible lines.
279 inline void wstr2sgml(
280 _Inout_ std::string& dst,
281 _In_reads_or_z_opt_(count_src)
const wchar_t* src,
282 _In_
size_t count_src,
283 _In_
size_t what = 0)
285 assert(src || !count_src);
// Each do_* switch is true when the corresponding sgml_* bit is NOT set in what.
288 do_ascii = (what & sgml_full) == 0,
289 do_quot = (what & sgml_quot) == 0,
290 do_apos = (what & sgml_apos) == 0,
291 do_lt_gt = (what & sgml_lt_gt) == 0,
292 do_bsol = (what & sgml_bsol) == 0,
293 do_dollar = (what & sgml_dollar) == 0,
294 do_percnt = (what & sgml_percnt) == 0,
295 do_commat = (what & sgml_commat) == 0,
296 do_num = (what & sgml_num) == 0,
297 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
298 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
299 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
// Clamp the length to the first zero terminator and reserve room in dst for count_src more characters.
301 count_src = wcsnlen(src, count_src);
302 dst.reserve(dst.size() + count_src);
303 for (
size_t i = 0; i < count_src;) {
// n: length of the glyph starting at src[i], as reported by glyphlen().
304 size_t n = glyphlen(src + i, count_src - i);
// Test for a character below 128 that belongs to no group whose bit is set in what (requires do_ascii).
306 do_ascii && (
unsigned int)src[i] < 128 &&
308 (do_quot || (src[i] != L
'"')) &&
309 (do_apos || (src[i] != L
'\'')) &&
310 (do_lt_gt || (src[i] != L
'<' && src[i] != L
'>')) &&
311 (do_bsol || (src[i] != L
'\\')) &&
312 (do_dollar || (src[i] != L
'$')) &&
313 (do_percnt || (src[i] != L
'%')) &&
314 (do_commat || (src[i] != L
'@')) &&
315 (do_num || (src[i] != L
'#')) &&
316 (do_lpar_rpar || (src[i] != L
'(' && src[i] != L
')')) &&
317 (do_lcub_rcub || (src[i] != L
'{' && src[i] != L
'}')) &&
318 (do_lsqb_rsqb || (src[i] != L
'[' && src[i] != L
']')))
// Append the character narrowed to char and advance.
321 dst.append(1, (
char)src[i++]);
// Look up an entity name for the whole glyph (n characters).
324 const char* entity = chr2sgml(src + i, n);
// Characters below 128 are appended as-is; the snprintf() below formats a hexadecimal numeric character reference.
333 if ((
unsigned int)src[i] < 128)
334 dst.append(1, (
char)src[i++]);
// Buffer size: 3 for "&#x", 8 hexadecimal digits, 1 for ';', 1 for the zero terminator.
336 char tmp[3 + 8 + 1 + 1];
337 snprintf(tmp, _countof(tmp),
"&#x%x;", src[i++]);
// end: index one past the last character of the current glyph.
343 const size_t end = i + n;
// Look up an entity name for the single character at src[i].
345 if ((entity = chr2sgml(src + i, 1)) !=
nullptr) {
351 else if ((
unsigned int)src[i] < 128)
352 dst.append(1, (
char)src[i++]);
// A surrogate pair lying entirely inside the glyph is combined into one code point.
356 if (i + 1 < end && is_surrogate_pair(src + i)) {
357 unicode = surrogate_pair_to_ucs4(src + i);
// Same buffer layout as above: "&#x", up to 8 hexadecimal digits, ';', zero terminator.
365 char tmp[3 + 8 + 1 + 1];
366 snprintf(tmp, _countof(tmp),
"&#x%x;", unicode);
/// Appends the SGML-encoded form of a wide string object to dst.
///
/// Forwards src.c_str(), src.size() and what to the pointer/count overload of wstr2sgml().
///
/// \param[in,out] dst   String to append to
/// \param[in]     src   Wide text to encode
/// \param[in]     what  Combination of sgml_* flags, forwarded unchanged
382 inline void wstr2sgml(
383 _Inout_ std::string& dst,
384 _In_
const std::wstring& src,
385 _In_
size_t what = 0)
387 wstr2sgml(dst, src.c_str(), src.size(), what);
/// Encodes wide text given as pointer and length, with a std::string return type.
///
/// Forwards src, count_src and what to the overload of wstr2sgml() that appends to a destination string.
///
/// \param[in] src        Wide text to encode
/// \param[in] count_src  Maximum number of characters of src to read
/// \param[in] what       Combination of sgml_* flags, forwarded unchanged
///
/// NOTE(review): the declaration of `dst` and the return statement are not visible in this excerpt - presumably dst is a local that is returned; confirm.
399 inline std::string wstr2sgml(
400 _In_reads_or_z_opt_(count_src)
const wchar_t* src,
401 _In_
size_t count_src,
402 _In_
size_t what = 0)
405 wstr2sgml(dst, src, count_src, what);
/// Encodes a wide string object as SGML and returns the result.
///
/// Forwards src.c_str(), src.size() and what to the pointer/count overload and returns its result.
///
/// \param[in] src   Wide text to encode
/// \param[in] what  Combination of sgml_* flags, forwarded unchanged
417 inline std::string wstr2sgml(
418 _In_
const std::wstring& src,
419 _In_
size_t what = 0)
421 return wstr2sgml(src.c_str(), src.size(), what);