stdex
Additional custom algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
8#include "mapping.hpp"
9#include "sal.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <assert.h>
13#include <exception>
14#include <string>
15
16namespace stdex
17{
19 template <class T>
20 inline const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
21 {
22 assert(entity && count);
23 assert(count < 2 || entity[0] != '#'); // No numeric entities
24
25 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
26 size_t m = (i + j) / 2;
27 if (sgml_unicode[m].sgml[0] < entity[0])
28 i = m + 1;
29 else if (sgml_unicode[m].sgml[0] > entity[0])
30 j = m;
31 else {
32 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
33 if (r < 0)
34 i = m + 1;
35 else if (r > 0)
36 j = m;
37 else {
38 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
39 return sgml_unicode[m].unicode;
40 }
41 }
42 }
43 return nullptr;
44 }
45
46 template <class T>
47 inline const T* sgmlend(
48 _In_reads_or_z_opt_(count) const T* str, _In_ size_t count)
49 {
50 assert(str || !count);
51 for (size_t i = 0; i < count; i++) {
52 if (str[i] == ';')
53 return str + i;
54 if (!str[i] || str[i] == '&' || isspace(str[i]))
55 break;
56 }
57 return nullptr;
58 }
60
// Flags accepted by the `skip` parameter of sgml2wstr() and the `what`
// parameter of wstr2sgml(). In sgml2wstr() a set flag leaves entities decoding
// to the listed character(s) unconverted; in wstr2sgml() a set flag forces the
// listed character(s) to be written as SGML entities.

/// wstr2sgml(): never copy 7-bit ASCII characters verbatim
constexpr int sgml_full = 0x80000000;
/// Quotation mark: "
constexpr int sgml_quot = 1 << 0;
/// Apostrophe: '
constexpr int sgml_apos = 1 << 1;
/// Quotation mark and apostrophe
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;
/// Ampersand: &
constexpr int sgml_amp = 1 << 2;
/// Less-than and greater-than signs: < >
constexpr int sgml_lt_gt = 1 << 3;
/// Backslash
constexpr int sgml_bsol = 1 << 4;
/// Dollar sign: $
constexpr int sgml_dollar = 1 << 5;
/// Percent sign: %
constexpr int sgml_percnt = 1 << 6;
/// Commercial at: @
constexpr int sgml_commat = 1 << 7;
/// Number sign: #
constexpr int sgml_num = 1 << 8;
/// Parentheses: ( )
constexpr int sgml_lpar_rpar = 1 << 9;
/// Curly brackets: { }
constexpr int sgml_lcub_rcub = 1 << 10;
/// Square brackets: [ ]
constexpr int sgml_lsqb_rsqb = 1 << 11;

// Ready-made combinations.
constexpr int sgml_sgml = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_ajt_lemma = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt;
// constexpr int sgml_ajt_form = sgml_ajt_lemma;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
81
92 template <class T>
93 inline void sgml2wstr(
94 _Inout_ std::wstring& dst,
95 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
96 _In_ int skip = 0,
97 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
98 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
99 {
100 assert(src || !count_src);
101
102 const bool
103 skip_quot = (skip & sgml_quot) == 0,
104 skip_apos = (skip & sgml_apos) == 0,
105 skip_amp = (skip & sgml_amp) == 0,
106 skip_lt_gt = (skip & sgml_lt_gt) == 0,
107 skip_bsol = (skip & sgml_bsol) == 0,
108 skip_dollar = (skip & sgml_dollar) == 0,
109 skip_percnt = (skip & sgml_percnt) == 0,
110 skip_commat = (skip & sgml_commat) == 0,
111 skip_num = (skip & sgml_num) == 0,
112 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
113 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
114 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
115
116 count_src = strnlen(src, count_src);
117 dst.reserve(dst.size() + count_src);
118 for (size_t i = 0; i < count_src;) {
119 if (src[i] == '&') {
120 auto end = sgmlend(src + i + 1, count_src - i - 1);
121 if (end) {
122 const wchar_t* entity_w;
123 wchar_t chr[3];
124 size_t n = end - src - i - 1;
125 if (n >= 2 && src[i + 1] == '#') {
126 uint32_t unicode;
127 if (src[i + 2] == 'x' || src[i + 2] == 'X')
128 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
129 else
130 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
131#ifdef _WIN32
132 if (unicode < 0x10000) {
133 chr[0] = (wchar_t)unicode;
134 chr[1] = 0;
135 }
136 else {
137 ucs4_to_surrogate_pair(chr, unicode);
138 chr[2] = 0;
139 }
140#else
141 chr[0] = (wchar_t)unicode;
142 chr[1] = 0;
143#endif
144 entity_w = chr;
145 }
146 else
147 entity_w = sgml2uni(src + i + 1, n);
148
149 if (entity_w &&
150 (skip_quot || (entity_w[0] != L'"')) &&
151 (skip_apos || (entity_w[0] != L'\'')) &&
152 (skip_amp || (entity_w[0] != L'&')) &&
153 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
154 (skip_bsol || (entity_w[0] != L'\\')) &&
155 (skip_dollar || (entity_w[0] != L'$')) &&
156 (skip_percnt || (entity_w[0] != L'%')) &&
157 (skip_commat || (entity_w[0] != L'@')) &&
158 (skip_num || (entity_w[0] != L'#')) &&
159 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
160 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
161 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
162 {
163 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
164 dst.append(entity_w);
165 i = end - src + 1;
166 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
167 continue;
168 }
169 }
170 }
171 dst.append(1, src[i++]);
172 }
173 }
174
188 template <class T>
189 inline size_t sgml2wstr(
190 _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst,
191 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
192 _In_ int skip = 0,
193 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
194 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
195 {
196 assert(src || !count_src);
197
198 static const std::invalid_argument buffer_overrun("buffer overrun");
199 const bool
200 skip_quot = (skip & sgml_quot) == 0,
201 skip_apos = (skip & sgml_apos) == 0,
202 skip_amp = (skip & sgml_amp) == 0,
203 skip_lt_gt = (skip & sgml_lt_gt) == 0,
204 skip_bsol = (skip & sgml_bsol) == 0,
205 skip_dollar = (skip & sgml_dollar) == 0,
206 skip_percnt = (skip & sgml_percnt) == 0,
207 skip_commat = (skip & sgml_commat) == 0,
208 skip_num = (skip & sgml_num) == 0,
209 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
210 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
211 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
212
213 size_t j = wcsnlen(dst, count_dst);
214 count_src = strnlen(src, count_src);
215 for (size_t i = 0; i < count_src;) {
216 if (src[i] == '&') {
217 auto end = sgmlend(src + i + 1, count_src - i - 1);
218 if (end) {
219 const wchar_t* entity_w;
220 wchar_t chr[3];
221 size_t n = end - src - i - 1;
222 if (n >= 2 && src[i + 1] == '#') {
223 uint32_t unicode;
224 if (src[i + 2] == 'x' || src[i + 2] == 'X')
225 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
226 else
227 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
228#ifdef _WIN32
229 if (unicode < 0x10000) {
230 chr[0] = (wchar_t)unicode;
231 chr[1] = 0;
232 }
233 else {
234 ucs4_to_surrogate_pair(chr, unicode);
235 chr[2] = 0;
236 }
237#else
238 chr[0] = (wchar_t)unicode;
239 chr[1] = 0;
240#endif
241 entity_w = chr;
242 }
243 else
244 entity_w = sgml2uni(src + i + 1, n);
245
246 if (entity_w &&
247 (skip_quot || (entity_w[0] != L'"')) &&
248 (skip_apos || (entity_w[0] != L'\'')) &&
249 (skip_amp || (entity_w[0] != L'&')) &&
250 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
251 (skip_bsol || (entity_w[0] != L'\\')) &&
252 (skip_dollar || (entity_w[0] != L'$')) &&
253 (skip_percnt || (entity_w[0] != L'%')) &&
254 (skip_commat || (entity_w[0] != L'@')) &&
255 (skip_num || (entity_w[0] != L'#')) &&
256 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
257 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
258 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
259 {
260 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
261 size_t m = wcslen(entity_w);
262 if (j + m >= count_dst)
263 throw buffer_overrun;
264 memcpy(dst + j, entity_w, m * sizeof(wchar_t)); j += m;
265 i = end - src + 1;
266 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + j));
267 continue;
268 }
269 }
270 }
271 if (j + 1 >= count_dst)
272 throw buffer_overrun;
273 dst[j++] = src[i++];
274 }
275 if (j >= count_dst)
276 throw buffer_overrun;
277 dst[j] = 0;
278 return j;
279 }
280
292 template <class T>
293 inline void sgml2wstr(
294 _Inout_ std::wstring& dst,
295 _In_ const std::basic_string<T>& src,
296 _In_ int skip = 0,
297 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
298 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
299 {
300 sgml2wstr(dst, src.data(), src.size(), skip, offset, map);
301 }
302
314 template <class T>
315 inline std::wstring sgml2wstr(
316 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
317 _In_ int skip = 0,
318 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
319 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
320 {
321 std::wstring dst;
322 sgml2wstr(dst, src, count_src, skip, offset, map);
323 return dst;
324 }
325
336 template <class T>
337 inline std::wstring sgml2wstr(
338 _In_ const std::basic_string<T>& src,
339 _In_ int skip = 0,
340 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
341 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
342 {
343 return sgml2wstr(src.c_str(), src.size(), skip, offset, map);
344 }
345
347 inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
348 {
349 assert(entity && count);
350
351 const wchar_t e2 = entity[0];
352 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
353 size_t m = (i + j) / 2;
354 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
355 if (e1 < e2)
356 i = m + 1;
357 else if (e1 > e2)
358 j = m;
359 else {
360 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
361 if (r < 0)
362 i = m + 1;
363 else if (r > 0)
364 j = m;
365 else {
366 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
367 return sgml_unicode[unicode_sgml[m]].sgml;
368 }
369 }
370 }
371 return nullptr;
372 }
374
383 inline void wstr2sgml(
384 _Inout_ std::string& dst,
385 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
386 _In_ size_t what = 0)
387 {
388 assert(src || !count_src);
389
390 const bool
391 do_ascii = (what & sgml_full) == 0,
392 do_quot = (what & sgml_quot) == 0,
393 do_apos = (what & sgml_apos) == 0,
394 do_lt_gt = (what & sgml_lt_gt) == 0,
395 do_bsol = (what & sgml_bsol) == 0,
396 do_dollar = (what & sgml_dollar) == 0,
397 do_percnt = (what & sgml_percnt) == 0,
398 do_commat = (what & sgml_commat) == 0,
399 do_num = (what & sgml_num) == 0,
400 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
401 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
402 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
403
404 count_src = wcsnlen(src, count_src);
405 dst.reserve(dst.size() + count_src);
406 for (size_t i = 0; i < count_src;) {
407 size_t n = glyphlen(src + i, count_src - i);
408 if (n == 1 &&
409 do_ascii && (unsigned int)src[i] < 128 &&
410 src[i] != L'&' &&
411 (do_quot || (src[i] != L'"')) &&
412 (do_apos || (src[i] != L'\'')) &&
413 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
414 (do_bsol || (src[i] != L'\\')) &&
415 (do_dollar || (src[i] != L'$')) &&
416 (do_percnt || (src[i] != L'%')) &&
417 (do_commat || (src[i] != L'@')) &&
418 (do_num || (src[i] != L'#')) &&
419 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
420 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
421 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
422 {
423 // 7-bit ASCII and no desire to encode it as an SGML entity.
424 dst.append(1, static_cast<char>(src[i++]));
425 }
426 else {
427 const char* entity = chr2sgml(src + i, n);
428 if (entity) {
429 dst.append(1, '&');
430 dst.append(entity);
431 dst.append(1, ';');
432 i += n;
433 }
434 else if (n == 1) {
435 // Trivial character (1 code unit, 1 glyph), no entity available.
436 if ((unsigned int)src[i] < 128)
437 dst.append(1, static_cast<char>(src[i++]));
438 else {
439 char tmp[3 + 8 + 1 + 1];
440 snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
441 dst.append(tmp);
442 }
443 }
444 else {
445 // Non-trivial character. Decompose.
446 const size_t end = i + n;
447 while (i < end) {
448 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
449 dst.append(1, '&');
450 dst.append(entity);
451 dst.append(1, ';');
452 i++;
453 }
454 else if ((unsigned int)src[i] < 128)
455 dst.append(1, static_cast<char>(src[i++]));
456 else {
457 uint32_t unicode;
458#ifdef _WIN32
459 if (i + 1 < end && is_surrogate_pair(src + i)) {
460 unicode = surrogate_pair_to_ucs4(src + i);
461 i += 2;
462 }
463 else
464#endif
465 {
466 unicode = src[i++];
467 }
468 char tmp[3 + 8 + 1 + 1];
469 snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
470 dst.append(tmp);
471 }
472 }
473 }
474 }
475 }
476 }
477
489 inline size_t wstr2sgml(
490 _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst,
491 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
492 _In_ size_t what = 0)
493 {
494 assert(dst || !count_dst);
495 assert(src || !count_src);
496
497 static const std::invalid_argument buffer_overrun("buffer overrun");
498 const bool
499 do_ascii = (what & sgml_full) == 0,
500 do_quot = (what & sgml_quot) == 0,
501 do_apos = (what & sgml_apos) == 0,
502 do_lt_gt = (what & sgml_lt_gt) == 0,
503 do_bsol = (what & sgml_bsol) == 0,
504 do_dollar = (what & sgml_dollar) == 0,
505 do_percnt = (what & sgml_percnt) == 0,
506 do_commat = (what & sgml_commat) == 0,
507 do_num = (what & sgml_num) == 0,
508 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
509 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
510 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
511
512 size_t j = strnlen(dst, count_dst);
513 count_src = wcsnlen(src, count_src);
514 for (size_t i = 0; i < count_src;) {
515 size_t n = glyphlen(src + i, count_src - i);
516 if (n == 1 &&
517 do_ascii && (unsigned int)src[i] < 128 &&
518 src[i] != L'&' &&
519 (do_quot || (src[i] != L'"')) &&
520 (do_apos || (src[i] != L'\'')) &&
521 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
522 (do_bsol || (src[i] != L'\\')) &&
523 (do_dollar || (src[i] != L'$')) &&
524 (do_percnt || (src[i] != L'%')) &&
525 (do_commat || (src[i] != L'@')) &&
526 (do_num || (src[i] != L'#')) &&
527 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
528 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
529 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
530 {
531 // 7-bit ASCII and no desire to encode it as an SGML entity.
532 if (j + 1 >= count_dst)
533 throw buffer_overrun;
534 dst[j++] = static_cast<char>(src[i++]);
535 }
536 else {
537 const char* entity = chr2sgml(src + i, n);
538 if (entity) {
539 size_t m = strlen(entity);
540 if (j + m + 2 >= count_dst)
541 throw buffer_overrun;
542 dst[j++] = '&';
543 memcpy(dst + j, entity, m * sizeof(char)); j += m;
544 dst[j++] = ';';
545 i += n;
546 }
547 else if (n == 1) {
548 // Trivial character (1 code unit, 1 glyph), no entity available.
549 if ((unsigned int)src[i] < 128) {
550 if (j + 1 >= count_dst)
551 throw buffer_overrun;
552 dst[j++] = static_cast<char>(src[i++]);
553 }
554 else {
555 char tmp[3 + 8 + 1 + 1];
556 int m = snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
557 assert(m >= 0);
558 if (static_cast<size_t>(m) >= count_dst)
559 throw buffer_overrun;
560 memcpy(dst + j, tmp, m * sizeof(char)); j += m;
561 }
562 }
563 else {
564 // Non-trivial character. Decompose.
565 const size_t end = i + n;
566 while (i < end) {
567 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
568 size_t m = strlen(entity);
569 if (j + m + 2 >= count_dst)
570 throw buffer_overrun;
571 dst[j++] = '&';
572 memcpy(dst + j, entity, m * sizeof(char)); j += m;
573 dst[j++] = ';';
574 i++;
575 }
576 else if ((unsigned int)src[i] < 128) {
577 if (j + 1 >= count_dst)
578 throw buffer_overrun;
579 dst[j++] = static_cast<char>(src[i++]);
580 }
581 else {
582 uint32_t unicode;
583#ifdef _WIN32
584 if (i + 1 < end && is_surrogate_pair(src + i)) {
585 unicode = surrogate_pair_to_ucs4(src + i);
586 i += 2;
587 }
588 else
589#endif
590 {
591 unicode = src[i++];
592 }
593 char tmp[3 + 8 + 1 + 1];
594 int m = snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
595 assert(m >= 0);
596 if (static_cast<size_t>(m) >= count_dst)
597 throw buffer_overrun;
598 memcpy(dst + j, tmp, m * sizeof(char)); j += m;
599 }
600 }
601 }
602 }
603 }
604 if (j >= count_dst)
605 throw buffer_overrun;
606 dst[j] = 0;
607 return j;
608 }
609
617 inline void wstr2sgml(
618 _Inout_ std::string& dst,
619 _In_ const std::wstring& src,
620 _In_ size_t what = 0)
621 {
622 wstr2sgml(dst, src.c_str(), src.size(), what);
623 }
624
634 inline std::string wstr2sgml(
635 _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src,
636 _In_ size_t what = 0)
637 {
638 std::string dst;
639 wstr2sgml(dst, src, count_src, what);
640 return dst;
641 }
642
651 inline std::string wstr2sgml(
652 _In_ const std::wstring& src,
653 _In_ size_t what = 0)
654 {
655 return wstr2sgml(src.c_str(), src.size(), what);
656 }
657}