stdex
Additional custom algorithms, or algorithms not covered by Standard C++
Loading...
Searching...
No Matches
sgml.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2023 Amebis
4*/
5
6#pragma once
7
8#include "mapping.hpp"
9#include "sal.hpp"
10#include "sgml_unicode.hpp"
11#include "string.hpp"
12#include <assert.h>
13
14namespace stdex
15{
17 template <class T>
18 inline const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
19 {
20 assert(entity && count);
21 assert(count < 2 || entity[0] != '#'); // No numeric entities
22
23 for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
24 size_t m = (i + j) / 2;
25 if (sgml_unicode[m].sgml[0] < entity[0])
26 i = m + 1;
27 else if (sgml_unicode[m].sgml[0] > entity[0])
28 j = m;
29 else {
30 auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
31 if (r < 0)
32 i = m + 1;
33 else if (r > 0)
34 j = m;
35 else {
36 for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
37 return sgml_unicode[m].unicode;
38 }
39 }
40 }
41 return nullptr;
42 }
43
///
/// Finds the semicolon that terminates an SGML entity.
///
/// The text is scanned from its beginning. The scan succeeds on the first `;`
/// and gives up on the first NUL, `&` or ASCII whitespace character
/// (space, `\t`, `\n`, `\v`, `\f`, `\r`), or when `count` characters were examined.
///
/// \param[in] str    Text to scan. May be `nullptr` only when `count` is zero.
/// \param[in] count  Maximum number of characters of `str` to examine
///
/// \returns Pointer to the terminating `;`, or `nullptr` when none was found.
///
template <class T>
inline const T* sgmlend(
    _In_reads_or_z_opt_(count) const T* str,
    _In_ size_t count)
{
    assert(str || !count);
    for (size_t i = 0; i < count; i++) {
        const T chr = str[i];
        if (chr == ';')
            return str + i;
        // Whitespace is tested explicitly rather than with isspace(): the <ctype.h> function
        // is undefined for negative `char` values and for wide characters above 255,
        // both of which occur in real text.
        if (!chr || chr == '&' || chr == ' ' || (chr >= '\t' && chr <= '\r'))
            break;
    }
    return nullptr;
}
59
// Flags for the `skip` parameter of sgml2str() and the `what` parameter of str2sgml().
//
// sgml2str(): entities decoding to a flagged character are left undecoded.
// str2sgml(): flagged ASCII characters are written as entities instead of verbatim.

/// str2sgml(): look up an entity for every character, plain ASCII included. Not consulted by sgml2str().
constexpr int sgml_full = 0x80000000;

constexpr int sgml_quot = 1 << 0;       ///< `"`
constexpr int sgml_apos = 1 << 1;       ///< `'`
constexpr int sgml_quot_apos = sgml_apos | sgml_quot; ///< `"` and `'`
constexpr int sgml_amp = 1 << 2;        ///< `&` (sgml2str() only; str2sgml() always encodes `&`)
constexpr int sgml_lt_gt = 1 << 3;      ///< `<` and `>`
constexpr int sgml_bsol = 1 << 4;       ///< `\`
constexpr int sgml_dollar = 1 << 5;     ///< `$`
constexpr int sgml_percnt = 1 << 6;     ///< `%`
constexpr int sgml_commat = 1 << 7;     ///< `@`
constexpr int sgml_num = 1 << 8;        ///< `#`
constexpr int sgml_lpar_rpar = 1 << 9;  ///< `(` and `)`
constexpr int sgml_lcub_rcub = 1 << 10; ///< `{` and `}`
constexpr int sgml_lsqb_rsqb = 1 << 11; ///< `[` and `]`

// Ready-made combinations.
constexpr int sgml_sgml = sgml_lt_gt | sgml_amp;                 ///< `&`, `<` and `>`
constexpr int sgml_ml_attrib = sgml_quot_apos | sgml_amp;        ///< `&`, `"` and `'`
constexpr int sgml_c = sgml_quot_apos | sgml_bsol | sgml_amp;    ///< `&`, `\`, `"` and `'`
// constexpr int sgml_ajt_lemma = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt;
// constexpr int sgml_ajt_form = sgml_ajt_lemma;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
80
93 template <class T>
94 inline void sgml2str(
95 _Inout_ std::wstring& dst,
96 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
97 _In_ int skip = 0,
98 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
99 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
100 {
101 assert(src || !count_src);
102
103 const bool
104 skip_quot = (skip & sgml_quot) == 0,
105 skip_apos = (skip & sgml_apos) == 0,
106 skip_amp = (skip & sgml_amp) == 0,
107 skip_lt_gt = (skip & sgml_lt_gt) == 0,
108 skip_bsol = (skip & sgml_bsol) == 0,
109 skip_dollar = (skip & sgml_dollar) == 0,
110 skip_percnt = (skip & sgml_percnt) == 0,
111 skip_commat = (skip & sgml_commat) == 0,
112 skip_num = (skip & sgml_num) == 0,
113 skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
114 skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
115 skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
116
117 count_src = strnlen(src, count_src);
118 dst.reserve(dst.size() + count_src);
119 for (size_t i = 0; i < count_src;) {
120 if (src[i] == '&') {
121 auto end = sgmlend(src + i + 1, count_src - i - 1);
122 if (end) {
123 const wchar_t* entity_w;
124 wchar_t chr[3];
125 size_t n = end - src - i - 1;
126 if (n >= 2 && src[i + 1] == '#') {
127 uint32_t unicode;
128 if (src[i + 2] == 'x' || src[i + 2] == 'X')
129 unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
130 else
131 unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
132#ifdef _WIN32
133 if (unicode < 0x10000) {
134 chr[0] = (wchar_t)unicode;
135 chr[1] = 0;
136 }
137 else {
138 ucs4_to_surrogate_pair(chr, unicode);
139 chr[2] = 0;
140 }
141#else
142 chr[0] = (wchar_t)unicode;
143 chr[1] = 0;
144#endif
145 entity_w = chr;
146 }
147 else
148 entity_w = sgml2uni(src + i + 1, n);
149
150 if (entity_w &&
151 (skip_quot || (entity_w[0] != L'"')) &&
152 (skip_apos || (entity_w[0] != L'\'')) &&
153 (skip_amp || (entity_w[0] != L'&')) &&
154 (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
155 (skip_bsol || (entity_w[0] != L'\\')) &&
156 (skip_dollar || (entity_w[0] != L'$')) &&
157 (skip_percnt || (entity_w[0] != L'%')) &&
158 (skip_commat || (entity_w[0] != L'@')) &&
159 (skip_num || (entity_w[0] != L'#')) &&
160 (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
161 (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
162 (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
163 {
164 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
165 dst.append(entity_w);
166 i = end - src + 1;
167 if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
168 continue;
169 }
170 }
171 }
172 dst.append(1, src[i++]);
173 }
174 }
175
187 template <class T>
188 inline void sgml2str(
189 _Inout_ std::wstring& dst,
190 _In_ const std::basic_string<T>& src,
191 _In_ int skip = 0,
192 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
193 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
194 {
195 sgml2str(dst, src.data(), src.size(), skip, offset, map);
196 }
197
209 template <class T>
210 inline std::wstring sgml2str(
211 _In_reads_or_z_opt_(count_src) const T* src, _In_ size_t count_src,
212 _In_ int skip = 0,
213 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
214 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
215 {
216 std::wstring dst;
217 sgml2str(dst, src, count_src, skip, offset, map);
218 return dst;
219 }
220
231 template <class T>
232 inline std::wstring sgml2str(
233 _In_ const std::basic_string<T>& src,
234 _In_ int skip = 0,
235 _In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
236 _Inout_opt_ mapping_vector<size_t>* map = nullptr)
237 {
238 return sgml2str(src.c_str(), src.size(), skip, offset, map);
239 }
240
242 inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
243 {
244 assert(entity && count);
245
246 const wchar_t e2 = entity[0];
247 for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) {
248 size_t m = (i + j) / 2;
249 wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0];
250 if (e1 < e2)
251 i = m + 1;
252 else if (e1 > e2)
253 j = m;
254 else {
255 auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1);
256 if (r < 0)
257 i = m + 1;
258 else if (r > 0)
259 j = m;
260 else {
261 for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--);
262 return sgml_unicode[unicode_sgml[m]].sgml;
263 }
264 }
265 }
266 return nullptr;
267 }
269
278 inline void str2sgml(
279 _Inout_ std::string& dst,
280 _In_reads_or_z_opt_(count_src) const wchar_t* src,
281 _In_ size_t count_src,
282 _In_ size_t what = 0)
283 {
284 assert(src || !count_src);
285
286 const bool
287 do_ascii = (what & sgml_full) == 0,
288 do_quot = (what & sgml_quot) == 0,
289 do_apos = (what & sgml_apos) == 0,
290 do_lt_gt = (what & sgml_lt_gt) == 0,
291 do_bsol = (what & sgml_bsol) == 0,
292 do_dollar = (what & sgml_dollar) == 0,
293 do_percnt = (what & sgml_percnt) == 0,
294 do_commat = (what & sgml_commat) == 0,
295 do_num = (what & sgml_num) == 0,
296 do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
297 do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
298 do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
299
300 count_src = wcsnlen(src, count_src);
301 dst.reserve(dst.size() + count_src);
302 for (size_t i = 0; i < count_src;) {
303 size_t n = glyphlen(src + i, count_src - i);
304 if (n == 1 &&
305 do_ascii && (unsigned int)src[i] < 128 &&
306 src[i] != L'&' &&
307 (do_quot || (src[i] != L'"')) &&
308 (do_apos || (src[i] != L'\'')) &&
309 (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
310 (do_bsol || (src[i] != L'\\')) &&
311 (do_dollar || (src[i] != L'$')) &&
312 (do_percnt || (src[i] != L'%')) &&
313 (do_commat || (src[i] != L'@')) &&
314 (do_num || (src[i] != L'#')) &&
315 (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
316 (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
317 (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
318 {
319 // 7-bit ASCII and no desire to encode it as an SGML entity.
320 dst.append(1, (char)src[i++]);
321 }
322 else {
323 const char* entity = chr2sgml(src + i, n);
324 if (entity) {
325 dst.append(1, '&');
326 dst.append(entity);
327 dst.append(1, ';');
328 i += n;
329 }
330 else if (n == 1) {
331 // Trivial character (1 code unit, 1 glyph), no entity available.
332 if ((unsigned int)src[i] < 128)
333 dst.append(1, (char)src[i++]);
334 else {
335 char tmp[3 + 8 + 1 + 1];
336 snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
337 dst.append(tmp);
338 }
339 }
340 else {
341 // Non-trivial character. Decompose.
342 const size_t end = i + n;
343 while (i < end) {
344 if ((entity = chr2sgml(src + i, 1)) != nullptr) {
345 dst.append(1, '&');
346 dst.append(entity);
347 dst.append(1, ';');
348 i++;
349 }
350 else if ((unsigned int)src[i] < 128)
351 dst.append(1, (char)src[i++]);
352 else {
353 uint32_t unicode;
354#ifdef _WIN32
355 if (i + 1 < end && is_surrogate_pair(src + i)) {
356 unicode = surrogate_pair_to_ucs4(src + i);
357 i += 2;
358 }
359 else
360#endif
361 {
362 unicode = src[i++];
363 }
364 char tmp[3 + 8 + 1 + 1];
365 snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
366 dst.append(tmp);
367 }
368 }
369 }
370 }
371 }
372 }
373
381 inline void str2sgml(
382 _Inout_ std::string& dst,
383 _In_ const std::wstring& src,
384 _In_ size_t what = 0)
385 {
386 str2sgml(dst, src.c_str(), src.size(), what);
387 }
388
398 inline std::string str2sgml(
399 _In_reads_or_z_opt_(count_src) const wchar_t* src,
400 _In_ size_t count_src,
401 _In_ size_t what = 0)
402 {
403 std::string dst;
404 str2sgml(dst, src, count_src, what);
405 return dst;
406 }
407
416 inline std::string str2sgml(
417 _In_ const std::wstring& src,
418 _In_ size_t what = 0)
419 {
420 return str2sgml(src.c_str(), src.size(), what);
421 }
422}