stdex
Additional custom algorithms and algorithms not covered by the C++ Standard Library
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "exception.hpp"
10#include "interval.hpp"
11#include "mapping.hpp"
12#include "parser.hpp"
13#include "progress.hpp"
14#include "sgml.hpp"
15#include "string.hpp"
16#include "system.hpp"
17#include "unicode.hpp"
18#include <exception>
19#include <list>
20#include <map>
21#include <memory>
22#include <stdexcept>
23#include <vector>
24
25#ifdef _WIN32
26#undef small
27#endif
28
29namespace stdex
30{
31 namespace html
32 {
///
/// Appends HTML escaped string
///
/// `&`, `;`, `"`, `'`, `<` and `>` are replaced with SGML entities; every other
/// character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Entity to emit instead of chr; stays null when chr needs no escaping.
        const char* entity = nullptr;
        switch (chr) {
        case '&': entity = "&amp;"; break;
        case ';': entity = "&semi;"; break;
        case '\"': entity = "&quot;"; break;
        case '\'': entity = "&#x27;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        // NOTE(review): where `char` is signed, the promoted value of chr never equals 0x00a0 and this label cannot match - confirm intent.
        case 0x00a0: entity = "&nbsp;"; break;
        }

        if (entity)
            dst += entity;
        else
            dst += chr;
    }
}
59
///
/// Appends HTML escaped string
///
/// `&`, `;`, `"`, `'`, `<`, `>` and the no-break space (U+00A0) are replaced with
/// SGML entities; every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Entity to emit instead of chr; stays null when chr needs no escaping.
        const wchar_t* entity = nullptr;
        switch (chr) {
        case L'&': entity = L"&amp;"; break;
        case L';': entity = L"&semi;"; break;
        case L'\"': entity = L"&quot;"; break;
        case L'\'': entity = L"&#x27;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        case L'\u00a0': entity = L"&nbsp;"; break;
        }

        if (entity)
            dst += entity;
        else
            dst += chr;
    }
}
86
///
/// Appends HTML escaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void escape(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_ const _Elem (&src)[_Size])
{
    const _Elem* const chars = src;
    escape(dst, chars, _Size);
}
100
///
/// Appends HTML escaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void escape(
    _Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
    const _Elem* const chars = src.data();
    const size_t count = src.size();
    escape(dst, chars, count);
}
114
///
/// Appends HTML escaped character
///
/// Only `&`, `<` and `>` are replaced with SGML entities; any other character is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Source character
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(_Inout_ std::basic_string<char, _Traits, _Alloc>& dst, _In_ char chr)
{
    if (chr == '&')
        dst += "&amp;";
    else if (chr == '<')
        dst += "&lt;";
    else if (chr == '>')
        dst += "&gt;";
    else if (chr == 0x00a0) {
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        // NOTE(review): where `char` is signed, the promoted value of chr never equals 0x00a0 - confirm intent.
        dst += "&nbsp;";
    }
    else
        dst += chr;
}
132
///
/// Appends HTML escaped character
///
/// Only `&`, `<`, `>` and the no-break space (U+00A0) are replaced with SGML entities;
/// any other character is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Source character
///
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst, _In_ wchar_t chr)
{
    if (chr == L'&')
        dst += L"&amp;";
    else if (chr == L'<')
        dst += L"&lt;";
    else if (chr == L'>')
        dst += L"&gt;";
    else if (chr == L'\u00a0') {
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        dst += L"&nbsp;";
    }
    else
        dst += chr;
}
150
///
/// Appends HTML escaped string
///
/// Only `&`, `<` and `>` are replaced with SGML entities; every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Entity to emit instead of chr; stays null when chr needs no escaping.
        const char* entity = nullptr;
        switch (chr) {
        case '&': entity = "&amp;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        // NOTE(review): where `char` is signed, the promoted value of chr never equals 0x00a0 - confirm intent.
        case 0x00a0: entity = "&nbsp;"; break;
        }

        if (entity)
            dst += entity;
        else
            dst += chr;
    }
}
174
///
/// Appends HTML escaped string
///
/// Only `&`, `<`, `>` and the no-break space (U+00A0) are replaced with SGML entities;
/// every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Entity to emit instead of chr; stays null when chr needs no escaping.
        const wchar_t* entity = nullptr;
        switch (chr) {
        case L'&': entity = L"&amp;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        case L'\u00a0': entity = L"&nbsp;"; break;
        }

        if (entity)
            dst += entity;
        else
            dst += chr;
    }
}
198
///
/// Appends HTML escaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void escape_min(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_ const _Elem (&src)[_Size])
{
    const _Elem* const chars = src;
    escape_min(dst, chars, _Size);
}
212
///
/// Appends HTML escaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void escape_min(
    _Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
    const _Elem* const chars = src.data();
    const size_t count = src.size();
    escape_min(dst, chars, count);
}
226
///
/// Appends URL unescaped string
///
/// `+` becomes a space and `%XX` (two hexadecimal digits, either case) becomes the
/// byte with that value. A `%` that is not followed by two hexadecimal digits inside
/// the `num_chars` limit is appended literally, together with the first digit when
/// only the second one is missing or invalid.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);

    // Value of a hexadecimal digit, or -1 when chr is not a hexadecimal digit.
    const auto hex_value = [](char chr) -> int {
        if ('0' <= chr && chr <= '9') return chr - '0';
        if ('A' <= chr && chr <= 'F') return chr - 'A' + 10;
        if ('a' <= chr && chr <= 'f') return chr - 'a' + 10;
        return -1;
    };

    for (size_t i = 0; i < num_chars && src[i];) {
        switch (src[i]) {
        case '+':
            dst += ' '; i++;
            break;

        case '%': {
            i++;

            // The limit is rechecked before each digit is read: the escape may be cut
            // short by num_chars, and src is not required to be zero terminated.
            const int hi = i < num_chars ? hex_value(src[i]) : -1;
            if (hi < 0) {
                // Not an escape sequence: keep `%`; the loop handles the next character.
                dst += '%';
                continue;
            }
            i++;

            const int lo = i < num_chars ? hex_value(src[i]) : -1;
            if (lo < 0) {
                // Incomplete escape sequence: keep `%` and the digit already consumed.
                dst += '%'; dst += src[i - 1];
                continue;
            }
            i++;

            dst += static_cast<char>(static_cast<uint8_t>((hi << 4) | lo));
            break;
        }

        default:
            dst += src[i++];
        }
    }
}
268
///
/// Appends URL unescaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<size_t _Size, class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_ const char (&src)[_Size])
{
    const char* const chars = src;
    url_unescape(dst, chars, _Size);
}
282
///
/// Appends URL unescaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Traits_dst = std::char_traits<char>, class _Alloc_dst = std::allocator<char>, class _Traits_src = std::char_traits<char>, class _Alloc_src = std::allocator<char>>
inline void url_unescape(
    _Inout_ std::basic_string<char, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<char, _Traits_src, _Alloc_src>& src)
{
    const char* const chars = src.data();
    const size_t count = src.size();
    url_unescape(dst, chars, count);
}
296
///
/// Appends URL escaped string
///
/// Space is written as `+`. The listed reserved and unsafe characters, control
/// characters, DEL and all bytes above 0x7e are written as `%XX` with upper-case
/// hexadecimal digits. Remaining printable ASCII characters are appended unchanged.
///
/// A literal `+` is written as `%2B`: since space is encoded as `+`, an unescaped
/// `+` would be decoded back as a space.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t i = 0; i < num_chars && src[i]; ++i) {
        switch (src[i]) {
        case ' ': dst += "+"; break;
        case '+': dst += "%2B"; break; // Must not be confused with an encoded space.
        case '<': dst += "%3C"; break;
        case '>': dst += "%3E"; break;
        case '#': dst += "%23"; break;
        case '%': dst += "%25"; break;
        case '{': dst += "%7B"; break;
        case '}': dst += "%7D"; break;
        case '|': dst += "%7C"; break;
        case '\\': dst += "%5C"; break;
        case '^': dst += "%5E"; break;
        case '~': dst += "%7E"; break;
        case '[': dst += "%5B"; break;
        case ']': dst += "%5D"; break;
        case '`': dst += "%60"; break;
        case ';': dst += "%3B"; break;
        case '/': dst += "%2F"; break;
        case '?': dst += "%3F"; break;
        case ':': dst += "%3A"; break;
        case '@': dst += "%40"; break;
        case '=': dst += "%3D"; break;
        case '&': dst += "%26"; break;
        case '$': dst += "%24"; break;
        default: {
            // Compare as unsigned so bytes above 0x7f are not seen as negative values.
            const uint8_t byte = static_cast<uint8_t>(src[i]);
            if (0x20 < byte && byte < 0x7f)
                dst += src[i];
            else {
                static const char hex_digits[] = "0123456789ABCDEF";
                dst += '%';
                dst += hex_digits[byte >> 4];
                dst += hex_digits[byte & 0x0f];
            }
        }
        }
    }
}
347
///
/// Appends URL escaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<size_t _Size, class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_ const char (&src)[_Size])
{
    const char* const chars = src;
    url_escape(dst, chars, _Size);
}
361
///
/// Appends URL escaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Traits_dst = std::char_traits<char>, class _Alloc_dst = std::allocator<char>, class _Traits_src = std::char_traits<char>, class _Alloc_src = std::allocator<char>>
inline void url_escape(
    _Inout_ std::basic_string<char, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<char, _Traits_src, _Alloc_src>& src)
{
    const char* const chars = src.data();
    const size_t count = src.size();
    url_escape(dst, chars, count);
}
375
///
/// Appends CSS unescaped string
///
/// Recognized sequences after a backslash:
/// - `n`, `r`, `t`: line feed, carriage return, tab
/// - a line feed: the backslash and the line feed are both dropped
/// - one to six hexadecimal digits: the character with that code, followed by one
///   optional space that is skipped
/// - any other character: that character itself
///
/// A backslash with nothing to escape (it is the last character inside `num_chars`,
/// or it is followed by the zero terminator) ends the input and is dropped.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_unescape(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const _Elem* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t i = 0; i < num_chars && src[i];) {
        if (src[i] != '\\') {
            dst += src[i++];
            continue;
        }

        // Dangling backslash. Without this check the index would never advance past a
        // trailing backslash (endless loop), and a backslash in front of the zero
        // terminator would append the terminator itself to dst.
        if (i + 1 >= num_chars || !src[i + 1])
            break;
        i++;

        switch (src[i]) {
        // Classic escapes
        case 'n': dst += '\n'; i++; break;
        case 'r': dst += '\r'; i++; break;
        case 't': dst += '\t'; i++; break;

        // `\` at the end of the line
        case '\n': i++; break;

        // `\nnnn` escape
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case 'A': case 'a':
        case 'B': case 'b':
        case 'C': case 'c':
        case 'D': case 'd':
        case 'E': case 'e':
        case 'F': case 'f': {
            wchar_t chr = 0;
            // At most six hexadecimal digits belong to one escape.
            size_t end = std::min(num_chars, i + 6);

            for (; i < end; ++i) {
                if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
                else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
                else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
                else break;
            }

            dst += static_cast<_Elem>(chr);

            // NOTE(review): a space following a full six-digit escape is not skipped
            // because of the `i < end` test - confirm this is intended.
            if (i < end && src[i] == ' ') {
                // Skip space after `\nnnn`.
                i++;
            }
            break;
        }

        default: dst += src[i++];
        }
    }
}
445
///
/// Appends CSS unescaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_unescape(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_ const _Elem (&src)[_Size])
{
    const _Elem* const chars = src;
    css_unescape(dst, chars, _Size);
}
459
///
/// Appends CSS unescaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void css_unescape(
    _Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
    const _Elem* const chars = src.data();
    const size_t count = src.size();
    css_unescape(dst, chars, count);
}
473
///
/// Appends CSS escaped string
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as
/// backslash sequences; every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void css_escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Escape sequence to emit instead of chr; stays null when chr is copied as is.
        const char* sequence = nullptr;
        switch (chr) {
        case '\\': sequence = "\\\\"; break;
        case '\n': sequence = "\\n"; break;
        case '\r': sequence = "\\r"; break;
        case '\t': sequence = "\\t"; break;
        case '\"': sequence = "\\\""; break;
        case '\'': sequence = "\\'"; break;
        }

        if (sequence)
            dst += sequence;
        else
            dst += chr;
    }
}
499
///
/// Appends CSS escaped string
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as
/// backslash sequences; every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Code unit limit in string `src`; reading also stops at the first zero terminator
///
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void css_escape(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t chr = src[pos];
        if (!chr)
            break; // Zero terminator ends the input before the limit is reached.

        // Escape sequence to emit instead of chr; stays null when chr is copied as is.
        const wchar_t* sequence = nullptr;
        switch (chr) {
        case L'\\': sequence = L"\\\\"; break;
        case L'\n': sequence = L"\\n"; break;
        case L'\r': sequence = L"\\r"; break;
        case L'\t': sequence = L"\\t"; break;
        case L'\"': sequence = L"\\\""; break;
        case L'\'': sequence = L"\\'"; break;
        }

        if (sequence)
            dst += sequence;
        else
            dst += chr;
    }
}
525
///
/// Appends CSS escaped string
///
/// Forwards to the pointer overload using the array extent `_Size` as the code unit limit.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source character array
///
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_escape(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_ const _Elem (&src)[_Size])
{
    const _Elem* const chars = src;
    css_escape(dst, chars, _Size);
}
539
///
/// Appends CSS escaped string
///
/// Forwards the contents of `src` (`data()` and `size()`) to the pointer overload.
///
/// \param[in,out] dst  String to append to
/// \param[in]     src  Source string
///
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void css_escape(
    _Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
    _In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
    const _Elem* const chars = src.data();
    const size_t count = src.size();
    css_escape(dst, chars, count);
}
553
///
/// HTML element type
///
/// Named elements are numbered consecutively from `a` (1) to `xmp` in the order
/// listed, so the order must not be changed. Negative values are special codes.
///
/// Elements flagged as Microsoft specific: bgsound, blink, comment, embed, listing,
/// marquee, nextid, nobr, noembed, plaintext, rt, ruby, wbr, xmp.
///
enum class element_t {
    empty = 0,

    // A
    a, abbr, acronym, address, applet, area,
    // B
    b, base, basefont, bdo, bgsound, big, blink, blockquote, body, br, button,
    // C
    caption, center, cite, code, col, colgroup, comment,
    // D
    dd, del, dfn, dir, div, dl, dt,
    // E
    em, embed,
    // F
    fieldset, font, form, frame, frameset,
    // H
    h1, h2, h3, h4, h5, h6, head, hr, html,
    // I
    i, iframe, img, input, ins, isindex,
    // K
    kbd,
    // L
    label, legend, li, link, listing,
    // M
    map, marquee, menu, meta,
    // N
    nextid, nobr, noembed, noframes, noscript,
    // O
    object, ol, optgroup, option,
    // P
    p, param, plaintext, pre,
    // Q
    q,
    // R
    rt, ruby,
    // S
    s, samp, script, select, small, span, strike, strong, style, sub, sup,
    // T
    table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
    // U
    u, ul,
    // V
    var,
    // W
    wbr,
    // X
    xmp,

    // Special codes
    unknown = -1,
    PCDATA = -2,
    CDATA = -3,
};
669
///
/// Element span
///
/// NOTE(review): the meanings below are read from the enumerator names - confirm against the parser.
///
enum class element_span_t {
    needs_end    = 0, ///< Presumably: the end tag is required
    end_optional = 1, ///< Presumably: the end tag may be omitted
    immediate    = 2, ///< Presumably: the element has no content and no end tag
};
678
683 {
689 static inline element_span_t span(_In_ element_t code)
690 {
691 static element_span_t lookup[] = {
692 element_span_t::needs_end, // a
693 element_span_t::needs_end, // abbr
694 element_span_t::needs_end, // acronym
695 element_span_t::needs_end, // address
696 element_span_t::needs_end, // applet
697 element_span_t::immediate, // area
698 element_span_t::needs_end, // b
699 element_span_t::immediate, // base
700 element_span_t::immediate, // basefont
701 element_span_t::needs_end, // bdo
702 element_span_t::immediate, // bgsound
703 element_span_t::needs_end, // big
704 element_span_t::needs_end, // blink
705 element_span_t::needs_end, // blockquote
706 element_span_t::end_optional, // body
707 element_span_t::immediate, // br
708 element_span_t::needs_end, // button
709 element_span_t::needs_end, // caption
710 element_span_t::needs_end, // center
711 element_span_t::needs_end, // cite
712 element_span_t::needs_end, // code
713 element_span_t::immediate, // col
714 element_span_t::end_optional, // colgroup
715 element_span_t::needs_end, // comment
716 element_span_t::end_optional, // dd
717 element_span_t::needs_end, // del
718 element_span_t::needs_end, // dfn
719 element_span_t::needs_end, // dir
720 element_span_t::needs_end, // div
721 element_span_t::needs_end, // dl
722 element_span_t::end_optional, // dt
723 element_span_t::needs_end, // em
724 element_span_t::immediate, // embed
725 element_span_t::needs_end, // fieldset
726 element_span_t::needs_end, // font
727 element_span_t::needs_end, // form
728 element_span_t::immediate, // frame
729 element_span_t::needs_end, // frameset
730 element_span_t::needs_end, // h1
731 element_span_t::needs_end, // h2
732 element_span_t::needs_end, // h3
733 element_span_t::needs_end, // h4
734 element_span_t::needs_end, // h5
735 element_span_t::needs_end, // h6
736 element_span_t::end_optional, // head
737 element_span_t::immediate, // hr
738 element_span_t::end_optional, // html
739 element_span_t::needs_end, // i
740 element_span_t::needs_end, // iframe
741 element_span_t::immediate, // img
742 element_span_t::immediate, // input
743 element_span_t::needs_end, // ins
744 element_span_t::immediate, // isindex
745 element_span_t::needs_end, // kbd
746 element_span_t::needs_end, // label
747 element_span_t::needs_end, // legend
748 element_span_t::end_optional, // li
749 element_span_t::immediate, // link
750 element_span_t::needs_end, // listing
751 element_span_t::needs_end, // map
752 element_span_t::needs_end, // marquee
753 element_span_t::needs_end, // menu
754 element_span_t::immediate, // meta
755 element_span_t::immediate, // nextid
756 element_span_t::needs_end, // nobr
757 element_span_t::needs_end, // noembed
758 element_span_t::needs_end, // noframes
759 element_span_t::needs_end, // noscript
760 element_span_t::needs_end, // object
761 element_span_t::needs_end, // ol
762 element_span_t::needs_end, // optgroup
763 element_span_t::end_optional, // option
764 element_span_t::end_optional, // p
765 element_span_t::immediate, // param
766 element_span_t::end_optional, // plaintext
767 element_span_t::needs_end, // pre
768 element_span_t::needs_end, // q
769 element_span_t::immediate, // rt
770 element_span_t::needs_end, // ruby
771 element_span_t::needs_end, // s
772 element_span_t::needs_end, // samp
773 element_span_t::needs_end, // script
774 element_span_t::needs_end, // select
775 element_span_t::needs_end, // small
776 element_span_t::needs_end, // span
777 element_span_t::needs_end, // strike
778 element_span_t::needs_end, // strong
779 element_span_t::needs_end, // style
780 element_span_t::needs_end, // sub
781 element_span_t::needs_end, // sup
782 element_span_t::needs_end, // table
783 element_span_t::end_optional, // tbody
784 element_span_t::end_optional, // td
785 element_span_t::needs_end, // textarea
786 element_span_t::end_optional, // tfoot
787 element_span_t::end_optional, // th
788 element_span_t::end_optional, // thead
789 element_span_t::needs_end, // title
790 element_span_t::end_optional, // tr
791 element_span_t::needs_end, // tt
792 element_span_t::needs_end, // u
793 element_span_t::needs_end, // ul
794 element_span_t::needs_end, // var
795 element_span_t::immediate, // wbr
796 element_span_t::needs_end, // xmp
797 };
798 return element_t::a <= code && code <= element_t::xmp ?
799 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
800 element_span_t::needs_end;
801 }
802
808 static inline bool is_fontstyle(_In_ element_t code)
809 {
810 switch (code) {
811 case element_t::tt:
812 case element_t::i:
813 case element_t::b:
814 case element_t::u:
815 case element_t::s:
816 case element_t::strike:
817 case element_t::blink:
818 case element_t::big:
819 case element_t::small:
820 return true;
821 };
822 return false;
823 }
824
830 static inline bool is_phrase(_In_ element_t code)
831 {
832 switch (code) {
833 case element_t::em:
834 case element_t::strong:
835 case element_t::dfn:
836 case element_t::code:
837 case element_t::samp:
838 case element_t::kbd:
839 case element_t::var:
840 case element_t::cite:
841 case element_t::abbr:
842 case element_t::acronym:
843 case element_t::xmp:
844 return true;
845 };
846 return false;
847 }
848
854 static inline bool is_special(_In_ element_t code)
855 {
856 switch (code) {
857 case element_t::a:
858 case element_t::img:
859 case element_t::applet:
860 case element_t::object:
861 case element_t::embed:
862 case element_t::font:
863 case element_t::basefont:
864 case element_t::br:
865 case element_t::wbr:
866 case element_t::rt:
867 case element_t::script:
868 case element_t::map:
869 case element_t::q:
870 case element_t::sub:
871 case element_t::sup:
872 case element_t::ruby:
873 case element_t::span:
874 case element_t::bdo:
875 case element_t::iframe:
876 case element_t::nobr:
877 return true;
878 };
879 return false;
880 }
881
887 static inline bool is_formctrl(_In_ element_t code)
888 {
889 switch (code) {
890 case element_t::input:
891 case element_t::select:
892 case element_t::textarea:
893 case element_t::label:
894 case element_t::button:
895 return true;
896 };
897 return false;
898 }
899
905 static inline bool is_inline(_In_ element_t code)
906 {
907 return
908 code == element_t::PCDATA ||
909 is_fontstyle(code) ||
910 is_phrase(code) ||
911 is_special(code) ||
912 is_formctrl(code);
913 }
914
920 static inline bool is_heading(_In_ element_t code)
921 {
922 switch (code) {
923 case element_t::h1:
924 case element_t::h2:
925 case element_t::h3:
926 case element_t::h4:
927 case element_t::h5:
928 case element_t::h6:
929 return true;
930 };
931 return false;
932 }
933
939 static inline bool is_list(_In_ element_t code)
940 {
941 switch (code) {
942 case element_t::ul:
943 case element_t::ol:
944 case element_t::dir:
945 case element_t::menu:
946 return true;
947 };
948 return false;
949 }
950
956 static inline bool is_preformatted(_In_ element_t code)
957 {
958 switch (code) {
959 case element_t::pre:
960 case element_t::listing:
961 return true;
962 }
963 return false;
964 }
965
971 static inline bool is_block(_In_ element_t code)
972 {
973 if (is_heading(code) ||
974 is_list(code) ||
975 is_preformatted(code)) return true;
976 switch (code) {
977 case element_t::p:
978 case element_t::dl:
979 case element_t::div:
980 case element_t::center:
981 case element_t::marquee:
982 case element_t::noscript:
983 case element_t::noframes:
984 case element_t::noembed:
985 case element_t::blockquote:
986 case element_t::form:
987 case element_t::isindex:
988 case element_t::hr:
989 case element_t::table:
990 case element_t::fieldset:
991 case element_t::address:
992 return true;
993 };
994 return false;
995 }
996
1002 static inline bool is_flow(_In_ element_t code)
1003 {
1004 return is_block(code) || is_inline(code);
1005 }
1006
1012 static inline bool is_head_content(_In_ element_t code)
1013 {
1014 switch (code) {
1015 case element_t::title:
1016 case element_t::isindex:
1017 case element_t::base:
1018 case element_t::nextid:
1019 return true;
1020 };
1021 return false;
1022 }
1023
1029 static inline bool is_head_misc(_In_ element_t code)
1030 {
1031 switch (code) {
1032 case element_t::script:
1033 case element_t::style:
1034 case element_t::meta:
1035 case element_t::link:
1036 case element_t::object:
1037 return true;
1038 };
1039 return false;
1040 }
1041
1047 static inline bool is_pre_exclusion(_In_ element_t code)
1048 {
1049 switch (code) {
1050 case element_t::img:
1051 case element_t::object:
1052 case element_t::applet:
1053 case element_t::embed:
1054 case element_t::big:
1055 case element_t::small:
1056 case element_t::sub:
1057 case element_t::sup:
1058 case element_t::ruby:
1059 case element_t::font:
1060 case element_t::basefont:
1061 case element_t::nobr:
1062 return true;
1063 };
1064 return false;
1065 }
1066
1072 static inline bool is_html_content(_In_ element_t code)
1073 {
1074 switch (code) {
1075 case element_t::head:
1076 case element_t::body:
1077 case element_t::frameset:
1078 return true;
1079 };
1080 return false;
1081 }
1082
1088 static inline bool is_group(_In_ element_t code)
1089 {
1090 if (is_block(code) ||
1091 is_html_content(code) ||
1092 is_head_content(code)) return true;
1093 switch (code) {
1094 case element_t::col:
1095 case element_t::colgroup:
1096 case element_t::dd:
1097 case element_t::dir:
1098 case element_t::dt:
1099 case element_t::frame:
1100 case element_t::iframe:
1101 case element_t::legend:
1102 case element_t::td:
1103 case element_t::th:
1104 case element_t::tr:
1105 return true;
1106 };
1107 return false;
1108 }
1109
1118 static inline bool may_contain(_In_ element_t parent, _In_ element_t child)
1119 {
1120 if (child == element_t::unknown || child == element_t::comment)
1121 return true;
1122 if (is_fontstyle(parent) || is_phrase(parent))
1123 return is_inline(child);
1124 if (is_heading(parent))
1125 return is_inline(child);
1126
1127 switch (parent) {
1128 case element_t::a: return is_inline(child) && child != element_t::a;
1129 case element_t::address: return is_inline(child) || child == element_t::p;
1130 case element_t::applet: return is_flow(child) || child == element_t::param;
1131 case element_t::area: return false;
1132 case element_t::base: return false;
1133 case element_t::basefont: return false;
1134 case element_t::bdo: return is_inline(child);
1135 case element_t::blockquote: return is_flow(child);
1136 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
1137 case element_t::br: return false;
1138 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1139 case element_t::caption: return is_inline(child);
1140 case element_t::center: return is_flow(child);
1141 case element_t::col: return false;
1142 case element_t::colgroup: return child == element_t::col;
1143 case element_t::comment: return child == element_t::CDATA;
1144 case element_t::dd: return is_flow(child);
1145 case element_t::del: return is_flow(child);
1146 case element_t::dir: return child == element_t::li;
1147 case element_t::div: return is_flow(child);
1148 case element_t::dl: return child == element_t::dt || child == element_t::dd;
1149 case element_t::dt: return is_inline(child);
1150 case element_t::embed: return is_flow(child) || child == element_t::param;
1151 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1152 case element_t::font: return is_inline(child);
1153 case element_t::form: return is_flow(child) && child != element_t::form;
1154 case element_t::frame: return false;
1155 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1156 case element_t::head: return is_head_content(child) || is_head_misc(child);
1157 case element_t::hr: return false;
1158 case element_t::html: return is_html_content(child);
1159 case element_t::iframe: return is_flow(child);
1160 case element_t::img: return false;
1161 case element_t::input: return false;
1162 case element_t::ins: return is_flow(child);
1163 case element_t::isindex: return false;
1164 case element_t::label: return is_inline(child) && child != element_t::label;
1165 case element_t::legend: return is_inline(child);
1166 case element_t::li: return is_flow(child);
1167 case element_t::link: return false;
1168 case element_t::listing: return child == element_t::CDATA;
1169 case element_t::map: return is_block(child) || child == element_t::area;
1170 case element_t::marquee: return is_flow(child);
1171 case element_t::menu: return child == element_t::li;
1172 case element_t::meta: return false;
1173 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1174 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1175 case element_t::noscript: return is_flow(child);
1176 case element_t::noembed: return is_flow(child);
1177 case element_t::object: return is_flow(child) || child == element_t::param;
1178 case element_t::ol: return child == element_t::li;
1179 case element_t::optgroup: return child == element_t::option;
1180 case element_t::option: return child == element_t::PCDATA;
1181 case element_t::p: return is_inline(child);
1182 case element_t::param: return false;
1183 case element_t::plaintext: return is_flow(child);
1184 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1185 case element_t::q: return is_inline(child);
1186 case element_t::rt: return false;
1187 case element_t::ruby: return is_inline(child);
1188 case element_t::script: return child == element_t::CDATA;
1189 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1190 case element_t::span: return is_inline(child);
1191 case element_t::style: return child == element_t::CDATA;
1192 case element_t::sub: return is_inline(child);
1193 case element_t::sup: return is_inline(child);
1194 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1195 case element_t::tbody: return child == element_t::tr;
1196 case element_t::td: return is_flow(child);
1197 case element_t::textarea: return child == element_t::PCDATA;
1198 case element_t::tfoot: return child == element_t::tr;
1199 case element_t::th: return is_flow(child);
1200 case element_t::thead: return child == element_t::tr;
1201 case element_t::title: return child == element_t::PCDATA;
1202 case element_t::tr: return child == element_t::td || child == element_t::th;
1203 case element_t::ul: return child == element_t::li;
1204 case element_t::wbr: return false;
1205 case element_t::unknown: return true;
1206 }
1207 return false;
1208 }
1209
1217 template <class T>
1218 static inline bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1219 {
1220 _Assume_(attr_name || !num_chars);
1221 switch (code) {
1222 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1223 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1224 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1225 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1226 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1227 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1228 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1229 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1230 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1231 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX);
1232 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1233 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) ||
1234 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1235 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX);
1236 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1237 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1238 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX);
1239 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1240 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1241 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1242 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1243 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1244 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1245 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1246 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1247 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1248 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1249 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1250 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) ||
1251 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) ||
1252 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1253 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1254 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1256 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1257 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1258 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1259 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1260 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1261 }
1262 return false;
1263 }
1264
1272 template <class T>
1273 static inline bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1274 {
1275 _Assume_(attr_name || !num_chars);
1276 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX))
1277 return true;
1278 switch (code) {
1279 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1280 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1281 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1282 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1283 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1284 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX);
1285 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1286 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1287 }
1288 return false;
1289 }
1290 };
1291
1292 class sequence;
1293 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1294
1299 {
1300 public:
1301 stdex::parser::html_sequence_t type;
1304
// Constructs a sequence of the given type, spanning the interval (start, end) and attached to `_parent`.
// All parameters are optional: the defaults produce an unknown-type sequence with a zero interval and no parent.
1305 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1306 type(_type),
1307 interval(start, end),
1308 parent(_parent)
1309 {}
1310
1311 virtual ~sequence() {} // make polymorphic
1312 };
1313
1317 class element : public sequence
1318 {
1319 public:
// Constructs an element from a parsed tag.
// `src` is the text the tag's intervals index into: the element code is looked up from the tag name
// found at src + tag.name.start. The tag's name interval and attribute list are then moved into the element.
1320 template <class T>
1321 inline element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1322 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
// The code is resolved before tag.name is moved from on the next line.
1323 code(element_code(src + tag.name.start, tag.name.size())),
1324 name(std::move(tag.name)),
1325 attributes(std::move(tag.attributes))
1326 {}
1327
1328 template <class T>
1329 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1330 {
1331 static const struct {
1332 const char* name;
1333 element_t code;
1334 } mapping[] = {
1335 { "a", element_t::a, },
1336 { "abbr", element_t::abbr, },
1337 { "acronym", element_t::acronym, },
1338 { "address", element_t::address, },
1339 { "applet", element_t::applet, },
1340 { "area", element_t::area, },
1341 { "b", element_t::b, },
1342 { "base", element_t::base, },
1343 { "basefont", element_t::basefont, },
1344 { "bdo", element_t::bdo, },
1345 { "bgsound", element_t::bgsound, },
1346 { "big", element_t::big, },
1347 { "blink", element_t::blink, },
1348 { "blockquote", element_t::blockquote, },
1349 { "body", element_t::body, },
1350 { "br", element_t::br, },
1351 { "button", element_t::button, },
1352 { "caption", element_t::caption, },
1353 { "center", element_t::center, },
1354 { "cite", element_t::cite, },
1355 { "code", element_t::code, },
1356 { "col", element_t::col, },
1357 { "colgroup", element_t::colgroup, },
1358 { "comment", element_t::comment, },
1359 { "dd", element_t::dd, },
1360 { "del", element_t::del, },
1361 { "dfn", element_t::dfn, },
1362 { "dir", element_t::dir, },
1363 { "div", element_t::div, },
1364 { "dl", element_t::dl, },
1365 { "dt", element_t::dt, },
1366 { "em", element_t::em, },
1367 { "embed", element_t::embed, },
1368 { "fieldset", element_t::fieldset, },
1369 { "font", element_t::font, },
1370 { "form", element_t::form, },
1371 { "frame", element_t::frame, },
1372 { "frameset", element_t::frameset, },
1373 { "h1", element_t::h1, },
1374 { "h2", element_t::h2, },
1375 { "h3", element_t::h3, },
1376 { "h4", element_t::h4, },
1377 { "h5", element_t::h5, },
1378 { "h6", element_t::h6, },
1379 { "head", element_t::head, },
1380 { "hr", element_t::hr, },
1381 { "html", element_t::html, },
1382 { "i", element_t::i, },
1383 { "iframe", element_t::iframe, },
1384 { "img", element_t::img, },
1385 { "input", element_t::input, },
1386 { "ins", element_t::ins, },
1387 { "isindex", element_t::isindex, },
1388 { "kbd", element_t::kbd, },
1389 { "label", element_t::label, },
1390 { "legend", element_t::legend, },
1391 { "li", element_t::li, },
1392 { "link", element_t::link, },
1393 { "listing", element_t::listing, },
1394 { "map", element_t::map, },
1395 { "marquee", element_t::marquee, },
1396 { "menu", element_t::menu, },
1397 { "meta", element_t::meta, },
1398 { "nextid", element_t::nextid, },
1399 { "nobr", element_t::nobr, },
1400 { "noembed", element_t::noembed, },
1401 { "noframes", element_t::noframes, },
1402 { "noscript", element_t::noscript, },
1403 { "object", element_t::object, },
1404 { "ol", element_t::ol, },
1405 { "optgroup", element_t::optgroup, },
1406 { "option", element_t::option, },
1407 { "p", element_t::p, },
1408 { "param", element_t::param, },
1409 { "plaintext", element_t::plaintext, },
1410 { "pre", element_t::pre, },
1411 { "q", element_t::q, },
1412 { "rt", element_t::rt, },
1413 { "ruby", element_t::ruby, },
1414 { "s", element_t::s, },
1415 { "samp", element_t::samp, },
1416 { "script", element_t::script, },
1417 { "select", element_t::select, },
1418 { "small", element_t::small, },
1419 { "span", element_t::span, },
1420 { "strike", element_t::strike, },
1421 { "strong", element_t::strong, },
1422 { "style", element_t::style, },
1423 { "sub", element_t::sub, },
1424 { "sup", element_t::sup, },
1425 { "table", element_t::table, },
1426 { "tbody", element_t::tbody, },
1427 { "td", element_t::td, },
1428 { "textarea", element_t::textarea, },
1429 { "tfoot", element_t::tfoot, },
1430 { "th", element_t::th, },
1431 { "thead", element_t::thead, },
1432 { "title", element_t::title, },
1433 { "tr", element_t::tr, },
1434 { "tt", element_t::tt, },
1435 { "u", element_t::u, },
1436 { "ul", element_t::ul, },
1437 { "var", element_t::var, },
1438 { "wbr", element_t::wbr, },
1439 { "xmp", element_t::xmp, },
1440 };
1441#ifdef _DEBUG
1442 // The mapping table MUST be sorted and all names in lowercase.
1443 for (size_t i = 1; i < _countof(mapping); i++)
1444 _Assume_(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1445 for (size_t i = 0; i < _countof(mapping); i++) {
1446 for (size_t j = 0; mapping[i].name[j]; j++)
1447 _Assume_(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1448 }
1449#endif
1450 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1451 size_t m = (i + j) / 2;
1452 int r = 0;
1453 for (size_t i1 = 0, i2 = 0;;) {
1454 if (!mapping[m].name[i1]) {
1455 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1456 break;
1457 }
1458 if (i2 >= num_chars || !name[i2]) {
1459 r = 1;
1460 break;
1461 }
1462
1463 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1464 if (mapping[m].name[i1] > chr) {
1465 r = 1;
1466 break;
1467 }
1468 if (mapping[m].name[i1] < chr) {
1469 r = -1;
1470 break;
1471 }
1472 i1++;
1473 }
1474
1475 if (r < 0)
1476 i = m + 1;
1477 else if (r > 0)
1478 j = m;
1479 else
1480 return mapping[m].code;
1481 }
1482 return element_t::unknown;
1483 }
1484
1485 public:
1486 element_t code;
1488 std::vector<stdex::parser::html_attribute> attributes;
1489 };
1490
1491 class element_end;
1492
1496 class element_start : public element
1497 {
1498 public:
// Constructs a starting tag from a parsed tag; see the element constructor for `tag`, `src` and `parent`.
// `_end` initializes the `end` member; it defaults to nullptr here and document::append assigns it later.
1499 template <class T>
1500 inline element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1501 element(std::move(tag), src, parent),
1502 end(_end)
1503 {}
1504
1505 public:
1507 };
1508
1512 class element_end : public sequence
1513 {
1514 public:
// Constructs an ending tag from a parsed tag.
// `src` is the text the tag's intervals index into: the element code is looked up from the tag name
// found at src + tag.name.start. `_start` initializes the `start` member (the matching starting tag, if known).
1515 template <class T>
1516 inline element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1517 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
// The code is resolved before tag.name is moved from on the next line.
1518 code(element::element_code(src + tag.name.start, tag.name.size())),
1519 name(std::move(tag.name)),
1520 start(_start)
1521 {}
1522
1523 public:
1524 element_t code;
1527 };
1528
1532 class declaration : public sequence
1533 {
1534 public:
// Constructs a declaration from a parsed tag: type and interval are copied, name and attributes are moved from `tag`.
1535 template <class T>
1536 inline declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1537 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1538 name(std::move(tag.name)),
1539 attributes(std::move(tag.attributes))
1540 {}
1541
1542 public:
1544 std::vector<stdex::parser::html_attribute> attributes;
1545 };
1546
1550 class comment : public sequence
1551 {
1552 public:
// Constructs a comment from a parsed tag: type and interval are copied; the tag's `name` becomes the comment `content`.
1553 template <class T>
1554 inline comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1555 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1556 content(std::move(tag.name))
1557 {}
1558
1559 public:
1561 };
1562
1566 class instruction : public sequence
1567 {
1568 public:
// Constructs an instruction from a parsed tag: type and interval are copied; the tag's `name` becomes the instruction `content`.
1569 template <class T>
1570 inline instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1571 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1572 content(std::move(tag.name))
1573 {}
1574
1575 public:
1577 };
1578
1582 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1583 struct entity
1584 {
1586 std::basic_string<_Elem, _Traits, _Alloc> value;
1587 };
1588
1592 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1593 class parser;
1594
1598 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1600 {
1601 public:
1602 document() :
1603 m_num_parsed(0),
1604 m_charset(stdex::charset_id::system),
1605
1606 // Declaration parsing data
1609 m_is_cdata(false),
1610 m_is_rcdata(false),
1611
1612 // Element parsing data
1614 {}
1615
// Resets the document to its empty state: discards the accumulated source text, the parsing position,
// the detected charset (back to charset_id::system), declared entities, parsed sequences and the stack of open elements.
1619 void clear()
1620 {
1621 m_source.clear();
1622 m_num_parsed = 0;
1623 m_charset = stdex::charset_id::system;
1624
1625 // Declaration parsing data
1627 m_is_cdata = m_is_rcdata = false;
1628 m_entities.clear();
1629
1630 // Element parsing data
1631 m_sequences.clear();
1632
1633 m_element_stack.clear();
1634 m_is_special_element = false;
1635 }
1636
1640 void append(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars)
1641 {
1642 _Assume_(source || !num_chars);
1643 m_source.append(source, stdex::strnlen(source, num_chars));
1644 source = m_source.data();
1645 num_chars = m_source.size();
1646
1647 for (size_t i = m_num_parsed; i < num_chars;) {
1648 if (m_is_cdata || m_is_rcdata) {
1649 if (m_condition_end.match(source, i, num_chars)) {
1650 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1651 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1652 m_num_parsed, i,
1653 active_element()))));
1654 m_is_cdata = m_is_rcdata = false;
1655 i = m_num_parsed = m_condition_end.interval.end;
1656 continue;
1657 }
1658 goto next_char;
1659 }
1660
1662 if (m_condition_end.match(source, i, num_chars)) {
1664 i = m_num_parsed = m_condition_end.interval.end;
1665 continue;
1666 }
1667 goto next_char;
1668 }
1669
1670 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1671 if (m_num_parsed < i)
1672 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1673
1675 i = m_num_parsed = m_condition_end.interval.end;
1676 continue;
1677 }
1678
1679 if (m_condition_start.match(source, i, num_chars)) {
1680 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1681 if (!stdex::strcmp(condition_src.c_str(), "CDATA"))
1682 m_is_cdata = true;
1683 else if (!stdex::strcmp(condition_src.c_str(), "RCDATA"))
1684 m_is_rcdata = true;
1687 else if (!stdex::strcmp(condition_src.c_str(), "IGNORE"))
1689 else
1691
1692 i = m_num_parsed = m_condition_start.interval.end;
1693 continue;
1694 }
1695
1697 auto parent = active_element();
1698 _Assume_(parent);
1699 if (m_tag.match(source, i, num_chars) &&
1700 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1701 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1702 {
1703 if (m_num_parsed < i)
1704 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1705 i = m_num_parsed = m_tag.interval.end;
1706 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1707 parent->end = e.get();
1708 m_sequences.push_back(std::move(e));
1709 m_element_stack.pop_back();
1710 m_is_special_element = false;
1711 continue;
1712 }
1713 goto next_char;
1714 }
1715
1716 if (m_tag.match(source, i, num_chars)) {
1717 if (m_num_parsed < i)
1718 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1719 i = m_num_parsed = m_tag.interval.end;
1720
1721 switch (m_tag.type) {
1722 case stdex::parser::html_sequence_t::element:
1723 case stdex::parser::html_sequence_t::element_start: {
1724 std::unique_ptr<element> e(
1725 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1726 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1727 nullptr);
1728
1729 // Does this tag end any of the started elements?
1730 for (size_t j = m_element_stack.size(); j--; ) {
1731 auto starting_tag = m_element_stack[j];
1732 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1733 if (element_traits::may_contain(starting_tag->code, e->code)) {
1734 e->parent = starting_tag;
1735 break;
1736 }
1737 e->parent = starting_tag->parent;
1738 starting_tag->end = e.get();
1739 m_element_stack.resize(j);
1740 }
1741
1742 if (e->type == stdex::parser::html_sequence_t::element_start) {
1743 auto e_start = static_cast<element_start*>(e.get());
1744 if (element_traits::span(e->code) == element_span_t::immediate)
1745 e_start->end = e.get();
1746 else {
1747 m_element_stack.push_back(e_start);
1748 switch (e->code) {
1749 case element_t::code:
1750 case element_t::comment:
1751 case element_t::script:
1752 case element_t::style:
1753 m_is_special_element = true;
1754 break;
1755 }
1756 }
1757 }
1758
1759 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1760 bool is_content_type = false;
1761 stdex::parser::html_attribute* content_attr = nullptr;
1762 for (auto& attr : e->attributes) {
1763 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) &&
1764 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX))
1765 is_content_type = true;
1766 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX))
1767 content_attr = &attr;
1768 }
1769 if (is_content_type && content_attr) {
1770 // <meta http-equiv="Content-Type" content="..."> found.
1772 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1773 content.charset)
1774 {
1775 std::string str;
1776 str.reserve(content.charset.size());
1777 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1778 str.push_back(static_cast<char>(source[j]));
1779 m_charset = stdex::charset_from_name(str.c_str());
1780 }
1781 }
1782 }
1783
1784 m_sequences.push_back(std::move(e));
1785 break;
1786 }
1787 case stdex::parser::html_sequence_t::element_end: {
1788 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1789
1790 for (size_t j = m_element_stack.size(); j--; ) {
1791 auto starting_tag = m_element_stack[j];
1792 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1793 if (starting_tag->code == e->code ||
1794 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size()))
1795 {
1796 e->start = starting_tag;
1797 e->parent = starting_tag->parent;
1798 starting_tag->end = e.get();
1799 m_element_stack.resize(j);
1800 break;
1801 }
1802 }
1803
1804 m_sequences.push_back(std::move(e));
1805 break;
1806 }
1807 case stdex::parser::html_sequence_t::declaration:
1808 if (m_tag.attributes.size() > 3 &&
1809 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX))
1810 {
1811 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1812 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1813 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1814 {
1815 std::unique_ptr<entity<_Elem, _Traits, _Alloc>> e(new entity<_Elem, _Traits, _Alloc>());
1816 e->name = m_tag.attributes[2].name;
1817 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1818 m_entities.push_back(std::move(e));
1819 }
1820
1821 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1822 }
1823 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1824 break;
1825 case stdex::parser::html_sequence_t::comment:
1826 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1827 break;
1828 case stdex::parser::html_sequence_t::instruction:
1829 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1830 break;
1831 default:
1832 throw std::invalid_argument("unknown tag type");
1833 }
1834
1835 continue;
1836 }
1837
1838 next_char:
1839 if (m_any_char.match(source, i, num_chars)) {
1840 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1841 i = m_any_char.interval.end;
1842 }
1843 else
1844 break;
1845 }
1846 }
1847
1852 {
1853 size_t i = m_source.size();
1854 if (m_num_parsed < i)
1855 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1856 m_num_parsed = i;
1857 m_element_stack.clear();
1858 }
1859
// Replaces the document content in one step: clears the document, appends and parses `source`, then finalizes parsing.
1863 inline void assign(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars)
1864 {
1865 clear();
1866 append(source, num_chars);
1867 finalize();
1868 }
1869
1873 inline const std::basic_string<_Elem, _Traits, _Alloc>& source() const { return m_source; }
1874
1875 friend class parser<_Elem, _Traits, _Alloc>;
1876
1877 protected:
1882 {
1883 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1884 }
1885
1889 std::basic_string<_Elem, _Traits, _Alloc> replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem* input, _In_ size_t num_chars) const
1890 {
1891 _Assume_(input || !num_chars);
1892 const size_t num_entities = m_entities.size();
1893 const _Elem* source = m_source.data();
1894 std::basic_string<_Elem, _Traits, _Alloc> output;
1895 for (size_t i = 0; i < num_chars && input[i];) {
1896 if (input[i] == '%') {
1897 for (size_t j = 0; j < num_entities; j++) {
1898 auto& e = m_entities[j];
1899 size_t entity_size = e->name.size();
1900 if (i + entity_size + 1 < num_chars &&
1901 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1902 input[i + entity_size + 1] == ';')
1903 {
1904 output += e->value;
1905 i += entity_size + 2;
1906 goto next_char;
1907 }
1908 }
1909 throw std::runtime_error("undefined entity");
1910 }
1911 output += input[i++];
1912 next_char:;
1913 }
1914 return output;
1915 }
1916
1917 protected:
1918 std::basic_string<_Elem, _Traits, _Alloc> m_source;
1920 stdex::charset_id m_charset;
1921
1922 // Declaration parsing data
1930 std::vector<std::unique_ptr<entity<_Elem, _Traits, _Alloc>>> m_entities;
1931
1932 // Element parsing data
1934 sequence_store m_sequences;
1935 std::vector<element_start*> m_element_stack;
1937 };
1938
// Token kinds. parser::link() handles root, complete, starting and ending tokens as text tokens
// and url tokens as URL tokens; any other value is rejected there.
1942 enum class token_t {
1943 root = 0, // Default type of the token constructor
1944 complete, // Default type of the text_token constructor
1945 starting, // Type set by the starting_token constructor
1946 ending, // Type of the closing-tag tokens created by parser::end_tokens()
1947 url, // Type set by the url_token constructor
1948 };
1949
/// Maximum size, in characters, of a token tag as written by token::append_tag(), zero terminator included.
constexpr size_t token_tag_max =
    1                     // Leading delimiter (token_tag_start)
    + 2 * sizeof(void*)   // Token memory address in hexadecimal: two digits per byte
    + 1                   // Trailing delimiter (token_tag_end)
    + 1;                  // Zero terminator

/// Control character that opens a token tag in text.
constexpr char token_tag_start = 0x12;

/// Control character that closes a token tag in text.
constexpr char token_tag_end = 0x13;
1969
1973 class token
1974 {
1975 protected:
// Constructs a token of the given type, optionally bound to a document sequence and carrying caller-defined `_data`.
// Protected: within this file, tokens are created by the befriended parser and by derived token classes.
1976 inline token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1977 type(_type),
1978 sequence(_sequence),
1979 data(_data)
1980 {}
1981
1982 template<class _Elem, class _Traits, class _Alloc>
1983 friend class parser;
1984
1985 public:
1986 virtual ~token() {} // make polymorphic
1987
1995 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
1996 inline size_t append_tag(_Inout_ std::basic_string<char, _Traits, _Alloc>& str) const
1997 {
1998 size_t n = str.size();
1999 // Use %X instead of %p to ommit leading zeros and save space.
2000 stdex::appendf(str, "%c%zX%c", stdex::locale_C.get(), token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
2001 return str.size() - n;
2002 }
2003
// Appends this token's tag to a wide string: token_tag_start, the token's memory address in uppercase
// hexadecimal and token_tag_end. Returns whatever stdex::appendf returns for the append.
2011 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
2012 inline size_t append_tag(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& str) const
2013 {
2014 // Use %X instead of %p to omit leading zeros and save space.
2015 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C.get(), static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
2016 }
2017
2018 template<class T>
2019 static inline token* parse_tag(const T* str, size_t& offset)
2020 {
2021 if (str[offset] != static_cast<T>(token_tag_start))
2022 return nullptr;
2023
2024 // Locate tag end.
2025 size_t end;
2026 for (end = offset + 1; ; end++) {
2027 if (!str[end])
2028 return nullptr;
2029 if (str[end] == token_tag_end)
2030 break;
2031 }
2032
2033 // Parse hexadecimal token memory address.
2034 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
2035 if (!t)
2036 throw std::invalid_argument("null token");
2037 offset = end + 1;
2038 return t;
2039 }
2040
2041 public:
2042 token_t type;
2044 uintptr_t data;
2045 };
2046
2047 using token_vector = std::vector<std::unique_ptr<token>>;
2048 using token_list = std::list<token*>;
2049
// Bit flags combined into text_token::text_type.
2053 enum text_type_flag_t : uint32_t {
2054 has_tokens = 1 << 0, // Text contains token tags; parser::link() parses and expands them
2055 has_text = 1 << 1, // Text contains content that parser::link() escapes on output
2056 is_title = 1 << 2, // NOTE(review): presumably marks title text; not used in the visible code, confirm
2057 is_bullet = 1 << 3, // NOTE(review): presumably marks list-bullet text; not used in the visible code, confirm
2058 };
2059
2063 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
2064 class text_token : public token
2065 {
2066 protected:
2067 inline text_token(
2068 _In_ token_t type = token_t::complete,
2069 _In_reads_or_z_opt_(num_chars) const _Elem* _text = nullptr, _In_ size_t num_chars = 0,
2070 _In_ uint32_t _text_type = 0,
2071 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2073 text(_text, num_chars),
2074 text_type(_text_type)
2075 {}
2076
2077 friend class parser<_Elem, _Traits, _Alloc>;
2078
2079 public:
2080 std::basic_string<_Elem, _Traits, _Alloc> text;
2081 uint32_t text_type;
2082 stdex::mapping_vector<size_t> mapping;
2083 };
2084
2088 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
2089 class starting_token : public text_token<_Elem, _Traits, _Alloc>
2090 {
2091 protected:
// Constructs a starting token (type token_t::starting).
// `_text`/`num_chars_text` give the token text, `_name`/`num_chars_name` the name that
// parser::end_tokens() later uses to build the matching "</name>" tag. `_end_sequence` initializes `end_sequence`.
2092 inline starting_token(
2093 _In_reads_or_z_opt_(num_chars_text) const _Elem* _text = nullptr, _In_ size_t num_chars_text = 0,
2094 _In_reads_or_z_opt_(num_chars_name) const _Elem* _name = nullptr, _In_ size_t num_chars_name = 0,
2095 _In_ uint32_t text_type = 0,
2096 _In_opt_ stdex::html::sequence* sequence = nullptr,
2097 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
2098 _In_ uintptr_t data = 0) :
2099 text_token(token_t::starting, _text, num_chars_text, text_type, sequence, data),
2100 name(_name, num_chars_name),
2101 end_sequence(_end_sequence)
2102 {}
2103
2104 friend class parser<_Elem, _Traits, _Alloc>;
2105
2106 public:
2107 std::basic_string<_Elem, _Traits, _Alloc> name;
2109 };
2110
// Encoding of the URL held by a url_token; parser::link() uses it to choose how the URL is written back to the source.
2114 enum class token_url_t {
2115 plain = 0, // URL is not using any particular encoding scheme (as-is)
2116 sgml, // URL is encoded using SGML entities
2117 css, // URL is encoded using CSS escaping scheme
2118 };
2119
2123 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
2124 class url_token : public token
2125 {
2126 protected:
2127 inline url_token(
2128 _In_reads_or_z_opt_(num_chars) const _Elem* _url = nullptr, _In_ size_t num_chars = 0,
2129 token_url_t _encoding = token_url_t::plain,
2130 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2131 token(token_t::url, sequence, data),
2132 url(_url, num_chars),
2133 encoding(_encoding)
2134 {}
2135
2136 friend class parser<_Elem, _Traits, _Alloc>;
2137
2138 public:
2139 std::basic_string<_Elem, _Traits, _Alloc> url;
2140 token_url_t encoding;
2141 };
2142
2148 std::list<stdex::html::token*> active_tokens;
2149 size_t word_index;
2151 };
2152
2153 using inserted_token_list = std::list<inserted_token>;
2154
2155 template<class _Elem, class _Traits, class _Alloc>
2157 {
2158 public:
2159 inline parser(
2161 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
2162 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
2164 m_url(url, stdex::strnlen(url, num_chars)),
2165 m_parse_frames(parse_frames),
2167 m_source(nullptr)
2168 {}
2169
2174 {
2175 _Assume_(m_tokens.empty());
2176
2177 if (m_progress) {
2178 m_progress->set_range(0, m_document.source().size());
2179 m_progress->set(0);
2180 }
2181
2182 m_source = m_document.source().data();
2184 return parse(m_document.m_sequences.end());
2185 }
2186
2193 static void link(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _In_ const text_token<_Elem, _Traits, _Alloc>* t)
2194 {
2195 _Assume_(t);
2196 _Assume_(
2197 t->type == token_t::complete ||
2198 t->type == token_t::starting ||
2199 t->type == token_t::ending ||
2200 t->type == token_t::root);
2201
2202 if (t->text_type & has_tokens) {
2203 const _Elem* root = t->text.data();
2204 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2205 _Assume_(root[i] != token_tag_end);
2206 const token* t2 = token::parse_tag(root, i);
2207 if (t2) {
2208 switch (t2->type) {
2209 case token_t::complete:
2210 case token_t::starting:
2211 case token_t::ending:
2212 case token_t::root:
2213 link(source, dynamic_cast<const text_token<_Elem, _Traits, _Alloc>*>(t2));
2214 break;
2215 case token_t::url: {
2216 auto t2_url = dynamic_cast<const url_token<_Elem, _Traits, _Alloc>*>(t2);
2217 switch (t2_url->encoding) {
2218 case token_url_t::plain:
2219 source += t2_url->url;
2220 break;
2221 case token_url_t::sgml:
2222 escape(source, t2_url->url.data(), t2_url->url.size());
2223 break;
2224 case token_url_t::css:
2225 css_escape(source, t2_url->url.data(), t2_url->url.size());
2226 break;
2227 default:
2228 throw std::invalid_argument("unsupported URL encoding");
2229 }
2230 break;
2231 }
2232 default:
2233 throw std::invalid_argument("unsupported token type");
2234 }
2235 }
2236 else if (t->text_type & has_text) {
2237 escape_min(source, root[i]);
2238 i++;
2239 }
2240 else
2241 source += root[i++];
2242 }
2243 }
2244 else if (t->text_type & has_text) {
2245 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2246 escape_min(source, t->text.data(), t->text.size());
2247 }
2248 else
2249 source += t->text;
2250 }
2251
2260 static void start_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2261 {
2262 for (; from != new_tokens.cend(); ++from) {
2263 auto t = *from;
2264 t->append_tag(source);
2265 active_tokens.push_back(t);
2266 }
2267 }
2268
2278 token_list::const_iterator end_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2279 {
2280 // Skip matching tokens in active_tokens and new_tokens.
2281 token_list::const_iterator i1, i2;
2282 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2283 if (i2 == new_tokens.cend() || *i1 != *i2) {
2284 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2285 // End tokens not relevant anymore in reverse order of starting.
2286 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2287 auto t1 = dynamic_cast<starting_token<_Elem, _Traits, _Alloc>*>(*(--i));
2288 _Assume_(t1 && t1->type == token_t::starting);
2289
2290 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t2(new text_token<_Elem, _Traits, _Alloc>(token_t::ending));
2291 t2->text.reserve(t1->name.size() + 3);
2292 t2->text += '<';
2293 t2->text += '/';
2294 t2->text += t1->name;
2295 t2->text += '>';
2296 append_token(std::move(t2), source);
2297
2298 // Pop the active token.
2299 if (i1 == i) {
2300 active_tokens.erase(i);
2301 break;
2302 }
2303 active_tokens.erase(i);
2304 i = active_tokens.cend();
2305 }
2306 break;
2307 }
2308 }
2309 return i2;
2310 }
2311
2321 void append_inserted_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ inserted_token_list& inserted_tokens,
2322 _In_ size_t word_index, _In_ bool after_word,
2323 _Inout_ token_list& active_tokens)
2324 {
2325 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2326 auto& t = *i;
2327 _Assume_(t.token);
2328 if (t.word_index == word_index && t.after_word == after_word) {
2329 if (t.token->type != token_t::ending)
2330 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2331 t.token->append_tag(source);
2332 inserted_tokens.erase(i++);
2333 }
2334 else
2335 ++i;
2336 }
2337 }
2338
2345 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2346 {
2347 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2348 auto t2 = *i2;
2349 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2350 if (i1 == a.end()) {
2351 a.push_back(t2);
2352 break;
2353 }
2354 auto t1 = *i1;
2355 if (t1 == t2)
2356 break;
2357 }
2358 }
2359 }
2360
2364 void make_absolute_url(std::basic_string<_Elem, _Traits, _Alloc>& rel)
2365 {
2366 _Unreferenced_(rel);
2367
2368 if (m_url.empty())
2369 return;
2370
2371 // TODO: Implement!
2372 }
2373
2377 inline const token_vector& tokens() const { return m_tokens; }
2378
2379 protected:
2387 template <class T>
2388 inline T* append_token(_Inout_ std::unique_ptr<T>&& token)
2389 {
2390 if (!token)
2391 return nullptr;
2392 auto t = token.get();
2393 m_tokens.push_back(std::move(token));
2394 return t;
2395 }
2396
2405 template <class T>
2406 inline size_t append_token(_Inout_ std::unique_ptr<T>&& token, _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source)
2407 {
2408 if (!token)
2409 return 0;
2410 size_t n = token->append_tag(source);
2411 m_tokens.push_back(std::move(token));
2412 return n;
2413 }
2414
2423 text_token<_Elem, _Traits, _Alloc>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2424 {
2426 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(new text_token<_Elem, _Traits, _Alloc>(
2427 token_t::complete,
2428 nullptr, 0,
2429 text_type,
2430 m_offset != end ? m_offset->get() : nullptr));
2431
2432 while (m_offset != end) {
2433 auto& s = *m_offset;
2434
2435 if (m_progress) {
2436 if (m_progress->cancel())
2437 throw stdex::user_cancelled();
2438 m_progress->set(s->interval.start);
2439 }
2440
2441 // No token_tag_start and token_tag_end chars, please.
2442 _Assume_(
2443 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_start)) == stdex::npos &&
2444 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_end)) == stdex::npos);
2445
2446 if (s->type == stdex::parser::html_sequence_t::text) {
2447 rel.from = s->interval.start;
2448 token->mapping.push_back(rel);
2449 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2450 rel.to = token->text.size();
2451 if (!(token->text_type & has_text) &&
2452 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2453 token->text_type |= has_text;
2454 ++m_offset;
2455 }
2456 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2457 const element* s_el = static_cast<const element*>(s.get());
2458 _Assume_(s_el);
2459 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2460 if (s_el->code == element_t::frameset && !m_parse_frames)
2461 throw std::invalid_argument("<frameset> detected");
2462
2463 {
2464 size_t offset = s->interval.start;
2465 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2466 new text_token<_Elem, _Traits, _Alloc>(token_t::complete, nullptr, 0, 0, s.get()) :
2467 new starting_token<_Elem, _Traits, _Alloc>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2468
2469 // Copy the tag contents, but mind any attributes containing localizable text.
2470 for (auto& a : s_el->attributes) {
2471 if (a.value.empty() ||
2472 stdex::isblank(m_source + a.value.start, a.value.size()))
2473 continue;
2474
2475 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
2476 t->text.append(m_source + offset, a.value.start - offset);
2477 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(new url_token<_Elem, _Traits, _Alloc>(
2478 nullptr, 0,
2479 token_url_t::sgml,
2480 s.get()));
2481 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2482 append_token(std::move(t_url), t->text);
2483 t->text_type |= has_tokens;
2484 offset = a.value.end;
2485 }
2486 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
2487 t->text.append(m_source + offset, a.value.start - offset);
2488 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t_value(new text_token<_Elem, _Traits, _Alloc>(
2489 token_t::complete,
2490 nullptr, 0,
2491 has_text | is_title,
2492 s.get()));
2493 stdex::mapping<size_t> rel_value(a.value.start, 0);
2494 t_value->mapping.push_back(rel_value);
2495 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2496 append_token(std::move(t_value), t->text);
2497 t->text_type |= has_tokens;
2498 offset = a.value.end;
2499 }
2500 }
2501
2502 t->text.append(m_source + offset, s->interval.end - offset);
2503 rel.from = s->interval.start;
2504 token->mapping.push_back(rel);
2505 rel.to += append_token(std::move(t), token->text);
2506 token->text_type |= has_tokens;
2507 }
2508 ++m_offset;
2509
2510 if (s_el_start) {
2511 if (s_el_start->code == element_t::address ||
2512 s_el_start->code == element_t::code ||
2513 s_el_start->code == element_t::comment ||
2514 s_el_start->code == element_t::cite ||
2515 s_el_start->code == element_t::kbd ||
2516 s_el_start->code == element_t::samp ||
2517 s_el_start->code == element_t::script ||
2518 s_el_start->code == element_t::style)
2519 {
2520 // Non-localizable
2521 auto s_end = s_el_start->end;
2522 _Assume_(s_end);
2523
2524 if (s->interval.end < s_end->interval.start) {
2525 if (s_el_start->code != element_t::style) {
2526 rel.from = s->interval.start;
2527 token->mapping.push_back(rel);
2528 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2530 token_t::complete,
2531 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2532 0,
2533 m_offset->get()))),
2534 token->text);
2535 }
2536 else {
2537 // Partially parse CSS. It may contain URLs we need to make absolute.
2538 auto t = parse_css(s->interval.end, s_end->interval.start);
2539 _Assume_(t);
2540 rel.from = s->interval.start;
2541 token->mapping.push_back(rel);
2542 rel.to += t->append_tag(token->text);
2543 }
2544 token->text_type |= has_tokens;
2545 }
2546 while (m_offset != end && m_offset->get() != s_end)
2547 ++m_offset;
2548 }
2549 else if (element_traits::is_group(s_el_start->code)) {
2550 auto limit = m_offset;
2551 while (limit != end && limit->get() != s_el_start->end)
2552 ++limit;
2553 auto t = parse(limit,
2554 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2555 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2556 rel.from = s->interval.start;
2557 token->mapping.push_back(rel);
2558 rel.to += t->append_tag(token->text);
2559 token->text_type |= has_tokens;
2560 }
2561 }
2562 }
2563 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2564 rel.from = s->interval.start;
2565 token->mapping.push_back(rel);
2566 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2568 token_t::ending,
2569 m_source + s->interval.start, s->interval.size(),
2570 0,
2571 s.get()))),
2572 token->text);
2573 token->text_type |= has_tokens;
2574 ++m_offset;
2575 }
2576 else {
2577 // Declaration, instruction, (P)CDATA section, comment...
2578 rel.from = s->interval.start;
2579 token->mapping.push_back(rel);
2580 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2582 token_t::complete,
2583 m_source + s->interval.start, s->interval.size(),
2584 0,
2585 s.get()))),
2586 token->text);
2587 token->text_type |= has_tokens;
2588 ++m_offset;
2589 }
2590 }
2591
2592 return append_token(std::move(token));
2593 }
2594
2599 {
2600 stdex::interval<size_t> section, content;
2601 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(
2603 token_t::complete,
2604 nullptr, 0,
2605 0,
2606 m_offset->get()));
2607
2608 for (;;) {
2609 if (m_css_comment.match(m_source, start, end)) {
2610 token->text.append(m_source + start, m_css_comment.interval.end - start);
2611 start = m_css_comment.interval.end;
2612 }
2613 else if (m_css_cdo.match(m_source, start, end)) {
2614 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2615 start = m_css_cdo.interval.end;
2616 }
2617 else if (m_css_cdc.match(m_source, start, end)) {
2618 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2619 start = m_css_cdc.interval.end;
2620 }
2621 else if (
2622 m_css_import.match(m_source, start, end) && (section = m_css_import.interval, content = m_css_import.content, true) ||
2623 m_css_uri.match(m_source, start, end) && (section = m_css_uri.interval, content = m_css_uri.content, true))
2624 {
2625 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(
2627 nullptr, 0,
2628 token_url_t::css,
2629 m_offset->get()));
2630 css_unescape(t_url->url, m_source + content.start, content.size());
2631 token->text.append(m_source + start, content.start - start);
2632 append_token(std::move(t_url), token->text);
2633 token->text.append(m_source + content.end, section.end - content.end);
2634 token->text_type |= has_tokens;
2635 start = section.end;
2636 }
2637 else if (m_any_char.match(m_source, start, end)) {
2638 token->text.append(m_source + start, m_any_char.interval.end - start);
2639 start = m_any_char.interval.end;
2640 }
2641 else
2642 break;
2643 }
2644
2645 return append_token(std::move(token));
2646 }
2647
2648 protected:
2650 const stdex::sstring m_url;
2651 const bool m_parse_frames;
2653 const _Elem* m_source;
2654 token_vector m_tokens;
2655 sequence_store::const_iterator m_offset;
2656
2657 // For detecting URLs in CSS
2665 };
2666 }
2667}
HTML comment.
Definition html.hpp:1551
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1560
HTML declaration.
Definition html.hpp:1533
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1543
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1544
HTML document.
Definition html.hpp:1600
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1935
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1851
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1920
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1926
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1936
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1934
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1924
std::vector< std::unique_ptr< entity< _Elem, _Traits, _Alloc > > > m_entities
Array of entities.
Definition html.hpp:1930
std::basic_string< _Elem, _Traits, _Alloc > m_source
Document HTML source code.
Definition html.hpp:1918
void assign(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1863
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1881
std::basic_string< _Elem, _Traits, _Alloc > replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1889
void clear()
Empties document.
Definition html.hpp:1619
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1923
const std::basic_string< _Elem, _Traits, _Alloc > & source() const
Returns document HTML source code.
Definition html.hpp:1873
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1919
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1925
void append(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1640
Ending tag of an HTML element </...>
Definition html.hpp:1513
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1525
element_start * start
Corresponding starting tag.
Definition html.hpp:1526
element_t code
Element code.
Definition html.hpp:1524
Starting tag of an HTML element <...>
Definition html.hpp:1497
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1506
HTML element <.../>
Definition html.hpp:1318
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1487
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1488
element_t code
Element code.
Definition html.hpp:1486
HTML instruction.
Definition html.hpp:1567
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1576
HTML parser.
Definition html.hpp:2157
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2652
text_token< _Elem, _Traits, _Alloc > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2598
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a creating a union.
Definition html.hpp:2345
token_list::const_iterator end_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2278
static void link(std::basic_string< _Elem, _Traits, _Alloc > &source, const text_token< _Elem, _Traits, _Alloc > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2193
text_token< _Elem, _Traits, _Alloc > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2423
const _Elem * m_source
HTML source code.
Definition html.hpp:2653
token_vector m_tokens
HTML token storage.
Definition html.hpp:2654
text_token< _Elem, _Traits, _Alloc > * parse()
Parses HTML document.
Definition html.hpp:2173
const document< _Elem, _Traits, _Alloc > & m_document
Document being analyzed.
Definition html.hpp:2649
void make_absolute_url(std::basic_string< _Elem, _Traits, _Alloc > &rel)
Converts URL to absolute.
Definition html.hpp:2364
size_t append_token(std::unique_ptr< T > &&token, std::basic_string< _Elem, _Traits, _Alloc > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2406
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2377
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2650
const bool m_parse_frames
Parse frames.
Definition html.hpp:2651
static void start_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2260
T * append_token(std::unique_ptr< T > &&token)
Adds token to the collection.
Definition html.hpp:2388
void append_inserted_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2321
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2655
Base class for HTML sequences.
Definition html.hpp:1299
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1302
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1301
sequence * parent
Parent sequence.
Definition html.hpp:1303
Token representing start HTML tag.
Definition html.hpp:2090
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2108
std::basic_string< _Elem, _Traits, _Alloc > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2107
Token representing part of HTML text.
Definition html.hpp:2065
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2082
std::basic_string< _Elem, _Traits, _Alloc > text
Token text.
Definition html.hpp:2080
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2081
HTML token base class.
Definition html.hpp:1974
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2043
size_t append_tag(std::basic_string< char, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1996
uintptr_t data
Any user-supplied data.
Definition html.hpp:2044
token_t type
Token type.
Definition html.hpp:2042
size_t append_tag(std::basic_string< wchar_t, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:2012
HTTP token representing an URL.
Definition html.hpp:2125
token_url_t encoding
URL encoding.
Definition html.hpp:2140
std::basic_string< _Elem, _Traits, _Alloc > url
URL.
Definition html.hpp:2139
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7833
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7748
std::vector< html_attribute > attributes
tag attributes
Definition parser.hpp:8355
html_sequence_t type
tag type
Definition parser.hpp:8353
stdex::interval< size_t > name
tag name position in source
Definition parser.hpp:8354
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:172
Test for given string.
Definition parser.hpp:818
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:683
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1088
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1002
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:920
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1012
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:808
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:971
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1029
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:939
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1218
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:956
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1273
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:854
static bool is_pre_exclusion(element_t code)
May element be a part of <PRE></PRE>?
Definition html.hpp:1047
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:905
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1072
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:887
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:830
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1118
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:689
HTML entity.
Definition html.hpp:1584
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1585
std::basic_string< _Elem, _Traits, _Alloc > value
Entity value.
Definition html.hpp:1586
Inserted HTML token.
Definition html.hpp:2146
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2150
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2148
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:2149
token * token
Points to the token.
Definition html.hpp:2147
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
mapping()
Constructs a zero to zero mapping.
Definition mapping.hpp:24
Tag attribute.
Definition parser.hpp:8127
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8129