stdex
Additional custom algorithms and utilities not covered by the C++ standard library
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2024 Amebis
4*/
5
6#pragma once
7
8#include "assert.hpp"
9#include "compat.hpp"
10#include "exception.hpp"
11#include "interval.hpp"
12#include "mapping.hpp"
13#include "parser.hpp"
14#include "progress.hpp"
15#include "sgml.hpp"
16#include "string.hpp"
17#include "system.hpp"
18#include "unicode.hpp"
19#include <exception>
20#include <list>
21#include <map>
22#include <memory>
23#include <stdexcept>
24#include <string_view>
25#include <string>
26#include <vector>
27
28#ifdef _WIN32
29#undef small
30#endif
31
32namespace stdex
33{
34 namespace html
35 {
///
/// Appends HTML-escaped string
///
/// Replaces `&`, `;`, `"`, `'`, `<` and `>` with character references. Every other
/// character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case ';': dst += "&semi;"; break;
		case '\"': dst += "&quot;"; break;
		case '\'': dst += "&#x27;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// There is deliberately no `case 0x00a0` here. With a signed `char` that label can never
		// match, and with an unsigned `char` it would match the byte 0xA0, which is a
		// continuation byte of many UTF-8 sequences (e.g. U+00E0 is C3 A0), corrupting the text.
		default: dst += src[i]; break;
		}
	}
}
62
///
/// Appends HTML-escaped wide string
///
/// Replaces `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 (no-break space) with character
/// references. Every other character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t pos = 0; pos < num_chars; ++pos) {
		const wchar_t c = src[pos];
		if (!c)
			break;

		if (c == L'&')
			dst += L"&amp;";
		else if (c == L';')
			dst += L"&semi;";
		else if (c == L'\"')
			dst += L"&quot;";
		else if (c == L'\'')
			dst += L"&#x27;";
		else if (c == L'<')
			dst += L"&lt;";
		else if (c == L'>')
			dst += L"&gt;";
		else if (c == L'\u00a0')
			dst += L"&nbsp;"; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
		else
			dst += c;
	}
}
89
96 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
97 void escape(
98 _Inout_ std::basic_string<T, TR, AX>& dst,
99 _In_ const T (&src)[N])
100 {
101 escape(dst, src, N);
102 }
103
110 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
111 void escape(
112 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
113 _In_ const std::basic_string<T, TR_src, AX_src>& src)
114 {
115 escape(dst, src.data(), src.size());
116 }
117
///
/// Appends HTML-escaped character, escaping only `&`, `<` and `>`
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Source character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_ char chr)
{
	switch (chr) {
	case '&': dst += "&amp;"; break;
	case '<': dst += "&lt;"; break;
	case '>': dst += "&gt;"; break;
	// There is deliberately no `case 0x00a0` here. With a signed `char` that label can never
	// match, and with an unsigned `char` it would match the byte 0xA0, which is a
	// continuation byte of many UTF-8 sequences, corrupting the text.
	default: dst += chr; break;
	}
}
135
///
/// Appends HTML-escaped wide character, escaping only `&`, `<`, `>` and U+00A0
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Source character
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_ wchar_t chr)
{
	if (chr == L'&')
		dst += L"&amp;";
	else if (chr == L'<')
		dst += L"&lt;";
	else if (chr == L'>')
		dst += L"&gt;";
	else if (chr == L'\u00a0')
		dst += L"&nbsp;"; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
	else
		dst += chr;
}
153
///
/// Appends HTML-escaped string, escaping only `&`, `<` and `>`
///
/// Every other character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// There is deliberately no `case 0x00a0` here. With a signed `char` that label can never
		// match, and with an unsigned `char` it would match the byte 0xA0, which is a
		// continuation byte of many UTF-8 sequences (e.g. U+00E0 is C3 A0), corrupting the text.
		default: dst += src[i]; break;
		}
	}
}
177
///
/// Appends HTML-escaped wide string, escaping only `&`, `<`, `>` and U+00A0
///
/// Every other character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	size_t pos = 0;
	while (pos < num_chars && src[pos]) {
		const wchar_t c = src[pos++];
		if (c == L'&')
			dst += L"&amp;";
		else if (c == L'<')
			dst += L"&lt;";
		else if (c == L'>')
			dst += L"&gt;";
		else if (c == L'\u00a0')
			dst += L"&nbsp;"; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
		else
			dst += c;
	}
}
201
208 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
209 void escape_min(
210 _Inout_ std::basic_string<T, TR, AX>& dst,
211 _In_ const T (&src)[N])
212 {
213 escape_min(dst, src, N);
214 }
215
222 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
223 void escape_min(
224 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
225 _In_ const std::basic_string<T, TR_src, AX_src>& src)
226 {
227 escape_min(dst, src.data(), src.size());
228 }
229
///
/// Appends URL-unescaped (percent-decoded) string
///
/// `+` is decoded as a space and `%XX` (two hexadecimal digits) as the byte 0xXX.
/// A `%` that is not followed by two hexadecimal digits within the first
/// \p num_chars characters is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_unescape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);

	// Value of a hexadecimal digit, or -1 when `c` is not one.
	const auto hex_value = [](char c) -> int {
		if ('0' <= c && c <= '9') return c - '0';
		if ('A' <= c && c <= 'F') return c - 'A' + 10;
		if ('a' <= c && c <= 'f') return c - 'a' + 10;
		return -1;
	};

	for (size_t i = 0; i < num_chars && src[i];) {
		switch (src[i]) {
		case '+':
			dst += ' '; ++i;
			break;

		case '%': {
			++i;

			// Never look past num_chars: the buffer is not required to be NUL-terminated.
			const int hi = i < num_chars ? hex_value(src[i]) : -1;
			if (hi < 0) {
				// Not an escape sequence. Keep `%`; the following character (if any) is processed normally.
				dst += '%';
				break;
			}
			++i;

			const int lo = i < num_chars ? hex_value(src[i]) : -1;
			if (lo < 0) {
				// Only one hex digit. Keep `%` and the digit as they are.
				dst += '%';
				dst += src[i - 1];
				break;
			}
			++i;

			dst += static_cast<char>((hi << 4) | lo);
			break;
		}

		default:
			dst += src[i++];
		}
	}
}
271
278 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
279 void url_unescape(
280 _Inout_ std::basic_string<char, TR, AX>& dst,
281 _In_ const char (&src)[N])
282 {
283 url_unescape(dst, src, N);
284 }
285
292 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
293 void url_unescape(
294 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
295 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
296 {
297 url_unescape(dst, src.data(), src.size());
298 }
299
///
/// Appends URL-escaped (percent-encoded) string
///
/// Space is written as `+`. The listed unsafe and reserved characters, and every
/// byte outside the range 0x21..0x7E, are written as `%XX` with uppercase
/// hexadecimal digits. All remaining characters are copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	static const char hex_digits[] = "0123456789ABCDEF";
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case ' ': dst += "+"; break;
		// Double quote is an "unsafe" URL character too. Left as is, it could terminate
		// a quoted HTML attribute value the URL is embedded in.
		case '\"': dst += "%22"; break;
		case '<': dst += "%3C"; break;
		case '>': dst += "%3E"; break;
		case '#': dst += "%23"; break;
		case '%': dst += "%25"; break;
		case '{': dst += "%7B"; break;
		case '}': dst += "%7D"; break;
		case '|': dst += "%7C"; break;
		case '\\': dst += "%5C"; break;
		case '^': dst += "%5E"; break;
		case '~': dst += "%7E"; break;
		case '[': dst += "%5B"; break;
		case ']': dst += "%5D"; break;
		case '`': dst += "%60"; break;
		case ';': dst += "%3B"; break;
		case '/': dst += "%2F"; break;
		case '?': dst += "%3F"; break;
		case ':': dst += "%3A"; break;
		case '@': dst += "%40"; break;
		case '=': dst += "%3D"; break;
		case '&': dst += "%26"; break;
		case '$': dst += "%24"; break;
		default: {
			// Work on the unsigned byte value so that bytes >= 0x80 are handled the same
			// regardless of the signedness of `char`.
			const uint8_t byte = static_cast<uint8_t>(src[i]);
			if (0x20 < byte && byte < 0x7f)
				dst += src[i];
			else {
				dst += '%';
				dst += hex_digits[byte >> 4];
				dst += hex_digits[byte & 0x0f];
			}
		}
		}
	}
}
350
357 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
358 void url_escape(
359 _Inout_ std::basic_string<char, TR, AX>& dst,
360 _In_ const char (&src)[N])
361 {
362 url_escape(dst, src, N);
363 }
364
371 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
372 void url_escape(
373 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
374 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
375 {
376 url_escape(dst, src.data(), src.size());
377 }
378
///
/// Appends CSS-unescaped string
///
/// Recognized escapes:
/// - `\n`, `\r`, `\t`: line feed, carriage return, tab
/// - `\` followed by a line feed: line continuation, produces nothing
/// - `\` followed by 1 to 6 hexadecimal digits: the character with that code
///   (converted to `T`); a single space right after the digits is skipped
/// - `\` followed by any other character: that character
///
/// A `\` that is the last character of the input is dropped.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_unescape(
	_Inout_ std::basic_string<T, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i];) {
		if (src[i] != '\\') {
			dst += src[i++];
			continue;
		}

		// Skip the backslash.
		i++;
		if (i >= num_chars || !src[i]) {
			// Dangling `\` at the very end of the input: there is nothing to unescape.
			// Stop here rather than spin on the same index or step over the terminator.
			break;
		}

		switch (src[i]) {
		// Classic escapes
		case 'n': dst += '\n'; i++; break;
		case 'r': dst += '\r'; i++; break;
		case 't': dst += '\t'; i++; break;

		// `\` at the end of the line
		case '\n': i++; break;

		// `\nnnn` escape
		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A': case 'a':
		case 'B': case 'b':
		case 'C': case 'c':
		case 'D': case 'd':
		case 'E': case 'e':
		case 'F': case 'f': {
			// Six hex digits need up to 24 bits; accumulate in a type that is wide enough everywhere.
			unsigned long chr = 0;
			// At most six digits, never past num_chars (written so that i + 6 cannot overflow).
			const size_t end = num_chars - i < 6 ? num_chars : i + 6;

			for (; i < end; ++i) {
				if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + static_cast<unsigned long>(src[i] - '0');
				else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + static_cast<unsigned long>(src[i] - 'A' + 10);
				else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + static_cast<unsigned long>(src[i] - 'a' + 10);
				else break;
			}

			dst += static_cast<T>(chr);

			if (i < num_chars && src[i] == ' ') {
				// Skip space after `\nnnn`, also after a full six-digit escape.
				i++;
			}
			break;
		}

		default: dst += src[i++];
		}
	}
}
448
455 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
456 void css_unescape(
457 _Inout_ std::basic_string<T, TR, AX>& dst,
458 _In_ const T (&src)[N])
459 {
460 css_unescape(dst, src, N);
461 }
462
469 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
470 void css_unescape(
471 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
472 _In_ const std::basic_string<T, TR_src, AX_src>& src)
473 {
474 css_unescape(dst, src.data(), src.size());
475 }
476
///
/// Appends CSS-escaped string
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as
/// backslash escapes. Every other character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void css_escape(
	_Inout_ std::basic_string<char, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t pos = 0; pos < num_chars; ++pos) {
		const char c = src[pos];
		if (!c)
			break;

		if (c == '\\')
			dst += "\\\\";
		else if (c == '\n')
			dst += "\\n";
		else if (c == '\r')
			dst += "\\r";
		else if (c == '\t')
			dst += "\\t";
		else if (c == '\"')
			dst += "\\\"";
		else if (c == '\'')
			dst += "\\'";
		else
			dst += c;
	}
}
502
///
/// Appends CSS-escaped wide string
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as
/// backslash escapes. Every other character is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string
/// \param[in]     num_chars  Maximum number of characters to read from \p src; reading stops early at a NUL
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void css_escape(
	_Inout_ std::basic_string<wchar_t, TR, AX>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	stdex_assert(src || !num_chars);
	for (size_t pos = 0; pos < num_chars; ++pos) {
		const wchar_t c = src[pos];
		if (!c)
			break;

		if (c == L'\\')
			dst += L"\\\\";
		else if (c == L'\n')
			dst += L"\\n";
		else if (c == L'\r')
			dst += L"\\r";
		else if (c == L'\t')
			dst += L"\\t";
		else if (c == L'\"')
			dst += L"\\\"";
		else if (c == L'\'')
			dst += L"\\'";
		else
			dst += c;
	}
}
528
535 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
536 void css_escape(
537 _Inout_ std::basic_string<T, TR, AX>& dst,
538 _In_ const T (&src)[N])
539 {
540 css_escape(dst, src, N);
541 }
542
549 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
550 void css_escape(
551 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
552 _In_ const std::basic_string<T, TR_src, AX_src>& src)
553 {
554 css_escape(dst, src.data(), src.size());
555 }
556
///
/// HTML element type
///
/// Named elements are numbered consecutively, starting with `a` = 1, in the
/// order listed below. Do not reorder them: the numeric values are part of the
/// interface.
///
enum class element_t {
	// Special codes (negative, so they never collide with a named element)
	unknown = -1,
	PCDATA = -2,
	CDATA = -3,

	empty = 0,

	// Named elements
	a, abbr, acronym, address, applet, area,
	b, base, basefont, bdo,
	bgsound, // Microsoft Specific
	big,
	blink, // Microsoft Specific
	blockquote, body, br, button,
	caption, center, cite, code, col, colgroup,
	comment, // Microsoft Specific
	dd, del, dfn, dir, div, dl, dt,
	em,
	embed, // Microsoft Specific
	fieldset, font, form, frame, frameset,
	h1, h2, h3, h4, h5, h6, head, hr, html,
	i, iframe, img, input, ins, isindex,
	kbd,
	label, legend, li, link,
	listing, // Microsoft Specific
	map,
	marquee, // Microsoft Specific
	menu, meta,
	nextid, // Microsoft Specific
	nobr, // Microsoft Specific
	noembed, // Microsoft Specific
	noframes, noscript,
	object, ol, optgroup, option,
	p, param,
	plaintext, // Microsoft Specific
	pre,
	q,
	rt, // Microsoft Specific
	ruby, // Microsoft Specific
	s, samp, script, select, small, span, strike, strong, style, sub, sup,
	table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
	u, ul,
	var,
	wbr, // Microsoft Specific
	xmp, // Microsoft Specific
};
672
///
/// HTML element span
///
/// NOTE(review): meanings below are read from the enumerator names - confirm against the parser.
///
enum class element_span_t {
	needs_end = 0,    ///< Presumably: element must be closed by an end tag
	end_optional = 1, ///< Presumably: end tag may be omitted
	immediate = 2,    ///< Presumably: element has no content and no end tag
};
681
686 {
692 static element_span_t span(_In_ element_t code)
693 {
694 static element_span_t lookup[] = {
695 element_span_t::needs_end, // a
696 element_span_t::needs_end, // abbr
697 element_span_t::needs_end, // acronym
698 element_span_t::needs_end, // address
699 element_span_t::needs_end, // applet
700 element_span_t::immediate, // area
701 element_span_t::needs_end, // b
702 element_span_t::immediate, // base
703 element_span_t::immediate, // basefont
704 element_span_t::needs_end, // bdo
705 element_span_t::immediate, // bgsound
706 element_span_t::needs_end, // big
707 element_span_t::needs_end, // blink
708 element_span_t::needs_end, // blockquote
709 element_span_t::end_optional, // body
710 element_span_t::immediate, // br
711 element_span_t::needs_end, // button
712 element_span_t::needs_end, // caption
713 element_span_t::needs_end, // center
714 element_span_t::needs_end, // cite
715 element_span_t::needs_end, // code
716 element_span_t::immediate, // col
717 element_span_t::end_optional, // colgroup
718 element_span_t::needs_end, // comment
719 element_span_t::end_optional, // dd
720 element_span_t::needs_end, // del
721 element_span_t::needs_end, // dfn
722 element_span_t::needs_end, // dir
723 element_span_t::needs_end, // div
724 element_span_t::needs_end, // dl
725 element_span_t::end_optional, // dt
726 element_span_t::needs_end, // em
727 element_span_t::immediate, // embed
728 element_span_t::needs_end, // fieldset
729 element_span_t::needs_end, // font
730 element_span_t::needs_end, // form
731 element_span_t::immediate, // frame
732 element_span_t::needs_end, // frameset
733 element_span_t::needs_end, // h1
734 element_span_t::needs_end, // h2
735 element_span_t::needs_end, // h3
736 element_span_t::needs_end, // h4
737 element_span_t::needs_end, // h5
738 element_span_t::needs_end, // h6
739 element_span_t::end_optional, // head
740 element_span_t::immediate, // hr
741 element_span_t::end_optional, // html
742 element_span_t::needs_end, // i
743 element_span_t::needs_end, // iframe
744 element_span_t::immediate, // img
745 element_span_t::immediate, // input
746 element_span_t::needs_end, // ins
747 element_span_t::immediate, // isindex
748 element_span_t::needs_end, // kbd
749 element_span_t::needs_end, // label
750 element_span_t::needs_end, // legend
751 element_span_t::end_optional, // li
752 element_span_t::immediate, // link
753 element_span_t::needs_end, // listing
754 element_span_t::needs_end, // map
755 element_span_t::needs_end, // marquee
756 element_span_t::needs_end, // menu
757 element_span_t::immediate, // meta
758 element_span_t::immediate, // nextid
759 element_span_t::needs_end, // nobr
760 element_span_t::needs_end, // noembed
761 element_span_t::needs_end, // noframes
762 element_span_t::needs_end, // noscript
763 element_span_t::needs_end, // object
764 element_span_t::needs_end, // ol
765 element_span_t::needs_end, // optgroup
766 element_span_t::end_optional, // option
767 element_span_t::end_optional, // p
768 element_span_t::immediate, // param
769 element_span_t::end_optional, // plaintext
770 element_span_t::needs_end, // pre
771 element_span_t::needs_end, // q
772 element_span_t::immediate, // rt
773 element_span_t::needs_end, // ruby
774 element_span_t::needs_end, // s
775 element_span_t::needs_end, // samp
776 element_span_t::needs_end, // script
777 element_span_t::needs_end, // select
778 element_span_t::needs_end, // small
779 element_span_t::needs_end, // span
780 element_span_t::needs_end, // strike
781 element_span_t::needs_end, // strong
782 element_span_t::needs_end, // style
783 element_span_t::needs_end, // sub
784 element_span_t::needs_end, // sup
785 element_span_t::needs_end, // table
786 element_span_t::end_optional, // tbody
787 element_span_t::end_optional, // td
788 element_span_t::needs_end, // textarea
789 element_span_t::end_optional, // tfoot
790 element_span_t::end_optional, // th
791 element_span_t::end_optional, // thead
792 element_span_t::needs_end, // title
793 element_span_t::end_optional, // tr
794 element_span_t::needs_end, // tt
795 element_span_t::needs_end, // u
796 element_span_t::needs_end, // ul
797 element_span_t::needs_end, // var
798 element_span_t::immediate, // wbr
799 element_span_t::needs_end, // xmp
800 };
801 return element_t::a <= code && code <= element_t::xmp ?
802 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
803 element_span_t::needs_end;
804 }
805
811 static bool is_fontstyle(_In_ element_t code)
812 {
813 switch (code) {
814 case element_t::tt:
815 case element_t::i:
816 case element_t::b:
817 case element_t::u:
818 case element_t::s:
819 case element_t::strike:
820 case element_t::blink:
821 case element_t::big:
822 case element_t::small:
823 return true;
824 default:
825 return false;
826 };
827 }
828
834 static bool is_phrase(_In_ element_t code)
835 {
836 switch (code) {
837 case element_t::em:
838 case element_t::strong:
839 case element_t::dfn:
840 case element_t::code:
841 case element_t::samp:
842 case element_t::kbd:
843 case element_t::var:
844 case element_t::cite:
845 case element_t::abbr:
846 case element_t::acronym:
847 case element_t::xmp:
848 return true;
849 default:
850 return false;
851 };
852 }
853
859 static bool is_special(_In_ element_t code)
860 {
861 switch (code) {
862 case element_t::a:
863 case element_t::img:
864 case element_t::applet:
865 case element_t::object:
866 case element_t::embed:
867 case element_t::font:
868 case element_t::basefont:
869 case element_t::br:
870 case element_t::wbr:
871 case element_t::rt:
872 case element_t::script:
873 case element_t::map:
874 case element_t::q:
875 case element_t::sub:
876 case element_t::sup:
877 case element_t::ruby:
878 case element_t::span:
879 case element_t::bdo:
880 case element_t::iframe:
881 case element_t::nobr:
882 return true;
883 default:
884 return false;
885 };
886 }
887
893 static bool is_formctrl(_In_ element_t code)
894 {
895 switch (code) {
896 case element_t::input:
897 case element_t::select:
898 case element_t::textarea:
899 case element_t::label:
900 case element_t::button:
901 return true;
902 default:
903 return false;
904 };
905 }
906
912 static bool is_inline(_In_ element_t code)
913 {
914 return
915 code == element_t::PCDATA ||
916 is_fontstyle(code) ||
917 is_phrase(code) ||
918 is_special(code) ||
919 is_formctrl(code);
920 }
921
927 static bool is_heading(_In_ element_t code)
928 {
929 switch (code) {
930 case element_t::h1:
931 case element_t::h2:
932 case element_t::h3:
933 case element_t::h4:
934 case element_t::h5:
935 case element_t::h6:
936 return true;
937 default:
938 return false;
939 };
940 }
941
947 static bool is_list(_In_ element_t code)
948 {
949 switch (code) {
950 case element_t::ul:
951 case element_t::ol:
952 case element_t::dir:
953 case element_t::menu:
954 return true;
955 default:
956 return false;
957 };
958 }
959
965 static bool is_preformatted(_In_ element_t code)
966 {
967 switch (code) {
968 case element_t::pre:
969 case element_t::listing:
970 return true;
971 default:
972 return false;
973 }
974 }
975
981 static bool is_block(_In_ element_t code)
982 {
983 if (is_heading(code) ||
984 is_list(code) ||
985 is_preformatted(code)) return true;
986 switch (code) {
987 case element_t::p:
988 case element_t::dl:
989 case element_t::div:
990 case element_t::center:
991 case element_t::marquee:
992 case element_t::noscript:
993 case element_t::noframes:
994 case element_t::noembed:
995 case element_t::blockquote:
996 case element_t::form:
997 case element_t::isindex:
998 case element_t::hr:
999 case element_t::table:
1000 case element_t::fieldset:
1001 case element_t::address:
1002 return true;
1003 default:
1004 return false;
1005 };
1006 }
1007
1013 static bool is_flow(_In_ element_t code)
1014 {
1015 return is_block(code) || is_inline(code);
1016 }
1017
1023 static bool is_head_content(_In_ element_t code)
1024 {
1025 switch (code) {
1026 case element_t::title:
1027 case element_t::isindex:
1028 case element_t::base:
1029 case element_t::nextid:
1030 return true;
1031 default:
1032 return false;
1033 };
1034 }
1035
1041 static bool is_head_misc(_In_ element_t code)
1042 {
1043 switch (code) {
1044 case element_t::script:
1045 case element_t::style:
1046 case element_t::meta:
1047 case element_t::link:
1048 case element_t::object:
1049 return true;
1050 default:
1051 return false;
1052 };
1053 }
1054
1060 static bool is_pre_exclusion(_In_ element_t code)
1061 {
1062 switch (code) {
1063 case element_t::img:
1064 case element_t::object:
1065 case element_t::applet:
1066 case element_t::embed:
1067 case element_t::big:
1068 case element_t::small:
1069 case element_t::sub:
1070 case element_t::sup:
1071 case element_t::ruby:
1072 case element_t::font:
1073 case element_t::basefont:
1074 case element_t::nobr:
1075 return true;
1076 default:
1077 return false;
1078 };
1079 }
1080
1086 static bool is_html_content(_In_ element_t code)
1087 {
1088 switch (code) {
1089 case element_t::head:
1090 case element_t::body:
1091 case element_t::frameset:
1092 return true;
1093 default:
1094 return false;
1095 };
1096 }
1097
1103 static bool is_group(_In_ element_t code)
1104 {
1105 if (is_block(code) ||
1106 is_html_content(code) ||
1107 is_head_content(code)) return true;
1108 switch (code) {
1109 case element_t::col:
1110 case element_t::colgroup:
1111 case element_t::dd:
1112 case element_t::dir:
1113 case element_t::dt:
1114 case element_t::frame:
1115 case element_t::iframe:
1116 case element_t::legend:
1117 case element_t::td:
1118 case element_t::th:
1119 case element_t::tr:
1120 return true;
1121 default:
1122 return false;
1123 };
1124 }
1125
1134 static bool may_contain(_In_ element_t parent, _In_ element_t child)
1135 {
1136 if (child == element_t::unknown || child == element_t::comment)
1137 return true;
1138 if (is_fontstyle(parent) || is_phrase(parent))
1139 return is_inline(child);
1140 if (is_heading(parent))
1141 return is_inline(child);
1142
1143 switch (parent) {
1144 case element_t::a: return is_inline(child) && child != element_t::a;
1145 case element_t::address: return is_inline(child) || child == element_t::p;
1146 case element_t::applet: return is_flow(child) || child == element_t::param;
1147 case element_t::area: return false;
1148 case element_t::base: return false;
1149 case element_t::basefont: return false;
1150 case element_t::bdo: return is_inline(child);
1151 case element_t::blockquote: return is_flow(child);
1152 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
1153 case element_t::br: return false;
1154 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1155 case element_t::caption: return is_inline(child);
1156 case element_t::center: return is_flow(child);
1157 case element_t::col: return false;
1158 case element_t::colgroup: return child == element_t::col;
1159 case element_t::comment: return child == element_t::CDATA;
1160 case element_t::dd: return is_flow(child);
1161 case element_t::del: return is_flow(child);
1162 case element_t::dir: return child == element_t::li;
1163 case element_t::div: return is_flow(child);
1164 case element_t::dl: return child == element_t::dt || child == element_t::dd;
1165 case element_t::dt: return is_inline(child);
1166 case element_t::embed: return is_flow(child) || child == element_t::param;
1167 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1168 case element_t::font: return is_inline(child);
1169 case element_t::form: return is_flow(child) && child != element_t::form;
1170 case element_t::frame: return false;
1171 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1172 case element_t::head: return is_head_content(child) || is_head_misc(child);
1173 case element_t::hr: return false;
1174 case element_t::html: return is_html_content(child);
1175 case element_t::iframe: return is_flow(child);
1176 case element_t::img: return false;
1177 case element_t::input: return false;
1178 case element_t::ins: return is_flow(child);
1179 case element_t::isindex: return false;
1180 case element_t::label: return is_inline(child) && child != element_t::label;
1181 case element_t::legend: return is_inline(child);
1182 case element_t::li: return is_flow(child);
1183 case element_t::link: return false;
1184 case element_t::listing: return child == element_t::CDATA;
1185 case element_t::map: return is_block(child) || child == element_t::area;
1186 case element_t::marquee: return is_flow(child);
1187 case element_t::menu: return child == element_t::li;
1188 case element_t::meta: return false;
1189 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1190 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1191 case element_t::noscript: return is_flow(child);
1192 case element_t::noembed: return is_flow(child);
1193 case element_t::object: return is_flow(child) || child == element_t::param;
1194 case element_t::ol: return child == element_t::li;
1195 case element_t::optgroup: return child == element_t::option;
1196 case element_t::option: return child == element_t::PCDATA;
1197 case element_t::p: return is_inline(child);
1198 case element_t::param: return false;
1199 case element_t::plaintext: return is_flow(child);
1200 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1201 case element_t::q: return is_inline(child);
1202 case element_t::rt: return false;
1203 case element_t::ruby: return is_inline(child);
1204 case element_t::script: return child == element_t::CDATA;
1205 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1206 case element_t::span: return is_inline(child);
1207 case element_t::style: return child == element_t::CDATA;
1208 case element_t::sub: return is_inline(child);
1209 case element_t::sup: return is_inline(child);
1210 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1211 case element_t::tbody: return child == element_t::tr;
1212 case element_t::td: return is_flow(child);
1213 case element_t::textarea: return child == element_t::PCDATA;
1214 case element_t::tfoot: return child == element_t::tr;
1215 case element_t::th: return is_flow(child);
1216 case element_t::thead: return child == element_t::tr;
1217 case element_t::title: return child == element_t::PCDATA;
1218 case element_t::tr: return child == element_t::td || child == element_t::th;
1219 case element_t::ul: return child == element_t::li;
1220 case element_t::wbr: return false;
1221 case element_t::unknown: return true;
1222 default: return false;
1223 }
1224 }
1225
1233 template <class T>
1234 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1235 {
1236 stdex_assert(attr_name || !num_chars);
1237 switch (code) {
1238 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1239 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1240 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1241 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1242 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1243 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1244 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1245 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1246 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1247 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX);
1248 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1249 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) ||
1250 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1251 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX);
1252 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1253 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1254 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX);
1255 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1256 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1257 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1258 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1259 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1260 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1261 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1262 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1263 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1264 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1265 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1266 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) ||
1267 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) ||
1268 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1269 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1270 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) ||
1271 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1272 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1273 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1274 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1275 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1276 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1277 default: return false;
1278 }
1279 }
1280
1288 template <class T>
1289 static bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1290 {
1291 stdex_assert(attr_name || !num_chars);
1292 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX))
1293 return true;
1294 switch (code) {
1295 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1296 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1297 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1298 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1299 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1300 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX);
1301 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1302 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1303 default: return false;
1304 }
1305 }
1306 };
1307
// Forward declaration, needed by the sequence_store alias below.
1308 class sequence;
// Collection of owning pointers to sequence objects.
1309 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1310
// Body of class `sequence` (forward-declared above): one parsed piece of the document.
// NOTE(review): the embedded listing numbers jump before 1315 and from 1317 to 1320; the class
// header and the declarations of the `interval` and `parent` members that the constructor
// initializes are not visible in this chunk.
1315 {
1316 public:
// Kind of sequence
1317 stdex::parser::html_sequence_t type;
1320
// Stores the sequence type, its [start, end) interval and the parent sequence (may be nullptr).
1321 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1322 type(_type),
1323 interval(start, end),
1324 parent(_parent)
1325 {}
1326
1327 virtual ~sequence() {} // make polymorphic
1328 };
1329
// Element sequence: adds the element code resolved from the tag name and the tag's attributes.
1333 class element : public sequence
1334 {
1335 public:
// Builds the element from a parsed tag. `src` is the buffer the tag's name interval indexes into;
// the tag's name and attributes are moved out of `tag`.
1336 template <class T>
1337 element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1338 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1339 code(element_code(src + tag.name.start, tag.name.size())),
1340 name(std::move(tag.name)),
1341 attributes(std::move(tag.attributes))
1342 {}
1343
// Maps an element name to its element_t code.
// The name is read up to num_chars characters or the first zero, each character is lowercased,
// and the result is looked up by binary search in the sorted table below.
// Returns element_t::unknown when the name is not in the table.
1344 template <class T>
1345 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1346 {
1347 static const struct {
1348 const char* name;
1349 element_t code;
1350 } mapping[] = {
1351 { "a", element_t::a, },
1352 { "abbr", element_t::abbr, },
1353 { "acronym", element_t::acronym, },
1354 { "address", element_t::address, },
1355 { "applet", element_t::applet, },
1356 { "area", element_t::area, },
1357 { "b", element_t::b, },
1358 { "base", element_t::base, },
1359 { "basefont", element_t::basefont, },
1360 { "bdo", element_t::bdo, },
1361 { "bgsound", element_t::bgsound, },
1362 { "big", element_t::big, },
1363 { "blink", element_t::blink, },
1364 { "blockquote", element_t::blockquote, },
1365 { "body", element_t::body, },
1366 { "br", element_t::br, },
1367 { "button", element_t::button, },
1368 { "caption", element_t::caption, },
1369 { "center", element_t::center, },
1370 { "cite", element_t::cite, },
1371 { "code", element_t::code, },
1372 { "col", element_t::col, },
1373 { "colgroup", element_t::colgroup, },
1374 { "comment", element_t::comment, },
1375 { "dd", element_t::dd, },
1376 { "del", element_t::del, },
1377 { "dfn", element_t::dfn, },
1378 { "dir", element_t::dir, },
1379 { "div", element_t::div, },
1380 { "dl", element_t::dl, },
1381 { "dt", element_t::dt, },
1382 { "em", element_t::em, },
1383 { "embed", element_t::embed, },
1384 { "fieldset", element_t::fieldset, },
1385 { "font", element_t::font, },
1386 { "form", element_t::form, },
1387 { "frame", element_t::frame, },
1388 { "frameset", element_t::frameset, },
1389 { "h1", element_t::h1, },
1390 { "h2", element_t::h2, },
1391 { "h3", element_t::h3, },
1392 { "h4", element_t::h4, },
1393 { "h5", element_t::h5, },
1394 { "h6", element_t::h6, },
1395 { "head", element_t::head, },
1396 { "hr", element_t::hr, },
1397 { "html", element_t::html, },
1398 { "i", element_t::i, },
1399 { "iframe", element_t::iframe, },
1400 { "img", element_t::img, },
1401 { "input", element_t::input, },
1402 { "ins", element_t::ins, },
1403 { "isindex", element_t::isindex, },
1404 { "kbd", element_t::kbd, },
1405 { "label", element_t::label, },
1406 { "legend", element_t::legend, },
1407 { "li", element_t::li, },
1408 { "link", element_t::link, },
1409 { "listing", element_t::listing, },
1410 { "map", element_t::map, },
1411 { "marquee", element_t::marquee, },
1412 { "menu", element_t::menu, },
1413 { "meta", element_t::meta, },
1414 { "nextid", element_t::nextid, },
1415 { "nobr", element_t::nobr, },
1416 { "noembed", element_t::noembed, },
1417 { "noframes", element_t::noframes, },
1418 { "noscript", element_t::noscript, },
1419 { "object", element_t::object, },
1420 { "ol", element_t::ol, },
1421 { "optgroup", element_t::optgroup, },
1422 { "option", element_t::option, },
1423 { "p", element_t::p, },
1424 { "param", element_t::param, },
1425 { "plaintext", element_t::plaintext, },
1426 { "pre", element_t::pre, },
1427 { "q", element_t::q, },
1428 { "rt", element_t::rt, },
1429 { "ruby", element_t::ruby, },
1430 { "s", element_t::s, },
1431 { "samp", element_t::samp, },
1432 { "script", element_t::script, },
1433 { "select", element_t::select, },
1434 { "small", element_t::small, },
1435 { "span", element_t::span, },
1436 { "strike", element_t::strike, },
1437 { "strong", element_t::strong, },
1438 { "style", element_t::style, },
1439 { "sub", element_t::sub, },
1440 { "sup", element_t::sup, },
1441 { "table", element_t::table, },
1442 { "tbody", element_t::tbody, },
1443 { "td", element_t::td, },
1444 { "textarea", element_t::textarea, },
1445 { "tfoot", element_t::tfoot, },
1446 { "th", element_t::th, },
1447 { "thead", element_t::thead, },
1448 { "title", element_t::title, },
1449 { "tr", element_t::tr, },
1450 { "tt", element_t::tt, },
1451 { "u", element_t::u, },
1452 { "ul", element_t::ul, },
1453 { "var", element_t::var, },
1454 { "wbr", element_t::wbr, },
1455 { "xmp", element_t::xmp, },
1456 };
1457#ifndef NDEBUG
1458 // The mapping table MUST be sorted and all names in lowercase.
1459 for (size_t i = 1; i < _countof(mapping); i++)
1460 stdex_assert(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1461 for (size_t i = 0; i < _countof(mapping); i++) {
1462 for (size_t j = 0; mapping[i].name[j]; j++)
1463 stdex_assert(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1464 }
1465#endif
// Binary search over the half-open table range [i, j).
1466 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1467 size_t m = (i + j) / 2;
// r < 0: table entry sorts before the name; r > 0: table entry sorts after it; r == 0: match.
1468 int r = 0;
1469 for (size_t i1 = 0, i2 = 0;;) {
1470 if (!mapping[m].name[i1]) {
// Table entry exhausted: a match only if the input name is exhausted too.
1471 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1472 break;
1473 }
1474 if (i2 >= num_chars || !name[i2]) {
// Input name is a proper prefix of the table entry.
1475 r = 1;
1476 break;
1477 }
1478
// The input is lowercased before comparing; the table is already lowercase.
1479 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1480 if (mapping[m].name[i1] > chr) {
1481 r = 1;
1482 break;
1483 }
1484 if (mapping[m].name[i1] < chr) {
1485 r = -1;
1486 break;
1487 }
1488 i1++;
1489 }
1490
1491 if (r < 0)
1492 i = m + 1;
1493 else if (r > 0)
1494 j = m;
1495 else
1496 return mapping[m].code;
1497 }
1498 return element_t::unknown;
1499 }
1500
1501 public:
// Element code resolved from the tag name by element_code()
1502 element_t code;
// NOTE(review): the listing skips number 1503 here; the declaration of the `name` member that the
// constructor initializes is not visible in this chunk.
// Attributes moved out of the parsed tag
1504 std::vector<stdex::parser::html_attribute> attributes;
1505 };
1506
1507 class element_end;
1508
// Start tag of an element; additionally records the sequence that ends the element.
1512 class element_start : public element
1513 {
1514 public:
// Forwards tag, source buffer and parent to element and stores `_end` (may be nullptr) in `end`.
1515 template <class T>
1516 element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1517 element(std::move(tag), src, parent),
1518 end(_end)
1519 {}
1520
1521 public:
// NOTE(review): the listing skips number 1522 here; the declaration of the `end` member that the
// constructor initializes is not visible in this chunk.
1523 };
1524
// End tag of an element; records the element code and the matching start tag.
1528 class element_end : public sequence
1529 {
1530 public:
// Resolves the element code from the tag name inside `src`, moves the name out of `tag`
// and stores `_start` (may be nullptr) in `start`.
1531 template <class T>
1532 element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1533 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1534 code(element::element_code(src + tag.name.start, tag.name.size())),
1535 name(std::move(tag.name)),
1536 start(_start)
1537 {}
1538
1539 public:
// Element code resolved from the tag name
1540 element_t code;
// NOTE(review): the listing skips numbers 1541-1542 here; the declarations of the `name` and
// `start` members that the constructor initializes are not visible in this chunk.
1543 };
1544
// Declaration sequence; keeps the name and attributes of the parsed tag.
1548 class declaration : public sequence
1549 {
1550 public:
// Takes type and interval from `tag` and moves its name and attributes into this object.
1551 template <class T>
1552 declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1553 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1554 name(std::move(tag.name)),
1555 attributes(std::move(tag.attributes))
1556 {}
1557
1558 public:
// NOTE(review): the listing skips number 1559 here; the declaration of the `name` member that the
// constructor initializes is not visible in this chunk.
// Attributes moved out of the parsed tag
1560 std::vector<stdex::parser::html_attribute> attributes;
1561 };
1562
// Comment sequence; `content` is initialized from the parsed tag's `name` field.
1566 class comment : public sequence
1567 {
1568 public:
// Takes type and interval from `tag` and moves tag.name into `content`.
1569 template <class T>
1570 comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1571 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1572 content(std::move(tag.name))
1573 {}
1574
1575 public:
// NOTE(review): the listing skips number 1576 here; the declaration of the `content` member that
// the constructor initializes is not visible in this chunk.
1577 };
1578
// Instruction sequence; `content` is initialized from the parsed tag's `name` field.
1582 class instruction : public sequence
1583 {
1584 public:
// Takes type and interval from `tag` and moves tag.name into `content`.
1585 template <class T>
1586 instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1587 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1588 content(std::move(tag.name))
1589 {}
1590
1591 public:
// NOTE(review): the listing skips number 1592 here; the declaration of the `content` member that
// the constructor initializes is not visible in this chunk.
1593 };
1594
// Entity record holding a replacement value string.
1598 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1599 struct entity
1600 {
// NOTE(review): the listing skips number 1601 here; a `name` member is used elsewhere in this file
// (e->name) but its declaration is not visible in this chunk.
// Replacement text of the entity
1602 std::basic_string<T, TR, AX> value;
1603 };
1604
1608 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1609 class parser;
1610
1614 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1616 {
1617 public:
// Constructs an empty document: nothing parsed yet, system charset, no CDATA/RCDATA section open.
// NOTE(review): the listing skips numbers 1623-1624 and 1629; further member initializers at those
// positions are not visible in this chunk.
1618 document() :
1619 m_num_parsed(0),
1620 m_charset(stdex::charset_id::system),
1621
1622 // Declaration parsing data
1625 m_is_cdata(false),
1626 m_is_rcdata(false),
1627
1628 // Element parsing data
1630 {}
1631
// Resets the document to its initial state: empties the source, entities, sequences and element
// stack, and resets the parse position, charset and section/special-element flags.
1635 void clear()
1636 {
1637 m_source.clear();
1638 m_num_parsed = 0;
1639 m_charset = stdex::charset_id::system;
1640
1641 // Declaration parsing data
// NOTE(review): the listing skips number 1642 here; a statement at that position is not visible.
1643 m_is_cdata = m_is_rcdata = false;
1644 m_entities.clear();
1645
1646 // Element parsing data
1647 m_sequences.clear();
1648
1649 m_element_stack.clear();
1650 m_is_special_element = false;
1651 }
1652
// Appends source text to the document and parses as much of the accumulated text as possible.
// At most num_chars characters of `source` are taken, stopping at the first zero (strnlen).
// Parsing resumes at m_num_parsed; recognized sequences are appended to m_sequences.
// Throws std::invalid_argument for an unknown tag type; replace_entities() may throw
// std::runtime_error for an undefined entity.
// NOTE(review): the embedded listing numbers skip several lines inside this function (1677, 1679,
// 1690, 1701-1702, 1704, 1706, 1712, 1788). Statements at those positions are not visible in this
// chunk, so some branches below appear without their opening condition or body.
1656 void append(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1657 {
1658 stdex_assert(source || !num_chars);
1659 m_source.append(source, stdex::strnlen(source, num_chars));
// From here on, `source`/`num_chars` refer to the whole accumulated document text.
1660 source = m_source.data();
1661 num_chars = m_source.size();
1662
1663 for (size_t i = m_num_parsed; i < num_chars;) {
// Inside a CDATA/RCDATA section only the section end is looked for; the text before it
// becomes a single CDATA or PCDATA sequence.
1664 if (m_is_cdata || m_is_rcdata) {
1665 if (m_condition_end.match(source, i, num_chars)) {
1666 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1667 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1668 m_num_parsed, i,
1669 active_element()))));
1670 m_is_cdata = m_is_rcdata = false;
1671 i = m_num_parsed = m_condition_end.interval.end;
1672 continue;
1673 }
1674 goto next_char;
1675 }
1676
// NOTE(review): lines 1677 and 1679 are missing; the condition that opens the block closed at
// 1684 is not visible. The visible part skips to the section end without storing a sequence.
1678 if (m_condition_end.match(source, i, num_chars)) {
1680 i = m_num_parsed = m_condition_end.interval.end;
1681 continue;
1682 }
1683 goto next_char;
1684 }
1685
// A section end while m_num_valid_conditions is non-zero: flush pending text first.
1686 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1687 if (m_num_parsed < i)
1688 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1689
1691 i = m_num_parsed = m_condition_end.interval.end;
1692 continue;
1693 }
1694
// A section start: expand entities in the condition keyword, then classify it.
1695 if (m_condition_start.match(source, i, num_chars)) {
1696 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1697 if (stdex::strncmp(condition_src.data(), condition_src.size(), "CDATA", SIZE_MAX) == 0)
1698 m_is_cdata = true;
1699 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "RCDATA", SIZE_MAX) == 0)
1700 m_is_rcdata = true;
// NOTE(review): lines 1701-1702, 1704 and 1706 are missing; the bodies of the remaining
// branches are not visible.
1703 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "IGNORE", SIZE_MAX) == 0)
1705 else
1707
1708 i = m_num_parsed = m_condition_start.interval.end;
1709 continue;
1710 }
1711
// NOTE(review): line 1712 is missing; the condition that opens the block closed at 1730 is not
// visible. Inside it, only an end tag whose code equals the active element's code is accepted;
// it flushes pending text, closes the active element and clears m_is_special_element.
1713 auto parent = active_element();
1714 stdex_assert(parent);
1715 if (m_tag.match(source, i, num_chars) &&
1716 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1717 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1718 {
1719 if (m_num_parsed < i)
1720 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1721 i = m_num_parsed = m_tag.interval.end;
1722 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1723 parent->end = e.get();
1724 m_sequences.push_back(std::move(e));
1725 m_element_stack.pop_back();
1726 m_is_special_element = false;
1727 continue;
1728 }
1729 goto next_char;
1730 }
1731
// A tag at the current position: flush the text before it, then dispatch on the tag type.
1732 if (m_tag.match(source, i, num_chars)) {
1733 if (m_num_parsed < i)
1734 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1735 i = m_num_parsed = m_tag.interval.end;
1736
1737 switch (m_tag.type) {
1738 case stdex::parser::html_sequence_t::element:
1739 case stdex::parser::html_sequence_t::element_start: {
1740 std::unique_ptr<element> e(
1741 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1742 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1743 nullptr);
1744
1745 // Does this tag end any of the started elements?
// Walk the open-element stack from the innermost outwards. An open element that may not
// contain the new one is ended by it and dropped from the stack; the first one that may
// contain it becomes its parent.
1746 for (size_t j = m_element_stack.size(); j--; ) {
1747 auto starting_tag = m_element_stack[j];
1748 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1749 if (element_traits::may_contain(starting_tag->code, e->code)) {
1750 e->parent = starting_tag;
1751 break;
1752 }
1753 e->parent = starting_tag->parent;
1754 starting_tag->end = e.get();
1755 m_element_stack.resize(j);
1756 }
1757
// A start tag with an immediate span ends at itself; any other start tag is pushed on the
// open-element stack, and code/comment/script/style set m_is_special_element.
1758 if (e->type == stdex::parser::html_sequence_t::element_start) {
1759 auto e_start = static_cast<element_start*>(e.get());
1760 if (element_traits::span(e->code) == element_span_t::immediate)
1761 e_start->end = e.get();
1762 else {
1763 m_element_stack.push_back(e_start);
1764 switch (e->code) {
1765 case element_t::code:
1766 case element_t::comment:
1767 case element_t::script:
1768 case element_t::style:
1769 m_is_special_element = true;
1770 break;
1771 default:;
1772 }
1773 }
1774 }
1775
// While the charset is still the system default, look for a <meta> element with
// http-equiv="content-type" and a content attribute, and take the charset from it.
1776 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1777 bool is_content_type = false;
1778 stdex::parser::html_attribute* content_attr = nullptr;
1779 for (auto& attr : e->attributes) {
1780 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) &&
1781 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX))
1782 is_content_type = true;
1783 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX))
1784 content_attr = &attr;
1785 }
1786 if (is_content_type && content_attr) {
1787 // <meta http-equiv="Content-Type" content="..."> found.
// NOTE(review): line 1788 is missing; the declaration of `content` used below is not visible.
1789 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1790 content.charset)
1791 {
// Copy the charset name into a narrow string, character by character.
1792 std::string str;
1793 str.reserve(content.charset.size());
1794 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1795 str.push_back(static_cast<char>(source[j]));
1796 m_charset = stdex::charset_from_name(str);
1797 }
1798 }
1799 }
1800
1801 m_sequences.push_back(std::move(e));
1802 break;
1803 }
1804 case stdex::parser::html_sequence_t::element_end: {
1805 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1806
// Find the innermost open element with the same code (for unknown elements: the same name,
// case-insensitively). It and everything opened after it is dropped from the stack.
1807 for (size_t j = m_element_stack.size(); j--; ) {
1808 auto starting_tag = m_element_stack[j];
1809 stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1810 if (starting_tag->code == e->code ||
1811 (starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size())))
1812 {
1813 e->start = starting_tag;
1814 e->parent = starting_tag->parent;
1815 starting_tag->end = e.get();
1816 m_element_stack.resize(j);
1817 break;
1818 }
1819 }
1820
1821 m_sequences.push_back(std::move(e));
1822 break;
1823 }
1824 case stdex::parser::html_sequence_t::declaration:
// A declaration with more than three attributes whose first one is "entity" is checked for
// the form `% name value`, where value is neither SYSTEM nor PUBLIC; such an entity is stored.
1825 if (m_tag.attributes.size() > 3 &&
1826 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX))
1827 {
1828 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1829 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1830 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1831 {
1832 std::unique_ptr<entity<T, TR, AX>> e(new entity<T, TR, AX>());
1833 e->name = m_tag.attributes[2].name;
1834 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1835 m_entities.push_back(std::move(e));
1836 }
1837
1838 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1839 }
1840 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1841 break;
1842 case stdex::parser::html_sequence_t::comment:
1843 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1844 break;
1845 case stdex::parser::html_sequence_t::instruction:
1846 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1847 break;
1848 default:
1849 throw std::invalid_argument("unknown tag type");
1850 }
1851
1852 continue;
1853 }
1854
1855 next_char:
1856 if (m_any_char.match(source, i, num_chars)) {
1857 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1858 i = m_any_char.interval.end;
1859 }
1860 else
// No character could be matched at this position: stop and leave the rest unparsed.
1861 break;
1862 }
1863 }
1864
// Body of the function that assign() calls as finalize(): stores any source text after
// m_num_parsed as a trailing text sequence, marks the whole source as parsed and clears the
// open-element stack.
// NOTE(review): the listing skips the lines before 1869; the function signature is not visible in
// this chunk.
1869 {
1870 size_t i = m_source.size();
1871 if (m_num_parsed < i)
1872 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1873 m_num_parsed = i;
1874 m_element_stack.clear();
1875 }
1876
1880 void assign(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1881 {
1882 clear();
1883 append(source, num_chars);
1884 finalize();
1885 }
1886
1890 const std::basic_string<T, TR, AX>& source() const { return m_source; }
1891
1892 friend class parser<T, TR, AX>;
1893
1894 protected:
// Body of the function called elsewhere as active_element(): returns the top of the open-element
// stack, or nullptr when the stack is empty.
// NOTE(review): the listing skips the lines before 1899; the function signature is not visible in
// this chunk.
1899 {
1900 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1901 }
1902
1906 std::basic_string<T, TR, AX> replace_entities(_In_reads_or_z_opt_(num_chars) const T* input, _In_ size_t num_chars) const
1907 {
1908 stdex_assert(input || !num_chars);
1909 const size_t num_entities = m_entities.size();
1910 const T* source = m_source.data();
1911 std::basic_string<T, TR, AX> output;
1912 for (size_t i = 0; i < num_chars && input[i];) {
1913 if (input[i] == '%') {
1914 for (size_t j = 0; j < num_entities; j++) {
1915 auto& e = m_entities[j];
1916 size_t entity_size = e->name.size();
1917 if (i + entity_size + 1 < num_chars &&
1918 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1919 input[i + entity_size + 1] == ';')
1920 {
1921 output += e->value;
1922 i += entity_size + 2;
1923 goto next_char;
1924 }
1925 }
1926 throw std::runtime_error("undefined entity");
1927 }
1928 output += input[i++];
1929 next_char:;
1930 }
1931 return output;
1932 }
1933
// Document state.
// NOTE(review): the listing skips numbers 1936, 1940-1946, 1950 and 1953; declarations of other
// members used by this class (e.g. m_num_parsed, m_is_cdata, m_is_rcdata, m_is_special_element,
// the matcher objects) are not visible in this chunk.
1934 protected:
// Accumulated document source text
1935 std::basic_string<T, TR, AX> m_source;
// Document charset; set from a <meta> content-type element by append()
1937 stdex::charset_id m_charset;
1938
1939 // Declaration parsing data
// Entities collected from entity declarations
1947 std::vector<std::unique_ptr<entity<T, TR, AX>>> m_entities;
1948
1949 // Element parsing data
// Parsed sequences, in the order they were appended
1951 sequence_store m_sequences;
// Start tags of elements that are currently open, innermost last
1952 std::vector<element_start*> m_element_stack;
1954 };
1955
/// Token types
enum class token_t {
    root     = 0,
    complete = 1,
    starting = 2,
    ending   = 3,
    url      = 4,
};
1966
/// Maximum size of a token tag, in characters
constexpr size_t token_tag_max =
    2 * sizeof(void*) + // Memory address in hexadecimal
    2 +                 // Leading and trailing marker character
    1;                  // Zero terminator

/// Character that starts a token tag
constexpr char token_tag_start{'\x12'};

/// Character that ends a token tag
constexpr char token_tag_end{'\x13'};
1986
// Base class of all parser tokens. A token can be referenced from text by a "tag": token_tag_start,
// the token's memory address in hexadecimal, token_tag_end.
1990 class token
1991 {
1992 protected:
// Protected: tokens are constructed by parser (friend, below) and by derived classes.
1993 token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1994 type(_type),
1995 sequence(_sequence),
1996 data(_data)
1997 {}
1998
1999 template<class T, class TR, class AX>
2000 friend class parser;
2001
2002 public:
2003 virtual ~token() {} // make polymorphic
2004
// Appends this token's tag to `str` and returns the number of characters added
// (difference of str.size() after and before).
2012 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
2013 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str) const
2014 {
2015 size_t n = str.size();
2016 // Use %X instead of %p to omit leading zeros and save space.
2017 stdex::appendf(str, "%c%zX%c", stdex::locale_C, token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
2018 return str.size() - n;
2019 }
2020
// Wide-character variant. Returns the result of stdex::appendf directly.
// NOTE(review): presumably that is the number of characters appended, matching the char overload — confirm.
2028 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
2029 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str) const
2030 {
2031 // Use %X instead of %p to omit leading zeros and save space.
2032 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C, static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
2033 }
2034
// Parses a token tag at str[offset].
// Returns nullptr and leaves `offset` untouched when str[offset] is not token_tag_start or the
// string ends before token_tag_end. Otherwise converts the hexadecimal text between the markers
// back to a token pointer and advances `offset` past the tag.
// Throws std::invalid_argument when the parsed address is zero.
// The pointer is not validated in any other way.
2035 template<class T>
2036 static token* parse_tag(const T* str, size_t& offset)
2037 {
2038 if (str[offset] != static_cast<T>(token_tag_start))
2039 return nullptr;
2040
2041 // Locate tag end.
2042 size_t end;
2043 for (end = offset + 1; ; end++) {
2044 if (!str[end])
2045 return nullptr;
2046 if (str[end] == token_tag_end)
2047 break;
2048 }
2049
2050 // Parse hexadecimal token memory address.
2051 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
2052 if (!t)
2053 throw std::invalid_argument("null token");
2054 offset = end + 1;
2055 return t;
2056 }
2057
2058 public:
// Token type
2059 token_t type;
// NOTE(review): the listing skips number 2060 here; the declaration of the `sequence` member that
// the constructor initializes is not visible in this chunk.
// Value stored from the constructor's `_data` argument
2061 uintptr_t data;
2062 };
2063
// Collection of owning pointers to tokens
2064 using token_vector = std::vector<std::unique_ptr<token>>;
// List of raw (non-unique_ptr) token pointers
2065 using token_list = std::list<token*>;
2066
/// Bit flags stored in text_token::text_type
enum text_type_flag_t : uint32_t {
    has_tokens = 0x1, // bit 0
    has_text   = 0x2, // bit 1
    is_title   = 0x4, // bit 2
    is_bullet  = 0x8, // bit 3
};
2076
// Token that carries text, text type flags (see text_type_flag_t) and a position mapping.
2080 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2081 class text_token : public token
2082 {
2083 protected:
// Protected: constructed by parser (friend, below) and by derived classes.
// Copies num_chars characters of `_text` into `text` and stores `_text_type`.
2084 text_token(
2085 _In_ token_t type = token_t::complete,
2086 _In_reads_or_z_opt_(num_chars) const T* _text = nullptr, _In_ size_t num_chars = 0,
2087 _In_ uint32_t _text_type = 0,
2088 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
// NOTE(review): the listing skips number 2089 here; the first entry of the initializer list
// (presumably the `token` base initializer taking type, sequence and data) is not visible.
2090 text(_text, num_chars),
2091 text_type(_text_type)
2092 {}
2093
2094 friend class parser<T, TR, AX>;
2095
2096 public:
// Token text
2097 std::basic_string<T, TR, AX> text;
// Combination of text_type_flag_t bits
2098 uint32_t text_type;
// Position mapping; the parser passes it to stdex::sgml2strcat when building `text`
2099 stdex::mapping_vector<size_t> mapping;
2100 };
2101
// Text token of type token_t::starting; additionally keeps the element name and the end sequence.
2105 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2106 class starting_token : public text_token<T, TR, AX>
2107 {
2108 protected:
// NOTE(review): the listing skips number 2109 here; the line opening the constructor declaration
// is not visible, and the parameter list below continues from it.
2110 _In_reads_or_z_opt_(num_chars_text) const T* _text = nullptr, _In_ size_t num_chars_text = 0,
2111 _In_reads_or_z_opt_(num_chars_name) const T* _name = nullptr, _In_ size_t num_chars_name = 0,
2112 _In_ uint32_t text_type = 0,
2113 _In_opt_ stdex::html::sequence* sequence = nullptr,
2114 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
2115 _In_ uintptr_t data = 0) :
2116 text_token<T, TR, AX>(token_t::starting, _text, num_chars_text, text_type, sequence, data),
2117 name(_name, num_chars_name),
2118 end_sequence(_end_sequence)
2119 {}
2120
2121 friend class parser<T, TR, AX>;
2122
2123 public:
// Name copied from `_name`; parser::end_tokens() uses it to build the closing "</name>" text
2124 std::basic_string<T, TR, AX> name;
// NOTE(review): the listing skips number 2125 here; the declaration of the `end_sequence` member
// that the constructor initializes is not visible in this chunk.
2126 };
2127
/// Encoding scheme of the URL stored in a URL token
enum class token_url_t {
    /// URL is not using any particular encoding scheme (as-is)
    plain = 0,
    /// URL is encoded using SGML entities
    sgml = 1,
    /// URL is encoded using CSS escaping scheme
    css = 2,
};
2136
2140 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2141 class url_token : public token
2142 {
2143 protected:
2144 url_token(
2145 _In_reads_or_z_opt_(num_chars) const T* _url = nullptr, _In_ size_t num_chars = 0,
2146 token_url_t _encoding = token_url_t::plain,
2147 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2148 token(token_t::url, sequence, data),
2149 url(_url, num_chars),
2150 encoding(_encoding)
2151 {}
2152
2153 friend class parser<T, TR, AX>;
2154
2155 public:
2156 std::basic_string<T, TR, AX> url;
2157 token_url_t encoding;
2158 };
2159
// Remainder of the type named `inserted_token` by the alias below.
// NOTE(review): the listing skips the lines before 2165 and number 2167; the type's header and the
// `token` and `after_word` members that parser::append_inserted_tokens() reads are not visible in
// this chunk.
// Token list passed to end_tokens()/start_tokens() when this token is inserted
2165 std::list<stdex::html::token*> active_tokens;
// Word index; compared against the word_index argument of parser::append_inserted_tokens()
2166 size_t word_index;
2168 };
2169
// List of inserted_token records
2170 using inserted_token_list = std::list<inserted_token>;
2171
2172 template<class T, class TR, class AX>
2174 {
2175 public:
// Constructs a parser for the given document.
// `url` is copied up to num_chars characters or the first zero (strnlen).
// NOTE(review): the listing skips numbers 2180 and 2183; member initializers at those positions
// (presumably for the document and progress members used by parse()) are not visible.
2176 parser(
2177 _In_ const document<T, TR, AX>& document,
2178 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
2179 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
2181 m_url(url, stdex::strnlen(url, num_chars)),
2182 m_parse_frames(parse_frames),
2184 m_source(nullptr)
2185 {}
2186
// Body of the parse entry point: sets up progress reporting over the document source, caches the
// source pointer and parses all document sequences via parse(end).
// Must run on a parser that holds no tokens yet (asserted).
// NOTE(review): the listing skips the lines before 2191 and number 2200; the function signature and
// one statement (presumably the one that positions m_offset) are not visible in this chunk.
2191 {
2192 stdex_assert(m_tokens.empty());
2193
2194 if (m_progress) {
2195 m_progress->set_range(0, m_document.source().size());
2196 m_progress->set(0);
2197 }
2198
2199 m_source = m_document.source().data();
2201 return parse(m_document.m_sequences.end());
2202 }
2203
2210 static void link(_Inout_ std::basic_string<T, TR, AX>& source, _In_ const text_token<T, TR, AX>* t)
2211 {
2212 stdex_assert(t);
2213 stdex_assert(
2214 t->type == token_t::complete ||
2215 t->type == token_t::starting ||
2216 t->type == token_t::ending ||
2217 t->type == token_t::root);
2218
2219 if (t->text_type & has_tokens) {
2220 const T* root = t->text.data();
2221 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2222 stdex_assert(root[i] != token_tag_end);
2223 const token* t2 = token::parse_tag(root, i);
2224 if (t2) {
2225 switch (t2->type) {
2226 case token_t::complete:
2227 case token_t::starting:
2228 case token_t::ending:
2229 case token_t::root:
2230 link(source, dynamic_cast<const text_token<T, TR, AX>*>(t2));
2231 break;
2232 case token_t::url: {
2233 auto t2_url = dynamic_cast<const url_token<T, TR, AX>*>(t2);
2234 switch (t2_url->encoding) {
2235 case token_url_t::plain:
2236 source += t2_url->url;
2237 break;
2238 case token_url_t::sgml:
2239 escape(source, t2_url->url.data(), t2_url->url.size());
2240 break;
2241 case token_url_t::css:
2242 css_escape(source, t2_url->url.data(), t2_url->url.size());
2243 break;
2244 default:
2245 throw std::invalid_argument("unsupported URL encoding");
2246 }
2247 break;
2248 }
2249 default:
2250 throw std::invalid_argument("unsupported token type");
2251 }
2252 }
2253 else if (t->text_type & has_text) {
2254 escape_min(source, root[i]);
2255 i++;
2256 }
2257 else
2258 source += root[i++];
2259 }
2260 }
2261 else if (t->text_type & has_text) {
2262 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2263 escape_min(source, t->text.data(), t->text.size());
2264 }
2265 else
2266 source += t->text;
2267 }
2268
2277 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2278 {
2279 for (; from != new_tokens.cend(); ++from) {
2280 auto t = *from;
2281 t->append_tag(source);
2282 active_tokens.push_back(t);
2283 }
2284 }
2285
// Ends the active tokens that are not a common prefix of active_tokens and new_tokens.
// Walks both lists in parallel. At the first position where they differ (or new_tokens runs out),
// every active token from the back down to that position is closed in reverse order: an ending
// text token "</name>" is created, stored via append_token() (its tag goes to `source`), and the
// active token is erased.
// Returns the position in new_tokens after the common prefix.
2295 token_list::const_iterator end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2296 {
2297 // Skip matching tokens in active_tokens and new_tokens.
2298 token_list::const_iterator i1, i2;
2299 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2300 if (i2 == new_tokens.cend() || *i1 != *i2) {
2301 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2302 // End tokens not relevant anymore in reverse order of starting.
2303 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
// Every active token is expected to be a starting_token (asserted below).
2304 auto t1 = dynamic_cast<starting_token<T, TR, AX>*>(*(--i));
2305 stdex_assert(t1 && t1->type == token_t::starting);
2306
2307 std::unique_ptr<text_token<T, TR, AX>> t2(new text_token<T, TR, AX>(token_t::ending));
// "</" + name + ">" needs name.size() + 3 characters.
2308 t2->text.reserve(t1->name.size() + 3);
2309 t2->text += '<';
2310 t2->text += '/';
2311 t2->text += t1->name;
2312 t2->text += '>';
2313 append_token(std::move(t2), source);
2314
2315 // Pop the active token.
// i1 must be compared before erasing: erasing the element i1 points at invalidates i1,
// which is why both loops are left immediately afterwards.
2316 if (i1 == i) {
2317 active_tokens.erase(i);
2318 break;
2319 }
2320 active_tokens.erase(i);
// Restart from the new back of the list.
2321 i = active_tokens.cend();
2322 }
2323 break;
2324 }
2325 }
2326 return i2;
2327 }
2328
2338 void append_inserted_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ inserted_token_list& inserted_tokens,
2339 _In_ size_t word_index, _In_ bool after_word,
2340 _Inout_ token_list& active_tokens)
2341 {
2342 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2343 auto& t = *i;
2344 stdex_assert(t.token);
2345 if (t.word_index == word_index && t.after_word == after_word) {
2346 if (t.token->type != token_t::ending)
2347 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2348 t.token->append_tag(source);
2349 inserted_tokens.erase(i++);
2350 }
2351 else
2352 ++i;
2353 }
2354 }
2355
2362 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2363 {
2364 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2365 auto t2 = *i2;
2366 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2367 if (i1 == a.end()) {
2368 a.push_back(t2);
2369 break;
2370 }
2371 auto t1 = *i1;
2372 if (t1 == t2)
2373 break;
2374 }
2375 }
2376 }
2377
2381 void make_absolute_url(std::basic_string<T, TR, AX>& rel)
2382 {
2383 _Unreferenced_(rel);
2384
2385 if (m_url.empty())
2386 return;
2387
2388 // TODO: Implement!
2389 }
2390
2394 const token_vector& tokens() const { return m_tokens; }
2395
2396 protected:
2404 template <class T_token>
2405 T_token* append_token(_Inout_ std::unique_ptr<T_token>&& token)
2406 {
2407 if (!token)
2408 return nullptr;
2409 auto t = token.get();
2410 m_tokens.push_back(std::move(token));
2411 return t;
2412 }
2413
2422 template <class T_token>
2423 size_t append_token(_Inout_ std::unique_ptr<T_token>&& token, _Inout_ std::basic_string<T, TR, AX>& source)
2424 {
2425 if (!token)
2426 return 0;
2427 size_t n = token->append_tag(source);
2428 m_tokens.push_back(std::move(token));
2429 return n;
2430 }
2431
2440 text_token<T, TR, AX>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2441 {
2443 std::unique_ptr<text_token<T, TR, AX>> token(new text_token<T, TR, AX>(
2444 token_t::complete,
2445 nullptr, 0,
2446 text_type,
2447 m_offset != end ? m_offset->get() : nullptr));
2448
2449 while (m_offset != end) {
2450 auto& s = *m_offset;
2451
2452 if (m_progress) {
2453 if (m_progress->cancel())
2454 throw stdex::user_cancelled();
2455 m_progress->set(s->interval.start);
2456 }
2457
2458 // No token_tag_start and token_tag_end chars, please.
2459 stdex_assert(
2460 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_start)) == stdex::npos &&
2461 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_end)) == stdex::npos);
2462
2463 if (s->type == stdex::parser::html_sequence_t::text) {
2464 rel.from = s->interval.start;
2465 token->mapping.push_back(rel);
2466 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2467 rel.to = token->text.size();
2468 if (!(token->text_type & has_text) &&
2469 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2470 token->text_type |= has_text;
2471 ++m_offset;
2472 }
2473 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2474 const element* s_el = static_cast<const element*>(s.get());
2475 stdex_assert(s_el);
2476 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2477 if (s_el->code == element_t::frameset && !m_parse_frames)
2478 throw std::invalid_argument("<frameset> detected");
2479
2480 {
2481 size_t offset = s->interval.start;
2482 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2483 new text_token<T, TR, AX>(token_t::complete, nullptr, 0, 0, s.get()) :
2484 new starting_token<T, TR, AX>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2485
2486 // Copy the tag contents, but mind any attributes containing localizable text.
2487 for (auto& a : s_el->attributes) {
2488 if (a.value.empty() ||
2489 stdex::isblank(m_source + a.value.start, a.value.size()))
2490 continue;
2491
2492 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
2493 t->text.append(m_source + offset, a.value.start - offset);
2494 std::unique_ptr<url_token<T, TR, AX>> t_url(new url_token<T, TR, AX>(
2495 nullptr, 0,
2496 token_url_t::sgml,
2497 s.get()));
2498 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2499 append_token(std::move(t_url), t->text);
2500 t->text_type |= has_tokens;
2501 offset = a.value.end;
2502 }
2503 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
2504 t->text.append(m_source + offset, a.value.start - offset);
2505 std::unique_ptr<text_token<T, TR, AX>> t_value(new text_token<T, TR, AX>(
2506 token_t::complete,
2507 nullptr, 0,
2508 has_text | is_title,
2509 s.get()));
2510 stdex::mapping<size_t> rel_value(a.value.start, 0);
2511 t_value->mapping.push_back(rel_value);
2512 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2513 append_token(std::move(t_value), t->text);
2514 t->text_type |= has_tokens;
2515 offset = a.value.end;
2516 }
2517 }
2518
2519 t->text.append(m_source + offset, s->interval.end - offset);
2520 rel.from = s->interval.start;
2521 token->mapping.push_back(rel);
2522 rel.to += append_token(std::move(t), token->text);
2523 token->text_type |= has_tokens;
2524 }
2525 ++m_offset;
2526
2527 if (s_el_start) {
2528 if (s_el_start->code == element_t::address ||
2529 s_el_start->code == element_t::code ||
2530 s_el_start->code == element_t::comment ||
2531 s_el_start->code == element_t::cite ||
2532 s_el_start->code == element_t::kbd ||
2533 s_el_start->code == element_t::samp ||
2534 s_el_start->code == element_t::script ||
2535 s_el_start->code == element_t::style)
2536 {
2537 // Non-localizable
2538 auto s_end = s_el_start->end;
2539 stdex_assert(s_end);
2540
2541 if (s->interval.end < s_end->interval.start) {
2542 if (s_el_start->code != element_t::style) {
2543 rel.from = s->interval.start;
2544 token->mapping.push_back(rel);
2545 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2547 token_t::complete,
2548 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2549 0,
2550 m_offset->get()))),
2551 token->text);
2552 }
2553 else {
2554 // Partially parse CSS. It may contain URLs we need to make absolute.
2555 auto t = parse_css(s->interval.end, s_end->interval.start);
2556 stdex_assert(t);
2557 rel.from = s->interval.start;
2558 token->mapping.push_back(rel);
2559 rel.to += t->append_tag(token->text);
2560 }
2561 token->text_type |= has_tokens;
2562 }
2563 while (m_offset != end && m_offset->get() != s_end)
2564 ++m_offset;
2565 }
2566 else if (element_traits::is_group(s_el_start->code)) {
2567 auto limit = m_offset;
2568 while (limit != end && limit->get() != s_el_start->end)
2569 ++limit;
2570 auto t = parse(limit,
2571 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2572 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2573 rel.from = s->interval.start;
2574 token->mapping.push_back(rel);
2575 rel.to += t->append_tag(token->text);
2576 token->text_type |= has_tokens;
2577 }
2578 }
2579 }
2580 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2581 rel.from = s->interval.start;
2582 token->mapping.push_back(rel);
2583 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2585 token_t::ending,
2586 m_source + s->interval.start, s->interval.size(),
2587 0,
2588 s.get()))),
2589 token->text);
2590 token->text_type |= has_tokens;
2591 ++m_offset;
2592 }
2593 else {
2594 // Declaration, instruction, (P)CDATA section, comment...
2595 rel.from = s->interval.start;
2596 token->mapping.push_back(rel);
2597 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2599 token_t::complete,
2600 m_source + s->interval.start, s->interval.size(),
2601 0,
2602 s.get()))),
2603 token->text);
2604 token->text_type |= has_tokens;
2605 ++m_offset;
2606 }
2607 }
2608
2609 return append_token(std::move(token));
2610 }
2611
2615 text_token<T, TR, AX>* parse_css(size_t start, size_t end)
2616 {
2617 stdex::interval<size_t> section, content;
2618 std::unique_ptr<text_token<T, TR, AX>> token(
2620 token_t::complete,
2621 nullptr, 0,
2622 0,
2623 m_offset->get()));
2624
2625 for (;;) {
2626 if (m_css_comment.match(m_source, start, end)) {
2627 token->text.append(m_source + start, m_css_comment.interval.end - start);
2628 start = m_css_comment.interval.end;
2629 }
2630 else if (m_css_cdo.match(m_source, start, end)) {
2631 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2632 start = m_css_cdo.interval.end;
2633 }
2634 else if (m_css_cdc.match(m_source, start, end)) {
2635 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2636 start = m_css_cdc.interval.end;
2637 }
2638 else if (
2639 (m_css_import.match(m_source, start, end) && ((void)(section = m_css_import.interval), (void)(content = m_css_import.content), true)) ||
2640 (m_css_uri.match(m_source, start, end) && ((void)(section = m_css_uri.interval), (void)(content = m_css_uri.content), true)))
2641 {
2642 std::unique_ptr<url_token<T, TR, AX>> t_url(
2644 nullptr, 0,
2645 token_url_t::css,
2646 m_offset->get()));
2647 css_unescape(t_url->url, m_source + content.start, content.size());
2648 token->text.append(m_source + start, content.start - start);
2649 append_token(std::move(t_url), token->text);
2650 token->text.append(m_source + content.end, section.end - content.end);
2651 token->text_type |= has_tokens;
2652 start = section.end;
2653 }
2654 else if (m_any_char.match(m_source, start, end)) {
2655 token->text.append(m_source + start, m_any_char.interval.end - start);
2656 start = m_any_char.interval.end;
2657 }
2658 else
2659 break;
2660 }
2661
2662 return append_token(std::move(token));
2663 }
2664
2665 protected:
2667 const stdex::sstring m_url;
2668 const bool m_parse_frames;
2670 const T* m_source;
2671 token_vector m_tokens;
2672 sequence_store::const_iterator m_offset;
2673
2674 // For detecting URLs in CSS
2682 };
2683 }
2684}
HTML comment.
Definition html.hpp:1567
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1576
HTML declaration.
Definition html.hpp:1549
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1559
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1560
HTML document.
Definition html.hpp:1616
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1943
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1890
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1656
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1940
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1941
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1942
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1937
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1951
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1898
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1936
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1952
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1868
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1906
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1880
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1953
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1947
void clear()
Empties document.
Definition html.hpp:1635
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1935
Ending tag of an HTML element </...>
Definition html.hpp:1529
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1541
element_start * start
Corresponding starting tag.
Definition html.hpp:1542
element_t code
Element code.
Definition html.hpp:1540
Starting tag of an HTML element <...>
Definition html.hpp:1513
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1522
HTML element <.../>
Definition html.hpp:1334
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1503
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1504
element_t code
Element code.
Definition html.hpp:1502
HTML instruction.
Definition html.hpp:1583
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1592
HTML parser.
Definition html.hpp:2174
token_vector m_tokens
HTML token storage.
Definition html.hpp:2671
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2338
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2440
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2667
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2190
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2666
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2295
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2362
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2615
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2277
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2210
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2405
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2672
const T * m_source
HTML source code.
Definition html.hpp:2670
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2669
const bool m_parse_frames
Parse frames.
Definition html.hpp:2668
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2381
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2423
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2394
Base class for HTML sequences.
Definition html.hpp:1315
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1318
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1317
sequence * parent
Parent sequence.
Definition html.hpp:1319
Token representing start HTML tag.
Definition html.hpp:2107
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2125
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2124
Token representing part of HTML text.
Definition html.hpp:2082
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2099
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2098
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2097
HTML token base class.
Definition html.hpp:1991
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2060
uintptr_t data
Any user-supplied data.
Definition html.hpp:2061
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2029
token_t type
Token type.
Definition html.hpp:2059
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2013
HTTP token representing a URL.
Definition html.hpp:2142
token_url_t encoding
URL encoding.
Definition html.hpp:2157
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2156
Test for any code unit.
Definition parser.hpp:216
Legacy CSS comment end -->
Definition parser.hpp:7451
Legacy CSS comment start <!--
Definition parser.hpp:7413
CSS comment.
Definition parser.hpp:7353
CSS import directive.
Definition parser.hpp:7665
CSS string.
Definition parser.hpp:7488
URI in CSS.
Definition parser.hpp:7555
End of condition ...]]>
Definition parser.hpp:8336
Start of condition <![condition[...
Definition parser.hpp:8270
Tag.
Definition parser.hpp:8034
MIME content type.
Definition parser.hpp:7749
stdex::interval< size_t > charset
charset position in source
Definition parser.hpp:7761
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:70
virtual void set(T value)
Set current progress.
Definition progress.hpp:52
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:42
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:686
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1103
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1013
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:927
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1023
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:811
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:981
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1041
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:947
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1234
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:965
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1289
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:859
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1060
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:912
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1086
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:893
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:834
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1134
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:692
HTML entity.
Definition html.hpp:1600
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1602
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1601
Inserted HTML token.
Definition html.hpp:2163
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2167
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2165
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:2166
token * token
Points to the token.
Definition html.hpp:2164
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:18
Tag attribute.
Definition parser.hpp:8024
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8026