stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "exception.hpp"
10#include "interval.hpp"
11#include "mapping.hpp"
12#include "parser.hpp"
13#include "progress.hpp"
14#include "sgml.hpp"
15#include "string.hpp"
16#include "system.hpp"
17#include "unicode.hpp"
18#include <exception>
19#include <list>
20#include <map>
21#include <memory>
22#include <stdexcept>
23#include <string_view>
24#include <string>
25#include <vector>
26
27#ifdef _WIN32
28#undef small
29#endif
30
31namespace stdex
32{
33 namespace html
34 {
///
/// Appends an HTML-escaped copy of a narrow string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and the code 0x00a0 are replaced with SGML entities;
/// every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != '\0') {
        const char ch = src[pos++];
        const char* entity = nullptr;
        switch (ch) {
        case '&': entity = "&amp;"; break;
        case ';': entity = "&semi;"; break;
        case '\"': entity = "&quot;"; break;
        case '\'': entity = "&#x27;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        case 0x00a0: entity = "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }
        if (entity)
            dst += entity;
        else
            dst += ch;
    }
}
61
///
/// Appends an HTML-escaped copy of a wide string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 are replaced with SGML entities;
/// every other character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != L'\0') {
        const wchar_t ch = src[pos++];
        const wchar_t* entity = nullptr;
        switch (ch) {
        case L'&': entity = L"&amp;"; break;
        case L';': entity = L"&semi;"; break;
        case L'\"': entity = L"&quot;"; break;
        case L'\'': entity = L"&#x27;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        case L'\u00a0': entity = L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }
        if (entity)
            dst += entity;
        else
            dst += ch;
    }
}
88
95 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
96 void escape(
97 _Inout_ std::basic_string<T, TR, AX>& dst,
98 _In_ const T (&src)[N])
99 {
100 escape(dst, src, N);
101 }
102
109 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
110 void escape(
111 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
112 _In_ const std::basic_string<T, TR_src, AX_src>& src)
113 {
114 escape(dst, src.data(), src.size());
115 }
116
///
/// Appends one narrow character to `dst`, applying minimal HTML escaping.
///
/// Only `&`, `<`, `>` and the code 0x00a0 are replaced with SGML entities.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_ char chr)
{
    const char* entity = nullptr;
    switch (chr) {
    case '&': entity = "&amp;"; break;
    case '<': entity = "&lt;"; break;
    case '>': entity = "&gt;"; break;
    case 0x00a0: entity = "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    default: break;
    }
    if (entity)
        dst += entity;
    else
        dst += chr;
}
134
///
/// Appends one wide character to `dst`, applying minimal HTML escaping.
///
/// Only `&`, `<`, `>` and U+00A0 are replaced with SGML entities.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_ wchar_t chr)
{
    const wchar_t* entity = nullptr;
    switch (chr) {
    case L'&': entity = L"&amp;"; break;
    case L'<': entity = L"&lt;"; break;
    case L'>': entity = L"&gt;"; break;
    case L'\u00a0': entity = L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    default: break;
    }
    if (entity)
        dst += entity;
    else
        dst += chr;
}
152
///
/// Appends a minimally HTML-escaped copy of a narrow string to `dst`.
///
/// Only `&`, `<`, `>` and the code 0x00a0 are replaced with SGML entities.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != '\0') {
        const char ch = src[pos++];
        const char* entity = nullptr;
        switch (ch) {
        case '&': entity = "&amp;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        case 0x00a0: entity = "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }
        if (entity)
            dst += entity;
        else
            dst += ch;
    }
}
176
///
/// Appends a minimally HTML-escaped copy of a wide string to `dst`.
///
/// Only `&`, `<`, `>` and U+00A0 are replaced with SGML entities.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != L'\0') {
        const wchar_t ch = src[pos++];
        const wchar_t* entity = nullptr;
        switch (ch) {
        case L'&': entity = L"&amp;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        case L'\u00a0': entity = L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }
        if (entity)
            dst += entity;
        else
            dst += ch;
    }
}
200
207 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
208 void escape_min(
209 _Inout_ std::basic_string<T, TR, AX>& dst,
210 _In_ const T (&src)[N])
211 {
212 escape_min(dst, src, N);
213 }
214
221 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
222 void escape_min(
223 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
224 _In_ const std::basic_string<T, TR_src, AX_src>& src)
225 {
226 escape_min(dst, src.data(), src.size());
227 }
228
///
/// Appends a URL-unescaped copy of a string to `dst`.
///
/// `+` is decoded to a space and `%XX` (two hexadecimal digits, either case) to the
/// character with that code. A `%` that is not followed by two hexadecimal digits
/// within the `num_chars` limit is copied verbatim.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_unescape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);

    // Value of a hexadecimal digit, or -1 when `c` is not one.
    const auto hex_value = [](char c) -> int {
        if ('0' <= c && c <= '9') return c - '0';
        if ('A' <= c && c <= 'F') return c - 'A' + 10;
        if ('a' <= c && c <= 'f') return c - 'a' + 10;
        return -1;
    };

    for (size_t i = 0; i < num_chars && src[i];) {
        switch (src[i]) {
        case '+':
            dst += ' '; i++;
            break;

        case '%': {
            i++;

            // Never look past `num_chars`: the source is not required to be zero terminated.
            const int hi = i < num_chars ? hex_value(src[i]) : -1;
            if (hi < 0) { dst += '%'; continue; }
            i++;

            const int lo = i < num_chars ? hex_value(src[i]) : -1;
            if (lo < 0) {
                // Incomplete escape: keep `%` and the first digit as they were.
                dst += '%'; dst += src[i - 1];
                continue;
            }
            i++;

            dst += static_cast<char>((hi << 4) | lo);
            break;
        }

        default:
            dst += src[i++];
        }
    }
}
270
277 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
278 void url_unescape(
279 _Inout_ std::basic_string<char, TR, AX>& dst,
280 _In_ const char (&src)[N])
281 {
282 url_unescape(dst, src, N);
283 }
284
291 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
292 void url_unescape(
293 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
294 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
295 {
296 url_unescape(dst, src.data(), src.size());
297 }
298
///
/// Appends a URL-escaped copy of a string to `dst`.
///
/// A space becomes `+`. The characters ``< > # % { } | \ ^ ~ [ ] ` ; / ? : @ = & $`` and
/// every byte outside the range 0x21..0x7e are written as `%XX` with uppercase
/// hexadecimal digits. All other characters are appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    constexpr char hex_digits[] = "0123456789ABCDEF";
    for (size_t pos = 0; pos < num_chars && src[pos] != '\0'; ++pos) {
        const char ch = src[pos];
        const uint8_t code = static_cast<uint8_t>(ch);
        bool verbatim;
        switch (ch) {
        case ' ':
            dst += "+";
            continue;

        // Printable characters that must always be percent-encoded.
        case '<': case '>': case '#': case '%':
        case '{': case '}': case '|': case '\\':
        case '^': case '~': case '[': case ']':
        case '`': case ';': case '/': case '?':
        case ':': case '@': case '=': case '&':
        case '$':
            verbatim = false;
            break;

        default:
            verbatim = 0x20 < code && code < 0x7f;
            break;
        }
        if (verbatim)
            dst += ch;
        else {
            dst += '%';
            dst += hex_digits[(code & 0xf0) >> 4];
            dst += hex_digits[code & 0x0f];
        }
    }
}
349
356 template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
357 void url_escape(
358 _Inout_ std::basic_string<char, TR, AX>& dst,
359 _In_ const char (&src)[N])
360 {
361 url_escape(dst, src, N);
362 }
363
370 template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
371 void url_escape(
372 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
373 _In_ const std::basic_string_view<char, std::char_traits<char>> src)
374 {
375 url_escape(dst, src.data(), src.size());
376 }
377
///
/// Appends a CSS-unescaped copy of a string to `dst`.
///
/// Recognised escapes: `\n`, `\r`, `\t`, a backslash followed by a line feed (dropped),
/// and a backslash followed by up to six hexadecimal digits with one optional trailing
/// space. Any other escaped character is appended as is. A backslash with nothing
/// after it (end of input) is dropped.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_unescape(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t i = 0; i < num_chars && src[i];) {
        if (src[i] != '\\') {
            dst += src[i++];
            continue;
        }

        // Step over the backslash. If the input ends here (length limit or zero
        // terminator), there is nothing to unescape: stop instead of spinning on the
        // same index or walking past the terminator.
        i++;
        if (i >= num_chars || !src[i])
            break;

        switch (src[i]) {
        // Classic escapes
        case 'n': dst += '\n'; i++; break;
        case 'r': dst += '\r'; i++; break;
        case 't': dst += '\t'; i++; break;

        // `\` at the end of the line
        case '\n': i++; break;

        // `\nnnn` escape
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case 'A': case 'a':
        case 'B': case 'b':
        case 'C': case 'c':
        case 'D': case 'd':
        case 'E': case 'e':
        case 'F': case 'f': {
            wchar_t chr = 0;
            // At most six hexadecimal digits belong to the escape.
            size_t end = std::min(num_chars, i + 6);

            for (; i < end; ++i) {
                if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
                else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
                else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
                else break;
            }

            dst += static_cast<T>(chr);

            if (i < end && src[i] == ' ') {
                // Skip space after `\nnnn`.
                i++;
            }
            break;
        }

        default: dst += src[i++];
        }
    }
}
447
454 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
455 void css_unescape(
456 _Inout_ std::basic_string<T, TR, AX>& dst,
457 _In_ const T (&src)[N])
458 {
459 css_unescape(dst, src, N);
460 }
461
468 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
469 void css_unescape(
470 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
471 _In_ const std::basic_string<T, TR_src, AX_src>& src)
472 {
473 css_unescape(dst, src.data(), src.size());
474 }
475
///
/// Appends a CSS-escaped copy of a narrow string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and apostrophe are written
/// as a backslash followed by `\`, `n`, `r`, `t`, `"` and `'` respectively; every other
/// character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void css_escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != '\0') {
        const char ch = src[pos++];
        const char* sequence = nullptr;
        switch (ch) {
        case '\\': sequence = "\\\\"; break;
        case '\n': sequence = "\\n"; break;
        case '\r': sequence = "\\r"; break;
        case '\t': sequence = "\\t"; break;
        case '\"': sequence = "\\\""; break;
        case '\'': sequence = "\\'"; break;
        default: break;
        }
        if (sequence)
            dst += sequence;
        else
            dst += ch;
    }
}
501
///
/// Appends a CSS-escaped copy of a wide string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and apostrophe are written
/// as a backslash followed by `\`, `n`, `r`, `t`, `"` and `'` respectively; every other
/// character is appended unchanged.
///
/// \param[in,out] dst        String to append to
/// \param[in]     src        Source string; processing stops at the first zero character
/// \param[in]     num_chars  Maximum number of characters of `src` to process
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void css_escape(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos] != L'\0') {
        const wchar_t ch = src[pos++];
        const wchar_t* sequence = nullptr;
        switch (ch) {
        case L'\\': sequence = L"\\\\"; break;
        case L'\n': sequence = L"\\n"; break;
        case L'\r': sequence = L"\\r"; break;
        case L'\t': sequence = L"\\t"; break;
        case L'\"': sequence = L"\\\""; break;
        case L'\'': sequence = L"\\'"; break;
        default: break;
        }
        if (sequence)
            dst += sequence;
        else
            dst += ch;
    }
}
527
534 template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
535 void css_escape(
536 _Inout_ std::basic_string<T, TR, AX>& dst,
537 _In_ const T (&src)[N])
538 {
539 css_escape(dst, src, N);
540 }
541
548 template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
549 void css_escape(
550 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
551 _In_ const std::basic_string<T, TR_src, AX_src>& src)
552 {
553 css_escape(dst, src.data(), src.size());
554 }
555
///
/// HTML element codes.
///
/// Named elements are numbered consecutively in alphabetical order starting with
/// `a` = 1; `empty` is 0 and the special codes are negative. The order matters:
/// range checks such as `element_t::a <= code && code <= element_t::xmp` rely on it.
///
enum class element_t {
    // Special codes
    unknown = -1,
    PCDATA = -2,
    CDATA = -3,

    empty = 0,

    // Named elements, one initial letter per row
    a, abbr, acronym, address, applet, area,
    b, base, basefont, bdo, bgsound, big, blink, blockquote, body, br, button, // bgsound, blink: Microsoft Specific
    caption, center, cite, code, col, colgroup, comment, // comment: Microsoft Specific
    dd, del, dfn, dir, div, dl, dt,
    em, embed, // embed: Microsoft Specific
    fieldset, font, form, frame, frameset,
    h1, h2, h3, h4, h5, h6, head, hr, html,
    i, iframe, img, input, ins, isindex,
    kbd,
    label, legend, li, link, listing, // listing: Microsoft Specific
    map, marquee, menu, meta, // marquee: Microsoft Specific
    nextid, nobr, noembed, noframes, noscript, // nextid, nobr, noembed: Microsoft Specific
    object, ol, optgroup, option,
    p, param, plaintext, pre, // plaintext: Microsoft Specific
    q,
    rt, ruby, // rt, ruby: Microsoft Specific
    s, samp, script, select, small, span, strike, strong, style, sub, sup,
    table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
    u, ul,
    var,
    wbr, xmp, // wbr, xmp: Microsoft Specific
};
671
///
/// How an element's end tag is treated.
///
enum class element_span_t {
    needs_end    = 0, ///< Used for most elements and as the fallback for codes outside the lookup range
    end_optional = 1, ///< Used for elements such as `p`, `li`, `td`
    immediate    = 2, ///< Used for elements such as `br`, `img`, `hr`
};
680
685 {
691 static element_span_t span(_In_ element_t code)
692 {
693 static element_span_t lookup[] = {
694 element_span_t::needs_end, // a
695 element_span_t::needs_end, // abbr
696 element_span_t::needs_end, // acronym
697 element_span_t::needs_end, // address
698 element_span_t::needs_end, // applet
699 element_span_t::immediate, // area
700 element_span_t::needs_end, // b
701 element_span_t::immediate, // base
702 element_span_t::immediate, // basefont
703 element_span_t::needs_end, // bdo
704 element_span_t::immediate, // bgsound
705 element_span_t::needs_end, // big
706 element_span_t::needs_end, // blink
707 element_span_t::needs_end, // blockquote
708 element_span_t::end_optional, // body
709 element_span_t::immediate, // br
710 element_span_t::needs_end, // button
711 element_span_t::needs_end, // caption
712 element_span_t::needs_end, // center
713 element_span_t::needs_end, // cite
714 element_span_t::needs_end, // code
715 element_span_t::immediate, // col
716 element_span_t::end_optional, // colgroup
717 element_span_t::needs_end, // comment
718 element_span_t::end_optional, // dd
719 element_span_t::needs_end, // del
720 element_span_t::needs_end, // dfn
721 element_span_t::needs_end, // dir
722 element_span_t::needs_end, // div
723 element_span_t::needs_end, // dl
724 element_span_t::end_optional, // dt
725 element_span_t::needs_end, // em
726 element_span_t::immediate, // embed
727 element_span_t::needs_end, // fieldset
728 element_span_t::needs_end, // font
729 element_span_t::needs_end, // form
730 element_span_t::immediate, // frame
731 element_span_t::needs_end, // frameset
732 element_span_t::needs_end, // h1
733 element_span_t::needs_end, // h2
734 element_span_t::needs_end, // h3
735 element_span_t::needs_end, // h4
736 element_span_t::needs_end, // h5
737 element_span_t::needs_end, // h6
738 element_span_t::end_optional, // head
739 element_span_t::immediate, // hr
740 element_span_t::end_optional, // html
741 element_span_t::needs_end, // i
742 element_span_t::needs_end, // iframe
743 element_span_t::immediate, // img
744 element_span_t::immediate, // input
745 element_span_t::needs_end, // ins
746 element_span_t::immediate, // isindex
747 element_span_t::needs_end, // kbd
748 element_span_t::needs_end, // label
749 element_span_t::needs_end, // legend
750 element_span_t::end_optional, // li
751 element_span_t::immediate, // link
752 element_span_t::needs_end, // listing
753 element_span_t::needs_end, // map
754 element_span_t::needs_end, // marquee
755 element_span_t::needs_end, // menu
756 element_span_t::immediate, // meta
757 element_span_t::immediate, // nextid
758 element_span_t::needs_end, // nobr
759 element_span_t::needs_end, // noembed
760 element_span_t::needs_end, // noframes
761 element_span_t::needs_end, // noscript
762 element_span_t::needs_end, // object
763 element_span_t::needs_end, // ol
764 element_span_t::needs_end, // optgroup
765 element_span_t::end_optional, // option
766 element_span_t::end_optional, // p
767 element_span_t::immediate, // param
768 element_span_t::end_optional, // plaintext
769 element_span_t::needs_end, // pre
770 element_span_t::needs_end, // q
771 element_span_t::immediate, // rt
772 element_span_t::needs_end, // ruby
773 element_span_t::needs_end, // s
774 element_span_t::needs_end, // samp
775 element_span_t::needs_end, // script
776 element_span_t::needs_end, // select
777 element_span_t::needs_end, // small
778 element_span_t::needs_end, // span
779 element_span_t::needs_end, // strike
780 element_span_t::needs_end, // strong
781 element_span_t::needs_end, // style
782 element_span_t::needs_end, // sub
783 element_span_t::needs_end, // sup
784 element_span_t::needs_end, // table
785 element_span_t::end_optional, // tbody
786 element_span_t::end_optional, // td
787 element_span_t::needs_end, // textarea
788 element_span_t::end_optional, // tfoot
789 element_span_t::end_optional, // th
790 element_span_t::end_optional, // thead
791 element_span_t::needs_end, // title
792 element_span_t::end_optional, // tr
793 element_span_t::needs_end, // tt
794 element_span_t::needs_end, // u
795 element_span_t::needs_end, // ul
796 element_span_t::needs_end, // var
797 element_span_t::immediate, // wbr
798 element_span_t::needs_end, // xmp
799 };
800 return element_t::a <= code && code <= element_t::xmp ?
801 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
802 element_span_t::needs_end;
803 }
804
810 static bool is_fontstyle(_In_ element_t code)
811 {
812 switch (code) {
813 case element_t::tt:
814 case element_t::i:
815 case element_t::b:
816 case element_t::u:
817 case element_t::s:
818 case element_t::strike:
819 case element_t::blink:
820 case element_t::big:
821 case element_t::small:
822 return true;
823 default:
824 return false;
825 };
826 }
827
833 static bool is_phrase(_In_ element_t code)
834 {
835 switch (code) {
836 case element_t::em:
837 case element_t::strong:
838 case element_t::dfn:
839 case element_t::code:
840 case element_t::samp:
841 case element_t::kbd:
842 case element_t::var:
843 case element_t::cite:
844 case element_t::abbr:
845 case element_t::acronym:
846 case element_t::xmp:
847 return true;
848 default:
849 return false;
850 };
851 }
852
858 static bool is_special(_In_ element_t code)
859 {
860 switch (code) {
861 case element_t::a:
862 case element_t::img:
863 case element_t::applet:
864 case element_t::object:
865 case element_t::embed:
866 case element_t::font:
867 case element_t::basefont:
868 case element_t::br:
869 case element_t::wbr:
870 case element_t::rt:
871 case element_t::script:
872 case element_t::map:
873 case element_t::q:
874 case element_t::sub:
875 case element_t::sup:
876 case element_t::ruby:
877 case element_t::span:
878 case element_t::bdo:
879 case element_t::iframe:
880 case element_t::nobr:
881 return true;
882 default:
883 return false;
884 };
885 }
886
892 static bool is_formctrl(_In_ element_t code)
893 {
894 switch (code) {
895 case element_t::input:
896 case element_t::select:
897 case element_t::textarea:
898 case element_t::label:
899 case element_t::button:
900 return true;
901 default:
902 return false;
903 };
904 }
905
911 static bool is_inline(_In_ element_t code)
912 {
913 return
914 code == element_t::PCDATA ||
915 is_fontstyle(code) ||
916 is_phrase(code) ||
917 is_special(code) ||
918 is_formctrl(code);
919 }
920
926 static bool is_heading(_In_ element_t code)
927 {
928 switch (code) {
929 case element_t::h1:
930 case element_t::h2:
931 case element_t::h3:
932 case element_t::h4:
933 case element_t::h5:
934 case element_t::h6:
935 return true;
936 default:
937 return false;
938 };
939 }
940
946 static bool is_list(_In_ element_t code)
947 {
948 switch (code) {
949 case element_t::ul:
950 case element_t::ol:
951 case element_t::dir:
952 case element_t::menu:
953 return true;
954 default:
955 return false;
956 };
957 }
958
964 static bool is_preformatted(_In_ element_t code)
965 {
966 switch (code) {
967 case element_t::pre:
968 case element_t::listing:
969 return true;
970 default:
971 return false;
972 }
973 }
974
980 static bool is_block(_In_ element_t code)
981 {
982 if (is_heading(code) ||
983 is_list(code) ||
984 is_preformatted(code)) return true;
985 switch (code) {
986 case element_t::p:
987 case element_t::dl:
988 case element_t::div:
989 case element_t::center:
990 case element_t::marquee:
991 case element_t::noscript:
992 case element_t::noframes:
993 case element_t::noembed:
994 case element_t::blockquote:
995 case element_t::form:
996 case element_t::isindex:
997 case element_t::hr:
998 case element_t::table:
999 case element_t::fieldset:
1000 case element_t::address:
1001 return true;
1002 default:
1003 return false;
1004 };
1005 }
1006
1012 static bool is_flow(_In_ element_t code)
1013 {
1014 return is_block(code) || is_inline(code);
1015 }
1016
1022 static bool is_head_content(_In_ element_t code)
1023 {
1024 switch (code) {
1025 case element_t::title:
1026 case element_t::isindex:
1027 case element_t::base:
1028 case element_t::nextid:
1029 return true;
1030 default:
1031 return false;
1032 };
1033 }
1034
1040 static bool is_head_misc(_In_ element_t code)
1041 {
1042 switch (code) {
1043 case element_t::script:
1044 case element_t::style:
1045 case element_t::meta:
1046 case element_t::link:
1047 case element_t::object:
1048 return true;
1049 default:
1050 return false;
1051 };
1052 }
1053
1059 static bool is_pre_exclusion(_In_ element_t code)
1060 {
1061 switch (code) {
1062 case element_t::img:
1063 case element_t::object:
1064 case element_t::applet:
1065 case element_t::embed:
1066 case element_t::big:
1067 case element_t::small:
1068 case element_t::sub:
1069 case element_t::sup:
1070 case element_t::ruby:
1071 case element_t::font:
1072 case element_t::basefont:
1073 case element_t::nobr:
1074 return true;
1075 default:
1076 return false;
1077 };
1078 }
1079
1085 static bool is_html_content(_In_ element_t code)
1086 {
1087 switch (code) {
1088 case element_t::head:
1089 case element_t::body:
1090 case element_t::frameset:
1091 return true;
1092 default:
1093 return false;
1094 };
1095 }
1096
1102 static bool is_group(_In_ element_t code)
1103 {
1104 if (is_block(code) ||
1105 is_html_content(code) ||
1106 is_head_content(code)) return true;
1107 switch (code) {
1108 case element_t::col:
1109 case element_t::colgroup:
1110 case element_t::dd:
1111 case element_t::dir:
1112 case element_t::dt:
1113 case element_t::frame:
1114 case element_t::iframe:
1115 case element_t::legend:
1116 case element_t::td:
1117 case element_t::th:
1118 case element_t::tr:
1119 return true;
1120 default:
1121 return false;
1122 };
1123 }
1124
1133 static bool may_contain(_In_ element_t parent, _In_ element_t child)
1134 {
1135 if (child == element_t::unknown || child == element_t::comment)
1136 return true;
1137 if (is_fontstyle(parent) || is_phrase(parent))
1138 return is_inline(child);
1139 if (is_heading(parent))
1140 return is_inline(child);
1141
1142 switch (parent) {
1143 case element_t::a: return is_inline(child) && child != element_t::a;
1144 case element_t::address: return is_inline(child) || child == element_t::p;
1145 case element_t::applet: return is_flow(child) || child == element_t::param;
1146 case element_t::area: return false;
1147 case element_t::base: return false;
1148 case element_t::basefont: return false;
1149 case element_t::bdo: return is_inline(child);
1150 case element_t::blockquote: return is_flow(child);
1151 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
1152 case element_t::br: return false;
1153 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1154 case element_t::caption: return is_inline(child);
1155 case element_t::center: return is_flow(child);
1156 case element_t::col: return false;
1157 case element_t::colgroup: return child == element_t::col;
1158 case element_t::comment: return child == element_t::CDATA;
1159 case element_t::dd: return is_flow(child);
1160 case element_t::del: return is_flow(child);
1161 case element_t::dir: return child == element_t::li;
1162 case element_t::div: return is_flow(child);
1163 case element_t::dl: return child == element_t::dt || child == element_t::dd;
1164 case element_t::dt: return is_inline(child);
1165 case element_t::embed: return is_flow(child) || child == element_t::param;
1166 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1167 case element_t::font: return is_inline(child);
1168 case element_t::form: return is_flow(child) && child != element_t::form;
1169 case element_t::frame: return false;
1170 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1171 case element_t::head: return is_head_content(child) || is_head_misc(child);
1172 case element_t::hr: return false;
1173 case element_t::html: return is_html_content(child);
1174 case element_t::iframe: return is_flow(child);
1175 case element_t::img: return false;
1176 case element_t::input: return false;
1177 case element_t::ins: return is_flow(child);
1178 case element_t::isindex: return false;
1179 case element_t::label: return is_inline(child) && child != element_t::label;
1180 case element_t::legend: return is_inline(child);
1181 case element_t::li: return is_flow(child);
1182 case element_t::link: return false;
1183 case element_t::listing: return child == element_t::CDATA;
1184 case element_t::map: return is_block(child) || child == element_t::area;
1185 case element_t::marquee: return is_flow(child);
1186 case element_t::menu: return child == element_t::li;
1187 case element_t::meta: return false;
1188 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1189 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1190 case element_t::noscript: return is_flow(child);
1191 case element_t::noembed: return is_flow(child);
1192 case element_t::object: return is_flow(child) || child == element_t::param;
1193 case element_t::ol: return child == element_t::li;
1194 case element_t::optgroup: return child == element_t::option;
1195 case element_t::option: return child == element_t::PCDATA;
1196 case element_t::p: return is_inline(child);
1197 case element_t::param: return false;
1198 case element_t::plaintext: return is_flow(child);
1199 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1200 case element_t::q: return is_inline(child);
1201 case element_t::rt: return false;
1202 case element_t::ruby: return is_inline(child);
1203 case element_t::script: return child == element_t::CDATA;
1204 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1205 case element_t::span: return is_inline(child);
1206 case element_t::style: return child == element_t::CDATA;
1207 case element_t::sub: return is_inline(child);
1208 case element_t::sup: return is_inline(child);
1209 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1210 case element_t::tbody: return child == element_t::tr;
1211 case element_t::td: return is_flow(child);
1212 case element_t::textarea: return child == element_t::PCDATA;
1213 case element_t::tfoot: return child == element_t::tr;
1214 case element_t::th: return is_flow(child);
1215 case element_t::thead: return child == element_t::tr;
1216 case element_t::title: return child == element_t::PCDATA;
1217 case element_t::tr: return child == element_t::td || child == element_t::th;
1218 case element_t::ul: return child == element_t::li;
1219 case element_t::wbr: return false;
1220 case element_t::unknown: return true;
1221 default: return false;
1222 }
1223 }
1224
1232 template <class T>
1233 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1234 {
1235 _Assume_(attr_name || !num_chars);
1236 switch (code) {
1237 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1238 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1239 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1240 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1241 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1242 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1243 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1244 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1245 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1246 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX);
1247 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1248 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) ||
1249 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1250 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX);
1251 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1252 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1253 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX);
1254 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1256 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1257 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1258 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1259 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1260 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1261 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1262 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1263 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1264 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1265 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) ||
1266 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) ||
1267 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1268 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1269 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) ||
1270 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1271 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1272 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1273 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1274 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1275 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1276 default: return false;
1277 }
1278 }
1279
1287 template <class T>
1288 static bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1289 {
1290 _Assume_(attr_name || !num_chars);
1291 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX))
1292 return true;
1293 switch (code) {
1294 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1295 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1296 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1297 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1298 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1299 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX);
1300 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1301 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1302 default: return false;
1303 }
1304 }
1305 };
1306
// Forward declaration, needed by the sequence_store alias below.
1307 class sequence;
/// Owning collection of parsed sequences; each element is held by std::unique_ptr.
1308 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1309
/// Base class for the pieces a document is split into; it records the sequence type,
/// the (start, end) interval passed to the constructor and a parent pointer.
// NOTE(review): the `class sequence` header (line 1313) and the declarations of the
// `interval` and `parent` members (lines 1317-1318) are absent from this listing;
// only their initialization in the constructor below is visible.
1314 {
1315 public:
/// Sequence type
1316 stdex::parser::html_sequence_t type;
1319
/// Constructs a sequence of the given type spanning [start, end) with an optional parent.
// NOTE(review): the interval is presumably expressed in source-text offsets - confirm.
1320 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1321 type(_type),
1322 interval(start, end),
1323 parent(_parent)
1324 {}
1325
1326 virtual ~sequence() {} // make polymorphic
1327 };
1328
1332 class element : public sequence
1333 {
1334 public:
/// Constructs an element from a parsed tag.
///
/// The sequence base takes the tag's type and interval; the element code is looked up
/// from the tag name found at src + tag.name.start; the tag's name and attributes are moved in.
///
/// \param[in,out] tag     Parsed tag; its name and attributes are moved from
/// \param[in]     src     Text that the tag's name interval indexes into
/// \param[in]     parent  Parent sequence (may be nullptr)
1335 template <class T>
1336 element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1337 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1338 code(element_code(src + tag.name.start, tag.name.size())),
1339 name(std::move(tag.name)),
1340 attributes(std::move(tag.attributes))
1341 {}
1342
1343 template <class T>
1344 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1345 {
1346 static const struct {
1347 const char* name;
1348 element_t code;
1349 } mapping[] = {
1350 { "a", element_t::a, },
1351 { "abbr", element_t::abbr, },
1352 { "acronym", element_t::acronym, },
1353 { "address", element_t::address, },
1354 { "applet", element_t::applet, },
1355 { "area", element_t::area, },
1356 { "b", element_t::b, },
1357 { "base", element_t::base, },
1358 { "basefont", element_t::basefont, },
1359 { "bdo", element_t::bdo, },
1360 { "bgsound", element_t::bgsound, },
1361 { "big", element_t::big, },
1362 { "blink", element_t::blink, },
1363 { "blockquote", element_t::blockquote, },
1364 { "body", element_t::body, },
1365 { "br", element_t::br, },
1366 { "button", element_t::button, },
1367 { "caption", element_t::caption, },
1368 { "center", element_t::center, },
1369 { "cite", element_t::cite, },
1370 { "code", element_t::code, },
1371 { "col", element_t::col, },
1372 { "colgroup", element_t::colgroup, },
1373 { "comment", element_t::comment, },
1374 { "dd", element_t::dd, },
1375 { "del", element_t::del, },
1376 { "dfn", element_t::dfn, },
1377 { "dir", element_t::dir, },
1378 { "div", element_t::div, },
1379 { "dl", element_t::dl, },
1380 { "dt", element_t::dt, },
1381 { "em", element_t::em, },
1382 { "embed", element_t::embed, },
1383 { "fieldset", element_t::fieldset, },
1384 { "font", element_t::font, },
1385 { "form", element_t::form, },
1386 { "frame", element_t::frame, },
1387 { "frameset", element_t::frameset, },
1388 { "h1", element_t::h1, },
1389 { "h2", element_t::h2, },
1390 { "h3", element_t::h3, },
1391 { "h4", element_t::h4, },
1392 { "h5", element_t::h5, },
1393 { "h6", element_t::h6, },
1394 { "head", element_t::head, },
1395 { "hr", element_t::hr, },
1396 { "html", element_t::html, },
1397 { "i", element_t::i, },
1398 { "iframe", element_t::iframe, },
1399 { "img", element_t::img, },
1400 { "input", element_t::input, },
1401 { "ins", element_t::ins, },
1402 { "isindex", element_t::isindex, },
1403 { "kbd", element_t::kbd, },
1404 { "label", element_t::label, },
1405 { "legend", element_t::legend, },
1406 { "li", element_t::li, },
1407 { "link", element_t::link, },
1408 { "listing", element_t::listing, },
1409 { "map", element_t::map, },
1410 { "marquee", element_t::marquee, },
1411 { "menu", element_t::menu, },
1412 { "meta", element_t::meta, },
1413 { "nextid", element_t::nextid, },
1414 { "nobr", element_t::nobr, },
1415 { "noembed", element_t::noembed, },
1416 { "noframes", element_t::noframes, },
1417 { "noscript", element_t::noscript, },
1418 { "object", element_t::object, },
1419 { "ol", element_t::ol, },
1420 { "optgroup", element_t::optgroup, },
1421 { "option", element_t::option, },
1422 { "p", element_t::p, },
1423 { "param", element_t::param, },
1424 { "plaintext", element_t::plaintext, },
1425 { "pre", element_t::pre, },
1426 { "q", element_t::q, },
1427 { "rt", element_t::rt, },
1428 { "ruby", element_t::ruby, },
1429 { "s", element_t::s, },
1430 { "samp", element_t::samp, },
1431 { "script", element_t::script, },
1432 { "select", element_t::select, },
1433 { "small", element_t::small, },
1434 { "span", element_t::span, },
1435 { "strike", element_t::strike, },
1436 { "strong", element_t::strong, },
1437 { "style", element_t::style, },
1438 { "sub", element_t::sub, },
1439 { "sup", element_t::sup, },
1440 { "table", element_t::table, },
1441 { "tbody", element_t::tbody, },
1442 { "td", element_t::td, },
1443 { "textarea", element_t::textarea, },
1444 { "tfoot", element_t::tfoot, },
1445 { "th", element_t::th, },
1446 { "thead", element_t::thead, },
1447 { "title", element_t::title, },
1448 { "tr", element_t::tr, },
1449 { "tt", element_t::tt, },
1450 { "u", element_t::u, },
1451 { "ul", element_t::ul, },
1452 { "var", element_t::var, },
1453 { "wbr", element_t::wbr, },
1454 { "xmp", element_t::xmp, },
1455 };
1456#ifndef NDEBUG
1457 // The mapping table MUST be sorted and all names in lowercase.
1458 for (size_t i = 1; i < _countof(mapping); i++)
1459 _Assume_(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1460 for (size_t i = 0; i < _countof(mapping); i++) {
1461 for (size_t j = 0; mapping[i].name[j]; j++)
1462 _Assume_(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1463 }
1464#endif
1465 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1466 size_t m = (i + j) / 2;
1467 int r = 0;
1468 for (size_t i1 = 0, i2 = 0;;) {
1469 if (!mapping[m].name[i1]) {
1470 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1471 break;
1472 }
1473 if (i2 >= num_chars || !name[i2]) {
1474 r = 1;
1475 break;
1476 }
1477
1478 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1479 if (mapping[m].name[i1] > chr) {
1480 r = 1;
1481 break;
1482 }
1483 if (mapping[m].name[i1] < chr) {
1484 r = -1;
1485 break;
1486 }
1487 i1++;
1488 }
1489
1490 if (r < 0)
1491 i = m + 1;
1492 else if (r > 0)
1493 j = m;
1494 else
1495 return mapping[m].code;
1496 }
1497 return element_t::unknown;
1498 }
1499
1500 public:
1501 element_t code;
1503 std::vector<stdex::parser::html_attribute> attributes;
1504 };
1505
1506 class element_end;
1507
/// Element start tag; extends element with an `end` pointer.
// NOTE(review): the declaration of the `end` member (line 1521) is absent from this listing;
// the constructor initializes it from _end.
1511 class element_start : public element
1512 {
1513 public:
/// Constructs the start tag from a parsed tag (forwarded to element) and an optional end sequence.
1514 template <class T>
1515 element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1516 element(std::move(tag), src, parent),
1517 end(_end)
1518 {}
1519
1520 public:
1522 };
1523
/// Element end tag; stores the element code, the tag name and a pointer to the matching start tag.
// NOTE(review): the declarations of the `name` and `start` members (lines 1540-1541) are absent
// from this listing; both are initialized by the constructor.
1527 class element_end : public sequence
1528 {
1529 public:
/// Constructs the end tag from a parsed tag; the code is looked up from the tag name in src.
1530 template <class T>
1531 element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1532 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1533 code(element::element_code(src + tag.name.start, tag.name.size())),
1534 name(std::move(tag.name)),
1535 start(_start)
1536 {}
1537
1538 public:
/// Element code resolved with element::element_code()
1539 element_t code;
1542 };
1543
/// Declaration sequence; keeps the tag's name and attributes.
// NOTE(review): the declaration of the `name` member (line 1558) is absent from this listing.
1547 class declaration : public sequence
1548 {
1549 public:
/// Constructs the declaration from a parsed tag; name and attributes are moved from the tag.
1550 template <class T>
1551 declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1552 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1553 name(std::move(tag.name)),
1554 attributes(std::move(tag.attributes))
1555 {}
1556
1557 public:
/// Attributes moved from the parsed tag
1559 std::vector<stdex::parser::html_attribute> attributes;
1560 };
1561
/// Comment sequence; its content is taken from the parsed tag's name field.
// NOTE(review): the declaration of the `content` member (line 1575) is absent from this listing.
1565 class comment : public sequence
1566 {
1567 public:
/// Constructs the comment from a parsed tag; tag.name is moved into content.
1568 template <class T>
1569 comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1570 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1571 content(std::move(tag.name))
1572 {}
1573
1574 public:
1576 };
1577
/// Instruction sequence; its content is taken from the parsed tag's name field.
// NOTE(review): the declaration of the `content` member (line 1591) is absent from this listing.
1581 class instruction : public sequence
1582 {
1583 public:
/// Constructs the instruction from a parsed tag; tag.name is moved into content.
1584 template <class T>
1585 instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1586 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1587 content(std::move(tag.name))
1588 {}
1589
1590 public:
1592 };
1593
/// Entity defined by the document; document::replace_entities() substitutes `value`
/// for a matching "%name;" reference.
// NOTE(review): the declaration of the `name` member (line 1600) is absent from this listing.
1597 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1598 struct entity
1599 {
/// Replacement text
1601 std::basic_string<T, TR, AX> value;
1602 };
1603
1607 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1608 class parser;
1609
1613 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1615 {
1616 public:
/// Constructs an empty document: nothing parsed yet, charset set to the system charset,
/// not inside a CDATA/RCDATA section.
// NOTE(review): initializer lines 1622-1623 and 1628 are absent from this listing
// (clear() also resets m_is_special_element, which is presumably initialized here - confirm).
1617 document() :
1618 m_num_parsed(0),
1619 m_charset(stdex::charset_id::system),
1620
1621 // Declaration parsing data
1624 m_is_cdata(false),
1625 m_is_rcdata(false),
1626
1627 // Element parsing data
1629 {}
1630
/// Resets the document to its empty state: source text, parse position, charset,
/// declared entities, parsed sequences and the open-element stack are all cleared.
// NOTE(review): line 1641 is absent from this listing, so one more reset statement may exist.
1634 void clear()
1635 {
1636 m_source.clear();
1637 m_num_parsed = 0;
1638 m_charset = stdex::charset_id::system;
1639
1640 // Declaration parsing data
1642 m_is_cdata = m_is_rcdata = false;
1643 m_entities.clear();
1644
1645 // Element parsing data
1646 m_sequences.clear();
1647
1648 m_element_stack.clear();
1649 m_is_special_element = false;
1650 }
1651
/// Appends a chunk of source text and parses as much of the accumulated text as possible.
///
/// The chunk (num_chars characters, or fewer when a zero terminator comes first, per stdex::strnlen)
/// is appended to m_source. Parsing then resumes at m_num_parsed over the whole accumulated text and
/// recognized sequences are pushed to m_sequences. Characters that do not complete a sequence are
/// skipped without advancing m_num_parsed, so they are revisited on the next call.
///
/// Throws std::invalid_argument("unknown tag type") for an unexpected tag type;
/// replace_entities() may throw std::runtime_error for an undefined entity.
// NOTE(review): several original lines (1676, 1678, 1689, 1700-1701, 1703, 1705, 1711, 1787) are
// absent from this listing, so some conditions/branches below are incomplete as shown.
1655 void append(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1656 {
1657 _Assume_(source || !num_chars);
1658 m_source.append(source, stdex::strnlen(source, num_chars));
// From here on, source/num_chars refer to the whole accumulated text, not just the new chunk.
1659 source = m_source.data();
1660 num_chars = m_source.size();
1661
1662 for (size_t i = m_num_parsed; i < num_chars;) {
// Inside a CDATA/RCDATA section only the condition end is looked for.
1663 if (m_is_cdata || m_is_rcdata) {
1664 if (m_condition_end.match(source, i, num_chars)) {
1665 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1666 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1667 m_num_parsed, i,
1668 active_element()))));
1669 m_is_cdata = m_is_rcdata = false;
1670 i = m_num_parsed = m_condition_end.interval.end;
1671 continue;
1672 }
1673 goto next_char;
1674 }
1675
// NOTE(review): the `if (...) {` opening this branch (line 1676) is missing from the listing.
1677 if (m_condition_end.match(source, i, num_chars)) {
1679 i = m_num_parsed = m_condition_end.interval.end;
1680 continue;
1681 }
1682 goto next_char;
1683 }
1684
// End of a condition: flush any text before it as a text sequence.
1685 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1686 if (m_num_parsed < i)
1687 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1688
1690 i = m_num_parsed = m_condition_end.interval.end;
1691 continue;
1692 }
1693
// Start of a condition: the condition keyword is entity-expanded before it is compared.
1694 if (m_condition_start.match(source, i, num_chars)) {
1695 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1696 if (stdex::strncmp(condition_src.data(), condition_src.size(), "CDATA", SIZE_MAX) == 0)
1697 m_is_cdata = true;
1698 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "RCDATA", SIZE_MAX) == 0)
1699 m_is_rcdata = true;
// NOTE(review): the statements belonging to the remaining branches (lines 1700-1701, 1703, 1705) are missing from the listing.
1702 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "IGNORE", SIZE_MAX) == 0)
1704 else
1706
1707 i = m_num_parsed = m_condition_start.interval.end;
1708 continue;
1709 }
1710
// NOTE(review): the `if (...) {` opening this branch (line 1711) is missing; since the branch resets
// m_is_special_element, it presumably tests that flag - confirm.
// Only the end tag matching the active element is recognized here.
1712 auto parent = active_element();
1713 _Assume_(parent);
1714 if (m_tag.match(source, i, num_chars) &&
1715 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1716 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1717 {
1718 if (m_num_parsed < i)
1719 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1720 i = m_num_parsed = m_tag.interval.end;
1721 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1722 parent->end = e.get();
1723 m_sequences.push_back(std::move(e));
1724 m_element_stack.pop_back();
1725 m_is_special_element = false;
1726 continue;
1727 }
1728 goto next_char;
1729 }
1730
// Regular tag: flush preceding text, then dispatch on the tag type.
1731 if (m_tag.match(source, i, num_chars)) {
1732 if (m_num_parsed < i)
1733 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1734 i = m_num_parsed = m_tag.interval.end;
1735
1736 switch (m_tag.type) {
1737 case stdex::parser::html_sequence_t::element:
1738 case stdex::parser::html_sequence_t::element_start: {
1739 std::unique_ptr<element> e(
1740 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1741 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1742 nullptr);
1743
1744 // Does this tag end any of the started elements?
// Walk the stack from the innermost open element outwards; every open element that may not
// contain the new one gets the new element recorded as its end and is dropped from the stack.
1745 for (size_t j = m_element_stack.size(); j--; ) {
1746 auto starting_tag = m_element_stack[j];
1747 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1748 if (element_traits::may_contain(starting_tag->code, e->code)) {
1749 e->parent = starting_tag;
1750 break;
1751 }
1752 e->parent = starting_tag->parent;
1753 starting_tag->end = e.get();
1754 m_element_stack.resize(j);
1755 }
1756
1757 if (e->type == stdex::parser::html_sequence_t::element_start) {
1758 auto e_start = static_cast<element_start*>(e.get());
// Elements with an immediate span end at themselves and are never pushed on the stack.
1759 if (element_traits::span(e->code) == element_span_t::immediate)
1760 e_start->end = e.get();
1761 else {
1762 m_element_stack.push_back(e_start);
1763 switch (e->code) {
1764 case element_t::code:
1765 case element_t::comment:
1766 case element_t::script:
1767 case element_t::style:
1768 m_is_special_element = true;
1769 break;
1770 default:;
1771 }
1772 }
1773 }
1774
// Charset detection from <meta>; attempted only while the charset is still the system default.
1775 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1776 bool is_content_type = false;
1777 stdex::parser::html_attribute* content_attr = nullptr;
1778 for (auto& attr : e->attributes) {
1779 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) &&
1780 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX))
1781 is_content_type = true;
1782 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX))
1783 content_attr = &attr;
1784 }
1785 if (is_content_type && content_attr) {
1786 // <meta http-equiv="Content-Type" content="..."> found.
// NOTE(review): the declaration of `content` (line 1787) is missing from the listing.
1788 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1789 content.charset)
1790 {
// The charset name is narrowed to char before it is handed to stdex::charset_from_name().
1791 std::string str;
1792 str.reserve(content.charset.size());
1793 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1794 str.push_back(static_cast<char>(source[j]));
1795 m_charset = stdex::charset_from_name(str);
1796 }
1797 }
1798 }
1799
1800 m_sequences.push_back(std::move(e));
1801 break;
1802 }
1803 case stdex::parser::html_sequence_t::element_end: {
1804 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1805
// Find the innermost open element this end tag closes; unknown elements are matched by name.
1806 for (size_t j = m_element_stack.size(); j--; ) {
1807 auto starting_tag = m_element_stack[j];
1808 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1809 if (starting_tag->code == e->code ||
1810 (starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size())))
1811 {
1812 e->start = starting_tag;
1813 e->parent = starting_tag->parent;
1814 starting_tag->end = e.get();
1815 m_element_stack.resize(j);
1816 break;
1817 }
1818 }
1819
1820 m_sequences.push_back(std::move(e));
1821 break;
1822 }
1823 case stdex::parser::html_sequence_t::declaration:
// An "entity" declaration whose second attribute is "%" and whose fourth is neither SYSTEM nor
// PUBLIC defines an entity: the third attribute is its name, the fourth its value.
1824 if (m_tag.attributes.size() > 3 &&
1825 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX))
1826 {
1827 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1828 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1829 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1830 {
1831 std::unique_ptr<entity<T, TR, AX>> e(new entity<T, TR, AX>());
1832 e->name = m_tag.attributes[2].name;
1833 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1834 m_entities.push_back(std::move(e));
1835 }
1836
1837 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1838 }
1839 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1840 break;
1841 case stdex::parser::html_sequence_t::comment:
1842 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1843 break;
1844 case stdex::parser::html_sequence_t::instruction:
1845 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1846 break;
1847 default:
1848 throw std::invalid_argument("unknown tag type");
1849 }
1850
1851 continue;
1852 }
1853
1854 next_char:
1855 if (m_any_char.match(source, i, num_chars)) {
1856 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1857 i = m_any_char.interval.end;
1858 }
1859 else
1860 break;
1861 }
1862 }
1863
// NOTE(review): the function header (line 1867) is absent from this listing; assign() calls
// finalize() after append(), so this is presumably the body of `void finalize()` - confirm.
// Emits any text left after the last parsed sequence as a text sequence, marks the whole
// source as parsed and clears the open-element stack.
1868 {
1869 size_t i = m_source.size();
1870 if (m_num_parsed < i)
1871 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1872 m_num_parsed = i;
1873 m_element_stack.clear();
1874 }
1875
/// Replaces the document content with the given source and parses it completely.
///
/// Equivalent to clear(), append(source, num_chars) and finalize(), in that order.
///
/// \param[in] source     Source text
/// \param[in] num_chars  Number of characters in source (the text may also be zero-terminated earlier)
1879 void assign(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1880 {
1881 clear();
1882 append(source, num_chars);
1883 finalize();
1884 }
1885
/// Returns the accumulated document source text (m_source).
1889 const std::basic_string<T, TR, AX>& source() const { return m_source; }
1890
1891 friend class parser<T, TR, AX>;
1892
1893 protected:
// NOTE(review): the function header (line 1897) is absent from this listing; callers use
// active_element(), so this is presumably its body - confirm.
// Returns the innermost open element (top of m_element_stack), or nullptr when no element is open.
1898 {
1899 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1900 }
1901
1905 std::basic_string<T, TR, AX> replace_entities(_In_reads_or_z_opt_(num_chars) const T* input, _In_ size_t num_chars) const
1906 {
1907 _Assume_(input || !num_chars);
1908 const size_t num_entities = m_entities.size();
1909 const T* source = m_source.data();
1910 std::basic_string<T, TR, AX> output;
1911 for (size_t i = 0; i < num_chars && input[i];) {
1912 if (input[i] == '%') {
1913 for (size_t j = 0; j < num_entities; j++) {
1914 auto& e = m_entities[j];
1915 size_t entity_size = e->name.size();
1916 if (i + entity_size + 1 < num_chars &&
1917 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1918 input[i + entity_size + 1] == ';')
1919 {
1920 output += e->value;
1921 i += entity_size + 2;
1922 goto next_char;
1923 }
1924 }
1925 throw std::runtime_error("undefined entity");
1926 }
1927 output += input[i++];
1928 next_char:;
1929 }
1930 return output;
1931 }
1932
1933 protected:
1934 std::basic_string<T, TR, AX> m_source;
1936 stdex::charset_id m_charset;
1937
1938 // Declaration parsing data
1946 std::vector<std::unique_ptr<entity<T, TR, AX>>> m_entities;
1947
1948 // Element parsing data
1950 sequence_store m_sequences;
1951 std::vector<element_start*> m_element_stack;
1953 };
1954
/// Token types
enum class token_t {
	root = 0,     ///< Default type of a plain token
	complete = 1, ///< Default type of a text token
	starting = 2, ///< Type used by starting_token
	ending = 3,   ///< Type of the closing "</name>" tokens produced when active tokens are ended
	url = 4,      ///< Type used by url_token
};
1965
/// Maximum size of a token tag, in characters, including the zero terminator
constexpr size_t token_tag_max =
	sizeof(void*) * 2 // Memory address in hexadecimal (two digits per byte)
	+ 2               // Leading token_tag_start and trailing token_tag_end delimiters
	+ 1;              // Zero terminator

/// Character marking the start of a token tag inside text
constexpr char token_tag_start = '\x12';

/// Character marking the end of a token tag inside text
constexpr char token_tag_end = '\x13';
1985
/// Base class of all tokens. A token is referenced from text by a "tag": token_tag_start,
/// the token's memory address in hexadecimal, token_tag_end.
// NOTE(review): the declaration of the `sequence` member (line 2059) is absent from this listing;
// the constructor initializes it from _sequence.
1989 class token
1990 {
1991 protected:
/// Constructs a token; protected, so tokens are created by parser (a friend) and derived classes.
1992 token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1993 type(_type),
1994 sequence(_sequence),
1995 data(_data)
1996 {}
1997
1998 template<class T, class TR, class AX>
1999 friend class parser;
2000
2001 public:
2002 virtual ~token() {} // make polymorphic
2003
/// Appends this token's tag to a narrow string.
///
/// \returns Number of characters the string grew by
2011 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
2012 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str) const
2013 {
2014 size_t n = str.size();
2015 // Use %X instead of %p to omit leading zeros and save space.
2016 stdex::appendf(str, "%c%zX%c", stdex::locale_C, token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
2017 return str.size() - n;
2018 }
2019
/// Appends this token's tag to a wide string.
///
/// \returns Whatever stdex::appendf() returns
// NOTE(review): the narrow overload returns the growth of the string instead - confirm both agree.
2027 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
2028 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str) const
2029 {
2030 // Use %X instead of %p to omit leading zeros and save space.
2031 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C, static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
2032 }
2033
/// Parses a token tag at str[offset].
///
/// \param[in]     str     Zero-terminated text
/// \param[in,out] offset  Position of the tag; advanced past the tag on success
///
/// \returns Token the tag refers to; nullptr when str[offset] does not start a tag or the
///          text ends before token_tag_end (offset is left unchanged in both cases)
///
/// \throws std::invalid_argument when the parsed address is null
2034 template<class T>
2035 static token* parse_tag(const T* str, size_t& offset)
2036 {
2037 if (str[offset] != static_cast<T>(token_tag_start))
2038 return nullptr;
2039
2040 // Locate tag end.
2041 size_t end;
2042 for (end = offset + 1; ; end++) {
2043 if (!str[end])
2044 return nullptr;
2045 if (str[end] == token_tag_end)
2046 break;
2047 }
2048
2049 // Parse hexadecimal token memory address.
// The address is trusted as-is: the text must only carry tags of tokens that are still alive.
2050 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
2051 if (!t)
2052 throw std::invalid_argument("null token");
2053 offset = end + 1;
2054 return t;
2055 }
2056
2057 public:
/// Token type
2058 token_t type;
/// Value passed to the constructor; not interpreted by this class
2060 uintptr_t data;
2061 };
2062
2063 using token_vector = std::vector<std::unique_ptr<token>>;
2064 using token_list = std::list<token*>;
2065
/// Text type flags; combined bitwise in text_token::text_type
enum text_type_flag_t : uint32_t {
	has_tokens = 1 << 0, ///< Text contains tags referencing other tokens
	has_text = 1 << 1,   ///< Text contains content that requires escaping when linked
	is_title = 1 << 2,   ///< NOTE(review): presumably marks title text - confirm against users of the flag
	is_bullet = 1 << 3,  ///< NOTE(review): presumably marks list-bullet text - confirm against users of the flag
};
2075
/// Token carrying text, text type flags (text_type_flag_t) and a mapping vector.
// NOTE(review): the base-class initializer (line 2088) is absent from this listing; the
// type, sequence and data parameters are presumably forwarded to token there - confirm.
2079 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2080 class text_token : public token
2081 {
2082 protected:
/// Constructs the token; protected, so instances are created by parser (a friend) and derived classes.
2083 text_token(
2084 _In_ token_t type = token_t::complete,
2085 _In_reads_or_z_opt_(num_chars) const T* _text = nullptr, _In_ size_t num_chars = 0,
2086 _In_ uint32_t _text_type = 0,
2087 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2089 text(_text, num_chars),
2090 text_type(_text_type)
2091 {}
2092
2093 friend class parser<T, TR, AX>;
2094
2095 public:
/// Token text; may contain tags of other tokens when text_type has has_tokens set
2096 std::basic_string<T, TR, AX> text;
/// Combination of text_type_flag_t flags
2097 uint32_t text_type;
/// Position mapping filled while the text is built
// NOTE(review): presumably maps source offsets to text offsets - confirm.
2098 stdex::mapping_vector<size_t> mapping;
2099 };
2100
/// Text token of type token_t::starting that also records a name and an end sequence.
/// parser::end_tokens() uses `name` to build the closing "</name>" text.
// NOTE(review): the constructor's opening line (2108) and the declaration of the
// `end_sequence` member (line 2124) are absent from this listing.
2104 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2105 class starting_token : public text_token<T, TR, AX>
2106 {
2107 protected:
2109 _In_reads_or_z_opt_(num_chars_text) const T* _text = nullptr, _In_ size_t num_chars_text = 0,
2110 _In_reads_or_z_opt_(num_chars_name) const T* _name = nullptr, _In_ size_t num_chars_name = 0,
2111 _In_ uint32_t text_type = 0,
2112 _In_opt_ stdex::html::sequence* sequence = nullptr,
2113 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
2114 _In_ uintptr_t data = 0) :
2115 text_token<T, TR, AX>(token_t::starting, _text, num_chars_text, text_type, sequence, data),
2116 name(_name, num_chars_name),
2117 end_sequence(_end_sequence)
2118 {}
2119
2120 friend class parser<T, TR, AX>;
2121
2122 public:
/// Name used when the closing tag text is generated
2123 std::basic_string<T, TR, AX> name;
2125 };
2126
/// URL encoding schemes
enum class token_url_t {
	plain = 0, ///< URL is not using any particular encoding scheme (as-is)
	sgml = 1,  ///< URL is encoded using SGML entities
	css = 2,   ///< URL is encoded using CSS escaping scheme
};
2135
/// Token of type token_t::url holding a URL and the encoding it must be written in.
/// parser::link() emits the URL as-is, SGML-escaped or CSS-escaped depending on `encoding`.
2139 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2140 class url_token : public token
2141 {
2142 protected:
/// Constructs the token; protected, so instances are created by parser (a friend).
2143 url_token(
2144 _In_reads_or_z_opt_(num_chars) const T* _url = nullptr, _In_ size_t num_chars = 0,
2145 token_url_t _encoding = token_url_t::plain,
2146 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2147 token(token_t::url, sequence, data),
2148 url(_url, num_chars),
2149 encoding(_encoding)
2150 {}
2151
2152 friend class parser<T, TR, AX>;
2153
2154 public:
/// URL text
2155 std::basic_string<T, TR, AX> url;
/// Encoding to apply when the URL is written out
2156 token_url_t encoding;
2157 };
2158
// NOTE(review): the header of this definition and its first member(s) (lines 2159-2163, 2166) are
// absent from this listing. parser::append_inserted_tokens() reads `token`, `word_index`,
// `after_word` and `active_tokens` from inserted_token, so this is presumably that struct - confirm.
/// Tokens passed to end_tokens()/start_tokens() before the inserted token's tag is appended
2164 std::list<stdex::html::token*> active_tokens;
/// Word index compared against the word_index argument of append_inserted_tokens()
2165 size_t word_index;
2167 };
2168
2169 using inserted_token_list = std::list<inserted_token>;
2170
2171 template<class T, class TR, class AX>
2173 {
2174 public:
/// Constructs a parser for a document.
///
/// The URL is copied (num_chars characters, or fewer when a zero terminator comes first, per stdex::strnlen).
// NOTE(review): initializer lines 2179 and 2182 are absent from this listing; the `document`
// and `progress` parameters are presumably stored there - confirm.
2175 parser(
2176 _In_ const document<T, TR, AX>& document,
2177 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
2178 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
2180 m_url(url, stdex::strnlen(url, num_chars)),
2181 m_parse_frames(parse_frames),
2183 m_source(nullptr)
2184 {}
2185
// NOTE(review): the function header (line 2189) and line 2199 are absent from this listing.
// The body requires that no tokens exist yet, initializes progress reporting over the document
// source size, caches the source pointer and returns parse() run up to the end of the
// document's sequences.
2190 {
2191 _Assume_(m_tokens.empty());
2192
2193 if (m_progress) {
2194 m_progress->set_range(0, m_document.source().size());
2195 m_progress->set(0);
2196 }
2197
2198 m_source = m_document.source().data();
2200 return parse(m_document.m_sequences.end());
2201 }
2202
2209 static void link(_Inout_ std::basic_string<T, TR, AX>& source, _In_ const text_token<T, TR, AX>* t)
2210 {
2211 _Assume_(t);
2212 _Assume_(
2213 t->type == token_t::complete ||
2214 t->type == token_t::starting ||
2215 t->type == token_t::ending ||
2216 t->type == token_t::root);
2217
2218 if (t->text_type & has_tokens) {
2219 const T* root = t->text.data();
2220 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2221 _Assume_(root[i] != token_tag_end);
2222 const token* t2 = token::parse_tag(root, i);
2223 if (t2) {
2224 switch (t2->type) {
2225 case token_t::complete:
2226 case token_t::starting:
2227 case token_t::ending:
2228 case token_t::root:
2229 link(source, dynamic_cast<const text_token<T, TR, AX>*>(t2));
2230 break;
2231 case token_t::url: {
2232 auto t2_url = dynamic_cast<const url_token<T, TR, AX>*>(t2);
2233 switch (t2_url->encoding) {
2234 case token_url_t::plain:
2235 source += t2_url->url;
2236 break;
2237 case token_url_t::sgml:
2238 escape(source, t2_url->url.data(), t2_url->url.size());
2239 break;
2240 case token_url_t::css:
2241 css_escape(source, t2_url->url.data(), t2_url->url.size());
2242 break;
2243 default:
2244 throw std::invalid_argument("unsupported URL encoding");
2245 }
2246 break;
2247 }
2248 default:
2249 throw std::invalid_argument("unsupported token type");
2250 }
2251 }
2252 else if (t->text_type & has_text) {
2253 escape_min(source, root[i]);
2254 i++;
2255 }
2256 else
2257 source += root[i++];
2258 }
2259 }
2260 else if (t->text_type & has_text) {
2261 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2262 escape_min(source, t->text.data(), t->text.size());
2263 }
2264 else
2265 source += t->text;
2266 }
2267
2276 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2277 {
2278 for (; from != new_tokens.cend(); ++from) {
2279 auto t = *from;
2280 t->append_tag(source);
2281 active_tokens.push_back(t);
2282 }
2283 }
2284
2294 token_list::const_iterator end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2295 {
2296 // Skip matching tokens in active_tokens and new_tokens.
2297 token_list::const_iterator i1, i2;
2298 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2299 if (i2 == new_tokens.cend() || *i1 != *i2) {
2300 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2301 // End tokens not relevant anymore in reverse order of starting.
2302 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2303 auto t1 = dynamic_cast<starting_token<T, TR, AX>*>(*(--i));
2304 _Assume_(t1 && t1->type == token_t::starting);
2305
2306 std::unique_ptr<text_token<T, TR, AX>> t2(new text_token<T, TR, AX>(token_t::ending));
2307 t2->text.reserve(t1->name.size() + 3);
2308 t2->text += '<';
2309 t2->text += '/';
2310 t2->text += t1->name;
2311 t2->text += '>';
2312 append_token(std::move(t2), source);
2313
2314 // Pop the active token.
2315 if (i1 == i) {
2316 active_tokens.erase(i);
2317 break;
2318 }
2319 active_tokens.erase(i);
2320 i = active_tokens.cend();
2321 }
2322 break;
2323 }
2324 }
2325 return i2;
2326 }
2327
2337 void append_inserted_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ inserted_token_list& inserted_tokens,
2338 _In_ size_t word_index, _In_ bool after_word,
2339 _Inout_ token_list& active_tokens)
2340 {
2341 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2342 auto& t = *i;
2343 _Assume_(t.token);
2344 if (t.word_index == word_index && t.after_word == after_word) {
2345 if (t.token->type != token_t::ending)
2346 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2347 t.token->append_tag(source);
2348 inserted_tokens.erase(i++);
2349 }
2350 else
2351 ++i;
2352 }
2353 }
2354
2361 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2362 {
2363 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2364 auto t2 = *i2;
2365 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2366 if (i1 == a.end()) {
2367 a.push_back(t2);
2368 break;
2369 }
2370 auto t1 = *i1;
2371 if (t1 == t2)
2372 break;
2373 }
2374 }
2375 }
2376
/// Intended to turn a relative URL into an absolute one using the parser's base URL (m_url).
/// Currently a stub: it returns without touching rel whether or not a base URL is set.
2380 void make_absolute_url(std::basic_string<T, TR, AX>& rel)
2381 {
2382 _Unreferenced_(rel);
2383
2384 if (m_url.empty())
2385 return;
2386
2387 // TODO: Implement!
2388 }
2389
/// Returns the collection of tokens owned by the parser (m_tokens).
2393 const token_vector& tokens() const { return m_tokens; }
2394
2395 protected:
2403 template <class T_token>
2404 T_token* append_token(_Inout_ std::unique_ptr<T_token>&& token)
2405 {
2406 if (!token)
2407 return nullptr;
2408 auto t = token.get();
2409 m_tokens.push_back(std::move(token));
2410 return t;
2411 }
2412
2421 template <class T_token>
2422 size_t append_token(_Inout_ std::unique_ptr<T_token>&& token, _Inout_ std::basic_string<T, TR, AX>& source)
2423 {
2424 if (!token)
2425 return 0;
2426 size_t n = token->append_tag(source);
2427 m_tokens.push_back(std::move(token));
2428 return n;
2429 }
2430
2439 text_token<T, TR, AX>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2440 {
2442 std::unique_ptr<text_token<T, TR, AX>> token(new text_token<T, TR, AX>(
2443 token_t::complete,
2444 nullptr, 0,
2445 text_type,
2446 m_offset != end ? m_offset->get() : nullptr));
2447
2448 while (m_offset != end) {
2449 auto& s = *m_offset;
2450
2451 if (m_progress) {
2452 if (m_progress->cancel())
2453 throw stdex::user_cancelled();
2454 m_progress->set(s->interval.start);
2455 }
2456
2457 // No token_tag_start and token_tag_end chars, please.
2458 _Assume_(
2459 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_start)) == stdex::npos &&
2460 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_end)) == stdex::npos);
2461
2462 if (s->type == stdex::parser::html_sequence_t::text) {
2463 rel.from = s->interval.start;
2464 token->mapping.push_back(rel);
2465 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2466 rel.to = token->text.size();
2467 if (!(token->text_type & has_text) &&
2468 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2469 token->text_type |= has_text;
2470 ++m_offset;
2471 }
2472 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2473 const element* s_el = static_cast<const element*>(s.get());
2474 _Assume_(s_el);
2475 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2476 if (s_el->code == element_t::frameset && !m_parse_frames)
2477 throw std::invalid_argument("<frameset> detected");
2478
2479 {
2480 size_t offset = s->interval.start;
2481 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2482 new text_token<T, TR, AX>(token_t::complete, nullptr, 0, 0, s.get()) :
2483 new starting_token<T, TR, AX>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2484
2485 // Copy the tag contents, but mind any attributes containing localizable text.
2486 for (auto& a : s_el->attributes) {
2487 if (a.value.empty() ||
2488 stdex::isblank(m_source + a.value.start, a.value.size()))
2489 continue;
2490
2491 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
2492 t->text.append(m_source + offset, a.value.start - offset);
2493 std::unique_ptr<url_token<T, TR, AX>> t_url(new url_token<T, TR, AX>(
2494 nullptr, 0,
2495 token_url_t::sgml,
2496 s.get()));
2497 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2498 append_token(std::move(t_url), t->text);
2499 t->text_type |= has_tokens;
2500 offset = a.value.end;
2501 }
2502 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
2503 t->text.append(m_source + offset, a.value.start - offset);
2504 std::unique_ptr<text_token<T, TR, AX>> t_value(new text_token<T, TR, AX>(
2505 token_t::complete,
2506 nullptr, 0,
2507 has_text | is_title,
2508 s.get()));
2509 stdex::mapping<size_t> rel_value(a.value.start, 0);
2510 t_value->mapping.push_back(rel_value);
2511 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2512 append_token(std::move(t_value), t->text);
2513 t->text_type |= has_tokens;
2514 offset = a.value.end;
2515 }
2516 }
2517
2518 t->text.append(m_source + offset, s->interval.end - offset);
2519 rel.from = s->interval.start;
2520 token->mapping.push_back(rel);
2521 rel.to += append_token(std::move(t), token->text);
2522 token->text_type |= has_tokens;
2523 }
2524 ++m_offset;
2525
2526 if (s_el_start) {
2527 if (s_el_start->code == element_t::address ||
2528 s_el_start->code == element_t::code ||
2529 s_el_start->code == element_t::comment ||
2530 s_el_start->code == element_t::cite ||
2531 s_el_start->code == element_t::kbd ||
2532 s_el_start->code == element_t::samp ||
2533 s_el_start->code == element_t::script ||
2534 s_el_start->code == element_t::style)
2535 {
2536 // Non-localizable
2537 auto s_end = s_el_start->end;
2538 _Assume_(s_end);
2539
2540 if (s->interval.end < s_end->interval.start) {
2541 if (s_el_start->code != element_t::style) {
2542 rel.from = s->interval.start;
2543 token->mapping.push_back(rel);
2544 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2546 token_t::complete,
2547 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2548 0,
2549 m_offset->get()))),
2550 token->text);
2551 }
2552 else {
2553 // Partially parse CSS. It may contain URLs we need to make absolute.
2554 auto t = parse_css(s->interval.end, s_end->interval.start);
2555 _Assume_(t);
2556 rel.from = s->interval.start;
2557 token->mapping.push_back(rel);
2558 rel.to += t->append_tag(token->text);
2559 }
2560 token->text_type |= has_tokens;
2561 }
2562 while (m_offset != end && m_offset->get() != s_end)
2563 ++m_offset;
2564 }
2565 else if (element_traits::is_group(s_el_start->code)) {
2566 auto limit = m_offset;
2567 while (limit != end && limit->get() != s_el_start->end)
2568 ++limit;
2569 auto t = parse(limit,
2570 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2571 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2572 rel.from = s->interval.start;
2573 token->mapping.push_back(rel);
2574 rel.to += t->append_tag(token->text);
2575 token->text_type |= has_tokens;
2576 }
2577 }
2578 }
2579 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2580 rel.from = s->interval.start;
2581 token->mapping.push_back(rel);
2582 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2584 token_t::ending,
2585 m_source + s->interval.start, s->interval.size(),
2586 0,
2587 s.get()))),
2588 token->text);
2589 token->text_type |= has_tokens;
2590 ++m_offset;
2591 }
2592 else {
2593 // Declaration, instruction, (P)CDATA section, comment...
2594 rel.from = s->interval.start;
2595 token->mapping.push_back(rel);
2596 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2598 token_t::complete,
2599 m_source + s->interval.start, s->interval.size(),
2600 0,
2601 s.get()))),
2602 token->text);
2603 token->text_type |= has_tokens;
2604 ++m_offset;
2605 }
2606 }
2607
2608 return append_token(std::move(token));
2609 }
2610
2614 text_token<T, TR, AX>* parse_css(size_t start, size_t end)
2615 {
2616 stdex::interval<size_t> section, content;
2617 std::unique_ptr<text_token<T, TR, AX>> token(
2619 token_t::complete,
2620 nullptr, 0,
2621 0,
2622 m_offset->get()));
2623
2624 for (;;) {
2625 if (m_css_comment.match(m_source, start, end)) {
2626 token->text.append(m_source + start, m_css_comment.interval.end - start);
2627 start = m_css_comment.interval.end;
2628 }
2629 else if (m_css_cdo.match(m_source, start, end)) {
2630 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2631 start = m_css_cdo.interval.end;
2632 }
2633 else if (m_css_cdc.match(m_source, start, end)) {
2634 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2635 start = m_css_cdc.interval.end;
2636 }
2637 else if (
2638 (m_css_import.match(m_source, start, end) && ((void)(section = m_css_import.interval), (void)(content = m_css_import.content), true)) ||
2639 (m_css_uri.match(m_source, start, end) && ((void)(section = m_css_uri.interval), (void)(content = m_css_uri.content), true)))
2640 {
2641 std::unique_ptr<url_token<T, TR, AX>> t_url(
2643 nullptr, 0,
2644 token_url_t::css,
2645 m_offset->get()));
2646 css_unescape(t_url->url, m_source + content.start, content.size());
2647 token->text.append(m_source + start, content.start - start);
2648 append_token(std::move(t_url), token->text);
2649 token->text.append(m_source + content.end, section.end - content.end);
2650 token->text_type |= has_tokens;
2651 start = section.end;
2652 }
2653 else if (m_any_char.match(m_source, start, end)) {
2654 token->text.append(m_source + start, m_any_char.interval.end - start);
2655 start = m_any_char.interval.end;
2656 }
2657 else
2658 break;
2659 }
2660
2661 return append_token(std::move(token));
2662 }
2663
2664 protected:
2666 const stdex::sstring m_url; ///< Absolute document URL
2667 const bool m_parse_frames; ///< Parse frames
2669 const T* m_source; ///< HTML source code
2670 token_vector m_tokens; ///< HTML token storage
2671 sequence_store::const_iterator m_offset; ///< Index of active section
2672
2673 // For detecting URLs in CSS
2681 };
2682 }
2683}
HTML comment.
Definition html.hpp:1566
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1575
HTML declaration.
Definition html.hpp:1548
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1558
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1559
HTML document.
Definition html.hpp:1615
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1942
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1889
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1655
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1939
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1940
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1941
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1936
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1950
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1897
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1935
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1951
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1867
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1905
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1879
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1952
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1946
void clear()
Empties document.
Definition html.hpp:1634
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1934
Ending tag of an HTML element </...>
Definition html.hpp:1528
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1540
element_start * start
Corresponding starting tag.
Definition html.hpp:1541
element_t code
Element code.
Definition html.hpp:1539
Starting tag of an HTML element <...>
Definition html.hpp:1512
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1521
HTML element <.../>
Definition html.hpp:1333
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1502
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1503
element_t code
Element code.
Definition html.hpp:1501
HTML instruction.
Definition html.hpp:1582
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1591
HTML parser.
Definition html.hpp:2173
token_vector m_tokens
HTML token storage.
Definition html.hpp:2670
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2337
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2439
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2666
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2189
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2665
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2294
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a creating a union.
Definition html.hpp:2361
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2614
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2276
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2209
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2404
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2671
const T * m_source
HTML source code.
Definition html.hpp:2669
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2668
const bool m_parse_frames
Parse frames.
Definition html.hpp:2667
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2380
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2422
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2393
Base class for HTML sequences.
Definition html.hpp:1314
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1317
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1316
sequence * parent
Parent sequence.
Definition html.hpp:1318
Token representing start HTML tag.
Definition html.hpp:2106
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2124
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2123
Token representing part of HTML text.
Definition html.hpp:2081
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2098
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2097
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2096
HTML token base class.
Definition html.hpp:1990
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2059
uintptr_t data
Any user-supplied data.
Definition html.hpp:2060
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2028
token_t type
Token type.
Definition html.hpp:2058
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2012
HTML token representing a URL.
Definition html.hpp:2141
token_url_t encoding
URL encoding.
Definition html.hpp:2156
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2155
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:120
Test for given string.
Definition parser.hpp:814
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:70
virtual void set(T value)
Set current progress.
Definition progress.hpp:52
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:42
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:685
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1102
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1012
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:926
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1022
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:810
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:980
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1040
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:946
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1233
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:964
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1288
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:858
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1059
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:911
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1085
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:892
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:833
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1133
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:691
HTML entity.
Definition html.hpp:1599
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1601
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1600
Inserted HTML token.
Definition html.hpp:2162
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2166
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2164
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:2165
token * token
Points to the token.
Definition html.hpp:2163
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
Tag attribute.
Definition parser.hpp:8023
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8025