stdex
Additional custom algorithms not covered by Standard C++
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "exception.hpp"
10#include "interval.hpp"
11#include "mapping.hpp"
12#include "parser.hpp"
13#include "progress.hpp"
14#include "sgml.hpp"
15#include "string.hpp"
16#include "system.hpp"
17#include "unicode.hpp"
18#include <exception>
19#include <list>
20#include <map>
21#include <memory>
22#include <stdexcept>
23#include <string_view>
24#include <string>
25#include <vector>
26
27#ifdef _WIN32
28#undef small
29#endif
30
31namespace stdex
32{
33 namespace html
34 {
///
/// Appends an HTML-escaped copy of a narrow string to `dst`
///
/// `&`, `;`, `"`, `'`, `<` and `>` are replaced by `&amp;`, `&semi;`, `&quot;`, `&#x27;`, `&lt;` and `&gt;`.
/// All other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos]) {
        const char ch = src[pos++];
        switch (ch) {
        case '&': dst.append("&amp;"); break;
        case ';': dst.append("&semi;"); break;
        case '\"': dst.append("&quot;"); break;
        case '\'': dst.append("&#x27;"); break;
        case '<': dst.append("&lt;"); break;
        case '>': dst.append("&gt;"); break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        // NOTE(review): where `char` is signed, 0x00a0 lies outside its value range and this label never matches - confirm the intended encoding.
        case 0x00a0: dst.append("&nbsp;"); break;
        default: dst.push_back(ch); break;
        }
    }
}
61
///
/// Appends an HTML-escaped copy of a wide string to `dst`
///
/// `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 are replaced by `&amp;`, `&semi;`, `&quot;`, `&#x27;`, `&lt;`, `&gt;` and `&nbsp;`.
/// All other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    size_t pos = 0;
    while (pos < num_chars && src[pos]) {
        const wchar_t ch = src[pos++];
        switch (ch) {
        case L'&': dst.append(L"&amp;"); break;
        case L';': dst.append(L"&semi;"); break;
        case L'\"': dst.append(L"&quot;"); break;
        case L'\'': dst.append(L"&#x27;"); break;
        case L'<': dst.append(L"&lt;"); break;
        case L'>': dst.append(L"&gt;"); break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        case L'\u00a0': dst.append(L"&nbsp;"); break;
        default: dst.push_back(ch); break;
        }
    }
}
88
///
/// Appends an HTML-escaped copy of a character array to `dst`
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source character array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void escape(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_ const T (&src)[N])
{
    escape(dst, &src[0], N);
}
102
///
/// Appends an HTML-escaped copy of a string to `dst`
///
/// Forwards the string's character buffer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void escape(
    _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string<T, TR_src, AX_src>& src)
{
    const T* const text = src.data();
    const size_t count = src.size();
    escape(dst, text, count);
}
116
///
/// Appends a single narrow character to `dst`, applying minimal HTML escaping
///
/// `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`; any other character is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to escape
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_ char chr)
{
    const char* entity = nullptr;
    switch (chr) {
    case '&': entity = "&amp;"; break;
    case '<': entity = "&lt;"; break;
    case '>': entity = "&gt;"; break;
    // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    // NOTE(review): where `char` is signed, 0x00a0 lies outside its value range and this label never matches - confirm the intended encoding.
    case 0x00a0: entity = "&nbsp;"; break;
    default: break;
    }
    if (entity)
        dst.append(entity);
    else
        dst.push_back(chr);
}
134
///
/// Appends a single wide character to `dst`, applying minimal HTML escaping
///
/// `&`, `<`, `>` and U+00A0 are replaced by `&amp;`, `&lt;`, `&gt;` and `&nbsp;`; any other character is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to escape
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_ wchar_t chr)
{
    const wchar_t* entity = nullptr;
    switch (chr) {
    case L'&': entity = L"&amp;"; break;
    case L'<': entity = L"&lt;"; break;
    case L'>': entity = L"&gt;"; break;
    // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    case L'\u00a0': entity = L"&nbsp;"; break;
    default: break;
    }
    if (entity)
        dst.append(entity);
    else
        dst.push_back(chr);
}
152
///
/// Appends a narrow string to `dst`, applying minimal HTML escaping
///
/// `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`; all other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void escape_min(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars && src[pos]; ++pos) {
        const char ch = src[pos];
        const char* entity = nullptr;
        switch (ch) {
        case '&': entity = "&amp;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        // NOTE(review): where `char` is signed, 0x00a0 lies outside its value range and this label never matches - confirm the intended encoding.
        case 0x00a0: entity = "&nbsp;"; break;
        default: break;
        }
        if (entity)
            dst.append(entity);
        else
            dst.push_back(ch);
    }
}
176
///
/// Appends a wide string to `dst`, applying minimal HTML escaping
///
/// `&`, `<`, `>` and U+00A0 are replaced by `&amp;`, `&lt;`, `&gt;` and `&nbsp;`; all other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void escape_min(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars && src[pos]; ++pos) {
        const wchar_t ch = src[pos];
        const wchar_t* entity = nullptr;
        switch (ch) {
        case L'&': entity = L"&amp;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        case L'\u00a0': entity = L"&nbsp;"; break;
        default: break;
        }
        if (entity)
            dst.append(entity);
        else
            dst.push_back(ch);
    }
}
200
///
/// Appends a character array to `dst`, applying minimal HTML escaping
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source character array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void escape_min(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_ const T (&src)[N])
{
    escape_min(dst, &src[0], N);
}
214
///
/// Appends a string to `dst`, applying minimal HTML escaping
///
/// Forwards the string's character buffer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void escape_min(
    _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string<T, TR_src, AX_src>& src)
{
    const T* const text = src.data();
    const size_t count = src.size();
    escape_min(dst, text, count);
}
228
///
/// Appends a URL-unescaped copy of a string to `dst`
///
/// `+` is decoded as a space and `%XX` (two hexadecimal digits, either case) as the byte with that value.
/// A `%` that is not followed by two hexadecimal digits is copied literally, together with whatever
/// follows it.
///
/// The source buffer is never read at or beyond `num_chars`, so it does not have to be zero terminated
/// (e.g. when it comes from a `std::string_view`).
///
/// \param[in,out] dst        String to append the decoded text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_unescape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);

    // Value of a hexadecimal digit, or -1 when `c` is not one (this includes the zero terminator).
    const auto hex_value = [](char c) -> int {
        if ('0' <= c && c <= '9') return c - '0';
        if ('A' <= c && c <= 'F') return c - 'A' + 10;
        if ('a' <= c && c <= 'f') return c - 'a' + 10;
        return -1;
    };

    for (size_t i = 0; i < num_chars && src[i];) {
        switch (src[i]) {
        case '+':
            dst += ' '; i++;
            break;

        case '%': {
            i++;

            // Bounds-check before every digit read: a `%` at the very end of the range must not
            // pull in bytes that lie outside [src, src + num_chars).
            const int hi = i < num_chars ? hex_value(src[i]) : -1;
            if (hi < 0) { dst += '%'; continue; }
            i++;

            const int lo = i < num_chars ? hex_value(src[i]) : -1;
            if (lo < 0) { dst += '%'; dst += src[i - 1]; continue; }
            i++;

            dst += static_cast<char>(static_cast<uint8_t>((hi << 4) | lo));
            break;
        }

        default:
            dst += src[i++];
        }
    }
}
270
///
/// Appends a URL-unescaped copy of a character array to `dst`
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the decoded text to
/// \param[in]     src  Source character array
///
template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_unescape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_ const char (&src)[N])
{
    url_unescape(dst, &src[0], N);
}
284
///
/// Appends a URL-unescaped copy of a string view to `dst`
///
/// Forwards the view's data pointer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the decoded text to
/// \param[in]     src  Source text
///
template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
void url_unescape(
    _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string_view<char, std::char_traits<char>> src)
{
    const char* const text = src.data();
    const size_t count = src.size();
    url_unescape(dst, text, count);
}
298
///
/// Appends a URL-escaped copy of a string to `dst`
///
/// A space becomes `+`. The characters `< > # % { } | \ ^ ~ [ ] ` ; / ? : @ = & $` and every byte
/// outside the range 0x21..0x7E become `%XX`, where `XX` is the byte value in upper-case hexadecimal.
/// All remaining characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    const char* const hex_digits = "0123456789ABCDEF";
    for (size_t i = 0; i < num_chars && src[i]; ++i) {
        const uint8_t octet = static_cast<uint8_t>(src[i]);
        bool verbatim = 0x20 < octet && octet < 0x7f;
        switch (octet) {
        case ' ':
            dst += "+";
            continue;

        // Printable, but reserved: percent-encode as their own byte value.
        case '<': case '>': case '#': case '%':
        case '{': case '}': case '|': case '\\':
        case '^': case '~': case '[': case ']':
        case '`': case ';': case '/': case '?':
        case ':': case '@': case '=': case '&':
        case '$':
            verbatim = false;
            break;

        default:
            break;
        }
        if (verbatim)
            dst += src[i];
        else {
            dst += '%';
            dst += hex_digits[octet >> 4];
            dst += hex_digits[octet & 0x0f];
        }
    }
}
349
///
/// Appends a URL-escaped copy of a character array to `dst`
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source character array
///
template<size_t N, class TR = std::char_traits<char>, class AX = std::allocator<char>>
void url_escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_ const char (&src)[N])
{
    url_escape(dst, &src[0], N);
}
363
///
/// Appends a URL-escaped copy of a string view to `dst`
///
/// Forwards the view's data pointer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source text
///
template<class TR_dst = std::char_traits<char>, class AX_dst = std::allocator<char>>
void url_escape(
    _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string_view<char, std::char_traits<char>> src)
{
    const char* const text = src.data();
    const size_t count = src.size();
    url_escape(dst, text, count);
}
377
///
/// Appends a CSS-unescaped copy of a string to `dst`
///
/// Recognised escapes:
/// - `\n`, `\r`, `\t` produce line feed, carriage return and tab;
/// - `\` followed by a line feed produces nothing (line continuation);
/// - `\` followed by one to six hexadecimal digits produces the character with that code
///   (converted to `T`); a single space right after the digits is skipped;
/// - `\` followed by any other character produces that character.
///
/// A `\` that is the last character of the input (either the last counted character or the one
/// right before the zero terminator) has nothing to escape; it is dropped and processing stops.
///
/// \param[in,out] dst        String to append the unescaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_unescape(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t i = 0; i < num_chars && src[i];) {
        if (src[i] != '\\') {
            dst += src[i++];
            continue;
        }

        // Trailing backslash: without this check the loop would either never advance
        // (end of counted range) or append the zero terminator and run past it.
        if (i + 1 >= num_chars || !src[i + 1])
            break;

        i++;

        switch (src[i]) {
        // Classic escapes
        case 'n': dst += '\n'; i++; break;
        case 'r': dst += '\r'; i++; break;
        case 't': dst += '\t'; i++; break;

        // `\` at the end of the line
        case '\n': i++; break;

        // `\nnnn` escape
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case 'A': case 'a':
        case 'B': case 'b':
        case 'C': case 'c':
        case 'D': case 'd':
        case 'E': case 'e':
        case 'F': case 'f': {
            wchar_t chr = 0;
            // At most six hexadecimal digits, and never beyond the counted range.
            const size_t end = std::min(num_chars, i + 6);

            for (; i < end; ++i) {
                if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
                else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
                else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
                else break;
            }

            dst += static_cast<T>(chr);

            if (i < end && src[i] == ' ') {
                // Skip space after `\nnnn`.
                i++;
            }
            break;
        }

        default: dst += src[i++];
        }
    }
}
447
///
/// Appends a CSS-unescaped copy of a character array to `dst`
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the unescaped text to
/// \param[in]     src  Source character array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_unescape(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_ const T (&src)[N])
{
    css_unescape(dst, &src[0], N);
}
461
///
/// Appends a CSS-unescaped copy of a string to `dst`
///
/// Forwards the string's character buffer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the unescaped text to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void css_unescape(
    _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string<T, TR_src, AX_src>& src)
{
    const T* const text = src.data();
    const size_t count = src.size();
    css_unescape(dst, text, count);
}
475
///
/// Appends a CSS-escaped copy of a narrow string to `dst`
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as `\\`, `\n`, `\r`, `\t`,
/// `\"` and `\'`; all other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
void css_escape(
    _Inout_ std::basic_string<char, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars && src[pos]; ++pos) {
        const char ch = src[pos];
        const char* replacement = nullptr;
        switch (ch) {
        case '\\': replacement = "\\\\"; break;
        case '\n': replacement = "\\n"; break;
        case '\r': replacement = "\\r"; break;
        case '\t': replacement = "\\t"; break;
        case '\"': replacement = "\\\""; break;
        case '\'': replacement = "\\'"; break;
        default: break;
        }
        if (replacement)
            dst.append(replacement);
        else
            dst.push_back(ch);
    }
}
501
///
/// Appends a CSS-escaped copy of a wide string to `dst`
///
/// Backslash, line feed, carriage return, tab, `"` and `'` are written as `\\`, `\n`, `\r`, `\t`,
/// `\"` and `\'`; all other characters are copied unchanged.
///
/// \param[in,out] dst        String to append the escaped text to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters of `src` to process; processing also stops at the first zero character
///
template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
void css_escape(
    _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars && src[pos]; ++pos) {
        const wchar_t ch = src[pos];
        const wchar_t* replacement = nullptr;
        switch (ch) {
        case L'\\': replacement = L"\\\\"; break;
        case L'\n': replacement = L"\\n"; break;
        case L'\r': replacement = L"\\r"; break;
        case L'\t': replacement = L"\\t"; break;
        case L'\"': replacement = L"\\\""; break;
        case L'\'': replacement = L"\\'"; break;
        default: break;
        }
        if (replacement)
            dst.append(replacement);
        else
            dst.push_back(ch);
    }
}
527
///
/// Appends a CSS-escaped copy of a character array to `dst`
///
/// Forwards to the pointer-and-count overload, using the array bound `N` as the character count.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source character array
///
template<class T, size_t N, class TR = std::char_traits<T>, class AX = std::allocator<T>>
void css_escape(
    _Inout_ std::basic_string<T, TR, AX>& dst,
    _In_ const T (&src)[N])
{
    css_escape(dst, &src[0], N);
}
541
///
/// Appends a CSS-escaped copy of a string to `dst`
///
/// Forwards the string's character buffer and size to the pointer-and-count overload.
///
/// \param[in,out] dst  String to append the escaped text to
/// \param[in]     src  Source string
///
template<class T, class TR_dst = std::char_traits<T>, class AX_dst = std::allocator<T>, class TR_src = std::char_traits<T>, class AX_src = std::allocator<T>>
void css_escape(
    _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
    _In_ const std::basic_string<T, TR_src, AX_src>& src)
{
    const T* const text = src.data();
    const size_t count = src.size();
    css_escape(dst, text, count);
}
555
///
/// HTML element type
///
/// `empty` is 0 and the named elements follow consecutively in alphabetical order, from `a` to `xmp`.
/// Negative values are reserved for pseudo-elements.
///
/// \note The order and numbering of `a`..`xmp` must not change without reviewing code that
///       indexes tables by these values.
///
enum class element_t {
    empty = 0,

    // Named elements, alphabetical; one initial letter per row.
    a, abbr, acronym, address, applet, area,
    b, base, basefont, bdo,
    bgsound, // Microsoft Specific
    big,
    blink, // Microsoft Specific
    blockquote, body, br, button,
    caption, center, cite, code, col, colgroup,
    comment, // Microsoft Specific
    dd, del, dfn, dir, div, dl, dt,
    em,
    embed, // Microsoft Specific
    fieldset, font, form, frame, frameset,
    h1, h2, h3, h4, h5, h6, head, hr, html,
    i, iframe, img, input, ins, isindex,
    kbd,
    label, legend, li, link,
    listing, // Microsoft Specific
    map,
    marquee, // Microsoft Specific
    menu, meta,
    nextid, // Microsoft Specific
    nobr, // Microsoft Specific
    noembed, // Microsoft Specific
    noframes, noscript,
    object, ol, optgroup, option,
    p, param,
    plaintext, // Microsoft Specific
    pre,
    q,
    rt, // Microsoft Specific
    ruby, // Microsoft Specific
    s, samp, script, select, small, span, strike, strong, style, sub, sup,
    table, tbody, td, textarea, tfoot, th, thead, title, tr, tt,
    u, ul,
    var,
    wbr, // Microsoft Specific
    xmp, // Microsoft Specific

    // Pseudo-elements
    unknown = -1,
    PCDATA = -2,
    CDATA = -3,
};
671
///
/// HTML element span
///
/// The enumerator names suggest how an element is terminated; the comments below are read from
/// those names only.
///
enum class element_span_t {
    needs_end = 0, ///< End tag needed (presumably mandatory)
    end_optional,  ///< End tag may be left out
    immediate,     ///< Presumably an element without content or end tag
};
680
685 {
691 static element_span_t span(_In_ element_t code)
692 {
693 static element_span_t lookup[] = {
694 element_span_t::needs_end, // a
695 element_span_t::needs_end, // abbr
696 element_span_t::needs_end, // acronym
697 element_span_t::needs_end, // address
698 element_span_t::needs_end, // applet
699 element_span_t::immediate, // area
700 element_span_t::needs_end, // b
701 element_span_t::immediate, // base
702 element_span_t::immediate, // basefont
703 element_span_t::needs_end, // bdo
704 element_span_t::immediate, // bgsound
705 element_span_t::needs_end, // big
706 element_span_t::needs_end, // blink
707 element_span_t::needs_end, // blockquote
708 element_span_t::end_optional, // body
709 element_span_t::immediate, // br
710 element_span_t::needs_end, // button
711 element_span_t::needs_end, // caption
712 element_span_t::needs_end, // center
713 element_span_t::needs_end, // cite
714 element_span_t::needs_end, // code
715 element_span_t::immediate, // col
716 element_span_t::end_optional, // colgroup
717 element_span_t::needs_end, // comment
718 element_span_t::end_optional, // dd
719 element_span_t::needs_end, // del
720 element_span_t::needs_end, // dfn
721 element_span_t::needs_end, // dir
722 element_span_t::needs_end, // div
723 element_span_t::needs_end, // dl
724 element_span_t::end_optional, // dt
725 element_span_t::needs_end, // em
726 element_span_t::immediate, // embed
727 element_span_t::needs_end, // fieldset
728 element_span_t::needs_end, // font
729 element_span_t::needs_end, // form
730 element_span_t::immediate, // frame
731 element_span_t::needs_end, // frameset
732 element_span_t::needs_end, // h1
733 element_span_t::needs_end, // h2
734 element_span_t::needs_end, // h3
735 element_span_t::needs_end, // h4
736 element_span_t::needs_end, // h5
737 element_span_t::needs_end, // h6
738 element_span_t::end_optional, // head
739 element_span_t::immediate, // hr
740 element_span_t::end_optional, // html
741 element_span_t::needs_end, // i
742 element_span_t::needs_end, // iframe
743 element_span_t::immediate, // img
744 element_span_t::immediate, // input
745 element_span_t::needs_end, // ins
746 element_span_t::immediate, // isindex
747 element_span_t::needs_end, // kbd
748 element_span_t::needs_end, // label
749 element_span_t::needs_end, // legend
750 element_span_t::end_optional, // li
751 element_span_t::immediate, // link
752 element_span_t::needs_end, // listing
753 element_span_t::needs_end, // map
754 element_span_t::needs_end, // marquee
755 element_span_t::needs_end, // menu
756 element_span_t::immediate, // meta
757 element_span_t::immediate, // nextid
758 element_span_t::needs_end, // nobr
759 element_span_t::needs_end, // noembed
760 element_span_t::needs_end, // noframes
761 element_span_t::needs_end, // noscript
762 element_span_t::needs_end, // object
763 element_span_t::needs_end, // ol
764 element_span_t::needs_end, // optgroup
765 element_span_t::end_optional, // option
766 element_span_t::end_optional, // p
767 element_span_t::immediate, // param
768 element_span_t::end_optional, // plaintext
769 element_span_t::needs_end, // pre
770 element_span_t::needs_end, // q
771 element_span_t::immediate, // rt
772 element_span_t::needs_end, // ruby
773 element_span_t::needs_end, // s
774 element_span_t::needs_end, // samp
775 element_span_t::needs_end, // script
776 element_span_t::needs_end, // select
777 element_span_t::needs_end, // small
778 element_span_t::needs_end, // span
779 element_span_t::needs_end, // strike
780 element_span_t::needs_end, // strong
781 element_span_t::needs_end, // style
782 element_span_t::needs_end, // sub
783 element_span_t::needs_end, // sup
784 element_span_t::needs_end, // table
785 element_span_t::end_optional, // tbody
786 element_span_t::end_optional, // td
787 element_span_t::needs_end, // textarea
788 element_span_t::end_optional, // tfoot
789 element_span_t::end_optional, // th
790 element_span_t::end_optional, // thead
791 element_span_t::needs_end, // title
792 element_span_t::end_optional, // tr
793 element_span_t::needs_end, // tt
794 element_span_t::needs_end, // u
795 element_span_t::needs_end, // ul
796 element_span_t::needs_end, // var
797 element_span_t::immediate, // wbr
798 element_span_t::needs_end, // xmp
799 };
800 return element_t::a <= code && code <= element_t::xmp ?
801 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
802 element_span_t::needs_end;
803 }
804
810 static bool is_fontstyle(_In_ element_t code)
811 {
812 switch (code) {
813 case element_t::tt:
814 case element_t::i:
815 case element_t::b:
816 case element_t::u:
817 case element_t::s:
818 case element_t::strike:
819 case element_t::blink:
820 case element_t::big:
821 case element_t::small:
822 return true;
823 };
824 return false;
825 }
826
832 static bool is_phrase(_In_ element_t code)
833 {
834 switch (code) {
835 case element_t::em:
836 case element_t::strong:
837 case element_t::dfn:
838 case element_t::code:
839 case element_t::samp:
840 case element_t::kbd:
841 case element_t::var:
842 case element_t::cite:
843 case element_t::abbr:
844 case element_t::acronym:
845 case element_t::xmp:
846 return true;
847 };
848 return false;
849 }
850
856 static bool is_special(_In_ element_t code)
857 {
858 switch (code) {
859 case element_t::a:
860 case element_t::img:
861 case element_t::applet:
862 case element_t::object:
863 case element_t::embed:
864 case element_t::font:
865 case element_t::basefont:
866 case element_t::br:
867 case element_t::wbr:
868 case element_t::rt:
869 case element_t::script:
870 case element_t::map:
871 case element_t::q:
872 case element_t::sub:
873 case element_t::sup:
874 case element_t::ruby:
875 case element_t::span:
876 case element_t::bdo:
877 case element_t::iframe:
878 case element_t::nobr:
879 return true;
880 };
881 return false;
882 }
883
889 static bool is_formctrl(_In_ element_t code)
890 {
891 switch (code) {
892 case element_t::input:
893 case element_t::select:
894 case element_t::textarea:
895 case element_t::label:
896 case element_t::button:
897 return true;
898 };
899 return false;
900 }
901
907 static bool is_inline(_In_ element_t code)
908 {
909 return
910 code == element_t::PCDATA ||
911 is_fontstyle(code) ||
912 is_phrase(code) ||
913 is_special(code) ||
914 is_formctrl(code);
915 }
916
922 static bool is_heading(_In_ element_t code)
923 {
924 switch (code) {
925 case element_t::h1:
926 case element_t::h2:
927 case element_t::h3:
928 case element_t::h4:
929 case element_t::h5:
930 case element_t::h6:
931 return true;
932 };
933 return false;
934 }
935
941 static bool is_list(_In_ element_t code)
942 {
943 switch (code) {
944 case element_t::ul:
945 case element_t::ol:
946 case element_t::dir:
947 case element_t::menu:
948 return true;
949 };
950 return false;
951 }
952
958 static bool is_preformatted(_In_ element_t code)
959 {
960 switch (code) {
961 case element_t::pre:
962 case element_t::listing:
963 return true;
964 }
965 return false;
966 }
967
973 static bool is_block(_In_ element_t code)
974 {
975 if (is_heading(code) ||
976 is_list(code) ||
977 is_preformatted(code)) return true;
978 switch (code) {
979 case element_t::p:
980 case element_t::dl:
981 case element_t::div:
982 case element_t::center:
983 case element_t::marquee:
984 case element_t::noscript:
985 case element_t::noframes:
986 case element_t::noembed:
987 case element_t::blockquote:
988 case element_t::form:
989 case element_t::isindex:
990 case element_t::hr:
991 case element_t::table:
992 case element_t::fieldset:
993 case element_t::address:
994 return true;
995 };
996 return false;
997 }
998
1004 static bool is_flow(_In_ element_t code)
1005 {
1006 return is_block(code) || is_inline(code);
1007 }
1008
1014 static bool is_head_content(_In_ element_t code)
1015 {
1016 switch (code) {
1017 case element_t::title:
1018 case element_t::isindex:
1019 case element_t::base:
1020 case element_t::nextid:
1021 return true;
1022 };
1023 return false;
1024 }
1025
1031 static bool is_head_misc(_In_ element_t code)
1032 {
1033 switch (code) {
1034 case element_t::script:
1035 case element_t::style:
1036 case element_t::meta:
1037 case element_t::link:
1038 case element_t::object:
1039 return true;
1040 };
1041 return false;
1042 }
1043
1049 static bool is_pre_exclusion(_In_ element_t code)
1050 {
1051 switch (code) {
1052 case element_t::img:
1053 case element_t::object:
1054 case element_t::applet:
1055 case element_t::embed:
1056 case element_t::big:
1057 case element_t::small:
1058 case element_t::sub:
1059 case element_t::sup:
1060 case element_t::ruby:
1061 case element_t::font:
1062 case element_t::basefont:
1063 case element_t::nobr:
1064 return true;
1065 };
1066 return false;
1067 }
1068
1074 static bool is_html_content(_In_ element_t code)
1075 {
1076 switch (code) {
1077 case element_t::head:
1078 case element_t::body:
1079 case element_t::frameset:
1080 return true;
1081 };
1082 return false;
1083 }
1084
1090 static bool is_group(_In_ element_t code)
1091 {
1092 if (is_block(code) ||
1093 is_html_content(code) ||
1094 is_head_content(code)) return true;
1095 switch (code) {
1096 case element_t::col:
1097 case element_t::colgroup:
1098 case element_t::dd:
1099 case element_t::dir:
1100 case element_t::dt:
1101 case element_t::frame:
1102 case element_t::iframe:
1103 case element_t::legend:
1104 case element_t::td:
1105 case element_t::th:
1106 case element_t::tr:
1107 return true;
1108 };
1109 return false;
1110 }
1111
1120 static bool may_contain(_In_ element_t parent, _In_ element_t child)
1121 {
1122 if (child == element_t::unknown || child == element_t::comment)
1123 return true;
1124 if (is_fontstyle(parent) || is_phrase(parent))
1125 return is_inline(child);
1126 if (is_heading(parent))
1127 return is_inline(child);
1128
1129 switch (parent) {
1130 case element_t::a: return is_inline(child) && child != element_t::a;
1131 case element_t::address: return is_inline(child) || child == element_t::p;
1132 case element_t::applet: return is_flow(child) || child == element_t::param;
1133 case element_t::area: return false;
1134 case element_t::base: return false;
1135 case element_t::basefont: return false;
1136 case element_t::bdo: return is_inline(child);
1137 case element_t::blockquote: return is_flow(child);
1138 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
1139 case element_t::br: return false;
1140 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1141 case element_t::caption: return is_inline(child);
1142 case element_t::center: return is_flow(child);
1143 case element_t::col: return false;
1144 case element_t::colgroup: return child == element_t::col;
1145 case element_t::comment: return child == element_t::CDATA;
1146 case element_t::dd: return is_flow(child);
1147 case element_t::del: return is_flow(child);
1148 case element_t::dir: return child == element_t::li;
1149 case element_t::div: return is_flow(child);
1150 case element_t::dl: return child == element_t::dt || child == element_t::dd;
1151 case element_t::dt: return is_inline(child);
1152 case element_t::embed: return is_flow(child) || child == element_t::param;
1153 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1154 case element_t::font: return is_inline(child);
1155 case element_t::form: return is_flow(child) && child != element_t::form;
1156 case element_t::frame: return false;
1157 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1158 case element_t::head: return is_head_content(child) || is_head_misc(child);
1159 case element_t::hr: return false;
1160 case element_t::html: return is_html_content(child);
1161 case element_t::iframe: return is_flow(child);
1162 case element_t::img: return false;
1163 case element_t::input: return false;
1164 case element_t::ins: return is_flow(child);
1165 case element_t::isindex: return false;
1166 case element_t::label: return is_inline(child) && child != element_t::label;
1167 case element_t::legend: return is_inline(child);
1168 case element_t::li: return is_flow(child);
1169 case element_t::link: return false;
1170 case element_t::listing: return child == element_t::CDATA;
1171 case element_t::map: return is_block(child) || child == element_t::area;
1172 case element_t::marquee: return is_flow(child);
1173 case element_t::menu: return child == element_t::li;
1174 case element_t::meta: return false;
1175 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1176 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1177 case element_t::noscript: return is_flow(child);
1178 case element_t::noembed: return is_flow(child);
1179 case element_t::object: return is_flow(child) || child == element_t::param;
1180 case element_t::ol: return child == element_t::li;
1181 case element_t::optgroup: return child == element_t::option;
1182 case element_t::option: return child == element_t::PCDATA;
1183 case element_t::p: return is_inline(child);
1184 case element_t::param: return false;
1185 case element_t::plaintext: return is_flow(child);
1186 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1187 case element_t::q: return is_inline(child);
1188 case element_t::rt: return false;
1189 case element_t::ruby: return is_inline(child);
1190 case element_t::script: return child == element_t::CDATA;
1191 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1192 case element_t::span: return is_inline(child);
1193 case element_t::style: return child == element_t::CDATA;
1194 case element_t::sub: return is_inline(child);
1195 case element_t::sup: return is_inline(child);
1196 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1197 case element_t::tbody: return child == element_t::tr;
1198 case element_t::td: return is_flow(child);
1199 case element_t::textarea: return child == element_t::PCDATA;
1200 case element_t::tfoot: return child == element_t::tr;
1201 case element_t::th: return is_flow(child);
1202 case element_t::thead: return child == element_t::tr;
1203 case element_t::title: return child == element_t::PCDATA;
1204 case element_t::tr: return child == element_t::td || child == element_t::th;
1205 case element_t::ul: return child == element_t::li;
1206 case element_t::wbr: return false;
1207 case element_t::unknown: return true;
1208 }
1209 return false;
1210 }
1211
1219 template <class T>
1220 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1221 {
1222 _Assume_(attr_name || !num_chars);
1223 switch (code) {
1224 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1225 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1226 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1227 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1228 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1229 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1230 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1231 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1232 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1233 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX);
1234 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1235 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) ||
1236 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1237 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX);
1238 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1239 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1240 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX);
1241 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1242 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1243 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1244 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1245 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1246 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1247 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1248 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1249 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1250 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1251 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1252 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) ||
1253 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) ||
1254 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1256 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) ||
1257 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1258 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1259 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1260 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1261 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1262 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1263 }
1264 return false;
1265 }
1266
1274 template <class T>
1275 static bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1276 {
1277 _Assume_(attr_name || !num_chars);
1278 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX))
1279 return true;
1280 switch (code) {
1281 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1282 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1283 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1284 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1285 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1286 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX);
1287 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1288 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1289 }
1290 return false;
1291 }
1292 };
1293
// Forward declaration, needed by the sequence_store alias below.
1294 class sequence;
// Container owning sequences through std::unique_ptr.
1295 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1296
// Base class of the parsed sequences; element, element_end, declaration,
// comment and instruction derive from it.
1301 {
1302 public:
// Kind of this sequence.
1303 stdex::parser::html_sequence_t type;
1306
// Constructs a sequence. _type is stored in type, start/end initialize the
// interval member and _parent initializes the parent member. Every argument
// has a default, so this also serves as the default constructor.
1307 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1308 type(_type),
1309 interval(start, end),
1310 parent(_parent)
1311 {}
1312
1313 virtual ~sequence() {} // make polymorphic
1314 };
1315
1319 class element : public sequence
1320 {
1321 public:
// Constructs an element from a parsed tag.
// The base sequence receives the tag type and interval; the element code is
// looked up from the tag name text inside src (src + tag.name.start); the name
// and the attributes are moved out of tag.
1322 template <class T>
1323 element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1324 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1325 code(element_code(src + tag.name.start, tag.name.size())),
1326 name(std::move(tag.name)),
1327 attributes(std::move(tag.attributes))
1328 {}
1329
1330 template <class T>
1331 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1332 {
1333 static const struct {
1334 const char* name;
1335 element_t code;
1336 } mapping[] = {
1337 { "a", element_t::a, },
1338 { "abbr", element_t::abbr, },
1339 { "acronym", element_t::acronym, },
1340 { "address", element_t::address, },
1341 { "applet", element_t::applet, },
1342 { "area", element_t::area, },
1343 { "b", element_t::b, },
1344 { "base", element_t::base, },
1345 { "basefont", element_t::basefont, },
1346 { "bdo", element_t::bdo, },
1347 { "bgsound", element_t::bgsound, },
1348 { "big", element_t::big, },
1349 { "blink", element_t::blink, },
1350 { "blockquote", element_t::blockquote, },
1351 { "body", element_t::body, },
1352 { "br", element_t::br, },
1353 { "button", element_t::button, },
1354 { "caption", element_t::caption, },
1355 { "center", element_t::center, },
1356 { "cite", element_t::cite, },
1357 { "code", element_t::code, },
1358 { "col", element_t::col, },
1359 { "colgroup", element_t::colgroup, },
1360 { "comment", element_t::comment, },
1361 { "dd", element_t::dd, },
1362 { "del", element_t::del, },
1363 { "dfn", element_t::dfn, },
1364 { "dir", element_t::dir, },
1365 { "div", element_t::div, },
1366 { "dl", element_t::dl, },
1367 { "dt", element_t::dt, },
1368 { "em", element_t::em, },
1369 { "embed", element_t::embed, },
1370 { "fieldset", element_t::fieldset, },
1371 { "font", element_t::font, },
1372 { "form", element_t::form, },
1373 { "frame", element_t::frame, },
1374 { "frameset", element_t::frameset, },
1375 { "h1", element_t::h1, },
1376 { "h2", element_t::h2, },
1377 { "h3", element_t::h3, },
1378 { "h4", element_t::h4, },
1379 { "h5", element_t::h5, },
1380 { "h6", element_t::h6, },
1381 { "head", element_t::head, },
1382 { "hr", element_t::hr, },
1383 { "html", element_t::html, },
1384 { "i", element_t::i, },
1385 { "iframe", element_t::iframe, },
1386 { "img", element_t::img, },
1387 { "input", element_t::input, },
1388 { "ins", element_t::ins, },
1389 { "isindex", element_t::isindex, },
1390 { "kbd", element_t::kbd, },
1391 { "label", element_t::label, },
1392 { "legend", element_t::legend, },
1393 { "li", element_t::li, },
1394 { "link", element_t::link, },
1395 { "listing", element_t::listing, },
1396 { "map", element_t::map, },
1397 { "marquee", element_t::marquee, },
1398 { "menu", element_t::menu, },
1399 { "meta", element_t::meta, },
1400 { "nextid", element_t::nextid, },
1401 { "nobr", element_t::nobr, },
1402 { "noembed", element_t::noembed, },
1403 { "noframes", element_t::noframes, },
1404 { "noscript", element_t::noscript, },
1405 { "object", element_t::object, },
1406 { "ol", element_t::ol, },
1407 { "optgroup", element_t::optgroup, },
1408 { "option", element_t::option, },
1409 { "p", element_t::p, },
1410 { "param", element_t::param, },
1411 { "plaintext", element_t::plaintext, },
1412 { "pre", element_t::pre, },
1413 { "q", element_t::q, },
1414 { "rt", element_t::rt, },
1415 { "ruby", element_t::ruby, },
1416 { "s", element_t::s, },
1417 { "samp", element_t::samp, },
1418 { "script", element_t::script, },
1419 { "select", element_t::select, },
1420 { "small", element_t::small, },
1421 { "span", element_t::span, },
1422 { "strike", element_t::strike, },
1423 { "strong", element_t::strong, },
1424 { "style", element_t::style, },
1425 { "sub", element_t::sub, },
1426 { "sup", element_t::sup, },
1427 { "table", element_t::table, },
1428 { "tbody", element_t::tbody, },
1429 { "td", element_t::td, },
1430 { "textarea", element_t::textarea, },
1431 { "tfoot", element_t::tfoot, },
1432 { "th", element_t::th, },
1433 { "thead", element_t::thead, },
1434 { "title", element_t::title, },
1435 { "tr", element_t::tr, },
1436 { "tt", element_t::tt, },
1437 { "u", element_t::u, },
1438 { "ul", element_t::ul, },
1439 { "var", element_t::var, },
1440 { "wbr", element_t::wbr, },
1441 { "xmp", element_t::xmp, },
1442 };
1443#ifdef _DEBUG
1444 // The mapping table MUST be sorted and all names in lowercase.
1445 for (size_t i = 1; i < _countof(mapping); i++)
1446 _Assume_(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1447 for (size_t i = 0; i < _countof(mapping); i++) {
1448 for (size_t j = 0; mapping[i].name[j]; j++)
1449 _Assume_(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1450 }
1451#endif
1452 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1453 size_t m = (i + j) / 2;
1454 int r = 0;
1455 for (size_t i1 = 0, i2 = 0;;) {
1456 if (!mapping[m].name[i1]) {
1457 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1458 break;
1459 }
1460 if (i2 >= num_chars || !name[i2]) {
1461 r = 1;
1462 break;
1463 }
1464
1465 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1466 if (mapping[m].name[i1] > chr) {
1467 r = 1;
1468 break;
1469 }
1470 if (mapping[m].name[i1] < chr) {
1471 r = -1;
1472 break;
1473 }
1474 i1++;
1475 }
1476
1477 if (r < 0)
1478 i = m + 1;
1479 else if (r > 0)
1480 j = m;
1481 else
1482 return mapping[m].code;
1483 }
1484 return element_t::unknown;
1485 }
1486
1487 public:
// Element code; set by the constructor from element_code().
1488 element_t code;
// Tag attributes; moved from the parsed tag by the constructor.
1490 std::vector<stdex::parser::html_attribute> attributes;
1491 };
1492
1493 class element_end;
1494
// Element created from a start tag; adds the `end` link to class element.
1498 class element_start : public element
1499 {
1500 public:
// Constructs the element from the parsed tag (see element) and stores _end
// in the `end` member.
1501 template <class T>
1502 element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1503 element(std::move(tag), src, parent),
1504 end(_end)
1505 {}
1506
1507 public:
1509 };
1510
// Sequence created from an element end tag.
1514 class element_end : public sequence
1515 {
1516 public:
// Constructs the end tag sequence: the element code is looked up from the tag
// name text inside src, the name is moved out of tag and _start is stored in
// the `start` member.
1517 template <class T>
1518 element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1519 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1520 code(element::element_code(src + tag.name.start, tag.name.size())),
1521 name(std::move(tag.name)),
1522 start(_start)
1523 {}
1524
1525 public:
// Element code resolved from the tag name.
1526 element_t code;
1529 };
1530
// Sequence created from a declaration tag.
1534 class declaration : public sequence
1535 {
1536 public:
// Constructs the declaration; name and attributes are moved out of tag.
1537 template <class T>
1538 declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1539 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1540 name(std::move(tag.name)),
1541 attributes(std::move(tag.attributes))
1542 {}
1543
1544 public:
// Declaration attributes; moved from the parsed tag by the constructor.
1546 std::vector<stdex::parser::html_attribute> attributes;
1547 };
1548
// Sequence created from a comment tag.
1552 class comment : public sequence
1553 {
1554 public:
// Constructs the comment; the tag's name field is moved into `content`.
1555 template <class T>
1556 comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1557 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1558 content(std::move(tag.name))
1559 {}
1560
1561 public:
1563 };
1564
// Sequence created from an instruction tag.
1568 class instruction : public sequence
1569 {
1570 public:
// Constructs the instruction; the tag's name field is moved into `content`.
1571 template <class T>
1572 instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1573 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1574 content(std::move(tag.name))
1575 {}
1576
1577 public:
1579 };
1580
// Entity collected by document::append() from an entity declaration and
// substituted by document::replace_entities().
1584 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1585 struct entity
1586 {
// Replacement text of the entity.
1588 std::basic_string<T, TR, AX> value;
1589 };
1590
1594 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1595 class parser;
1596
1600 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
1602 {
1603 public:
// Constructs an empty document: nothing parsed yet, charset set to
// stdex::charset_id::system, not inside a CDATA or RCDATA section.
1604 document() :
1605 m_num_parsed(0),
1606 m_charset(stdex::charset_id::system),
1607
1608 // Declaration parsing data
1611 m_is_cdata(false),
1612 m_is_rcdata(false),
1613
1614 // Element parsing data
1616 {}
1617
// Resets the document: drops the source text, the collected entities, the
// parsed sequences and the element stack, and restores the initial parsing
// state.
1621 void clear()
1622 {
1623 m_source.clear();
1624 m_num_parsed = 0;
1625 m_charset = stdex::charset_id::system;
1626
1627 // Declaration parsing data
1629 m_is_cdata = m_is_rcdata = false;
1630 m_entities.clear();
1631
1632 // Element parsing data
1633 m_sequences.clear();
1634
1635 m_element_stack.clear();
1636 m_is_special_element = false;
1637 }
1638
// Appends source text to the document and parses as much of the accumulated
// text as possible.
//
// Recognized ranges are added to m_sequences. m_num_parsed is the position up
// to which text has been committed to a sequence; text after it is re-examined
// on the next call.
//
// source     Text to append
// num_chars  Number of characters in source (the length passed to
//            m_source.append is stdex::strnlen(source, num_chars))
//
// Throws std::invalid_argument for an unknown tag type. replace_entities()
// may throw std::runtime_error for an undefined entity.
1642 void append(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1643 {
1644 _Assume_(source || !num_chars);
1645 m_source.append(source, stdex::strnlen(source, num_chars));
// From here on source/num_chars describe the whole accumulated buffer.
1646 source = m_source.data();
1647 num_chars = m_source.size();
1648
// Resume at the first character not yet committed to a sequence.
1649 for (size_t i = m_num_parsed; i < num_chars;) {
// Inside a CDATA/RCDATA section: only the section end is recognized.
1650 if (m_is_cdata || m_is_rcdata) {
1651 if (m_condition_end.match(source, i, num_chars)) {
// Everything since m_num_parsed becomes one CDATA (or PCDATA for RCDATA) sequence.
1652 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1653 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1654 m_num_parsed, i,
1655 active_element()))));
1656 m_is_cdata = m_is_rcdata = false;
1657 i = m_num_parsed = m_condition_end.interval.end;
1658 continue;
1659 }
1660 goto next_char;
1661 }
1662
// NOTE(review): the statement opening this branch is not present here
// (numbering skips 1663 and 1665); confirm against the original header.
1664 if (m_condition_end.match(source, i, num_chars)) {
1666 i = m_num_parsed = m_condition_end.interval.end;
1667 continue;
1668 }
1669 goto next_char;
1670 }
1671
// End of a condition while at least one condition is open: flush pending text first.
1672 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1673 if (m_num_parsed < i)
1674 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1675
1677 i = m_num_parsed = m_condition_end.interval.end;
1678 continue;
1679 }
1680
// Start of a condition: the keyword (after entity replacement) selects the mode.
1681 if (m_condition_start.match(source, i, num_chars)) {
1682 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1683 if (stdex::strncmp(condition_src.data(), condition_src.size(), "CDATA", SIZE_MAX) == 0)
1684 m_is_cdata = true;
1685 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "RCDATA", SIZE_MAX) == 0)
1686 m_is_rcdata = true;
1689 else if (stdex::strncmp(condition_src.data(), condition_src.size(), "IGNORE", SIZE_MAX) == 0)
1691 else
1693
1694 i = m_num_parsed = m_condition_start.interval.end;
1695 continue;
1696 }
1697
// NOTE(review): the statement opening this branch is not present here
// (numbering skips 1698); presumably it tests m_is_special_element - confirm.
// Only an end tag whose code equals the active element's code is acted upon.
1699 auto parent = active_element();
1700 _Assume_(parent);
1701 if (m_tag.match(source, i, num_chars) &&
1702 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1703 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1704 {
1705 if (m_num_parsed < i)
1706 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1707 i = m_num_parsed = m_tag.interval.end;
// Link start and end tags to each other and leave the special element.
1708 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1709 parent->end = e.get();
1710 m_sequences.push_back(std::move(e));
1711 m_element_stack.pop_back();
1712 m_is_special_element = false;
1713 continue;
1714 }
1715 goto next_char;
1716 }
1717
// A tag starts here: flush the text before it, then dispatch on the tag type.
1718 if (m_tag.match(source, i, num_chars)) {
1719 if (m_num_parsed < i)
1720 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1721 i = m_num_parsed = m_tag.interval.end;
1722
1723 switch (m_tag.type) {
1724 case stdex::parser::html_sequence_t::element:
1725 case stdex::parser::html_sequence_t::element_start: {
1726 std::unique_ptr<element> e(
1727 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1728 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1729 nullptr);
1730
// Walk the stack from the top: started elements that may not contain the new
// element are closed (their end points at the new element, the stack is
// truncated); the first one that may contain it becomes the parent.
1731 // Does this tag end any of the started elements?
1732 for (size_t j = m_element_stack.size(); j--; ) {
1733 auto starting_tag = m_element_stack[j];
1734 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1735 if (element_traits::may_contain(starting_tag->code, e->code)) {
1736 e->parent = starting_tag;
1737 break;
1738 }
1739 e->parent = starting_tag->parent;
1740 starting_tag->end = e.get();
1741 m_element_stack.resize(j);
1742 }
1743
1744 if (e->type == stdex::parser::html_sequence_t::element_start) {
1745 auto e_start = static_cast<element_start*>(e.get());
// Elements with an immediate span end at their own start tag; others are pushed on the stack.
1746 if (element_traits::span(e->code) == element_span_t::immediate)
1747 e_start->end = e.get();
1748 else {
1749 m_element_stack.push_back(e_start);
1750 switch (e->code) {
1751 case element_t::code:
1752 case element_t::comment:
1753 case element_t::script:
1754 case element_t::style:
1755 m_is_special_element = true;
1756 break;
1757 }
1758 }
1759 }
1760
// Charset detection, done only while the charset is still the system default.
1761 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1762 bool is_content_type = false;
1763 stdex::parser::html_attribute* content_attr = nullptr;
1764 for (auto& attr : e->attributes) {
1765 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) &&
1766 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX))
1767 is_content_type = true;
1768 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX))
1769 content_attr = &attr;
1770 }
1771 if (is_content_type && content_attr) {
1772 // <meta http-equiv="Content-Type" content="..."> found.
// NOTE(review): the declaration of `content` is not present here (numbering skips 1773).
1774 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1775 content.charset)
1776 {
// Copy the charset name into a char string, one character at a time.
1777 std::string str;
1778 str.reserve(content.charset.size());
1779 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1780 str.push_back(static_cast<char>(source[j]));
1781 m_charset = stdex::charset_from_name(str);
1782 }
1783 }
1784 }
1785
1786 m_sequences.push_back(std::move(e));
1787 break;
1788 }
1789 case stdex::parser::html_sequence_t::element_end: {
1790 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1791
// Find the nearest started element this end tag closes: same code, or both
// unknown with names that compare equal under stdex::strnicmp.
1792 for (size_t j = m_element_stack.size(); j--; ) {
1793 auto starting_tag = m_element_stack[j];
1794 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1795 if (starting_tag->code == e->code ||
1796 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size()))
1797 {
1798 e->start = starting_tag;
1799 e->parent = starting_tag->parent;
1800 starting_tag->end = e.get();
1801 m_element_stack.resize(j);
1802 break;
1803 }
1804 }
1805
1806 m_sequences.push_back(std::move(e));
1807 break;
1808 }
1809 case stdex::parser::html_sequence_t::declaration:
// Entity declaration: attribute 0 is "entity", 1 is "%", 2 holds the name and
// 3 the value, unless 3 is SYSTEM or PUBLIC.
1810 if (m_tag.attributes.size() > 3 &&
1811 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX))
1812 {
1813 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1814 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1815 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1816 {
1817 std::unique_ptr<entity<T, TR, AX>> e(new entity<T, TR, AX>());
1818 e->name = m_tag.attributes[2].name;
1819 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1820 m_entities.push_back(std::move(e));
1821 }
1822
1823 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1824 }
1825 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1826 break;
1827 case stdex::parser::html_sequence_t::comment:
1828 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1829 break;
1830 case stdex::parser::html_sequence_t::instruction:
1831 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1832 break;
1833 default:
1834 throw std::invalid_argument("unknown tag type");
1835 }
1836
1837 continue;
1838 }
1839
1840 next_char:
1841 if (m_any_char.match(source, i, num_chars)) {
1842 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1843 i = m_any_char.interval.end;
1844 }
1845 else
1846 break;
1847 }
1848 }
1849
1854 {
// NOTE(review): the signature line is not present here; assign() calls
// finalize() after append(), which presumably is this function - confirm.
// Commits the remaining uncommitted text as one text sequence, marks the
// whole source as parsed and empties the element stack.
1855 size_t i = m_source.size();
1856 if (m_num_parsed < i)
1857 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1858 m_num_parsed = i;
1859 m_element_stack.clear();
1860 }
1861
// Replaces the document content: clears the current state, appends and parses
// the given source, then finalizes the document.
//
// source     Source text
// num_chars  Number of characters in source
1865 void assign(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars)
1866 {
1867 clear();
1868 append(source, num_chars);
1869 finalize();
1870 }
1871
// Returns the accumulated source text. The positions stored in sequences are
// used by append() as offsets into this text.
1875 const std::basic_string<T, TR, AX>& source() const { return m_source; }
1876
1877 friend class parser<T, TR, AX>;
1878
1879 protected:
1884 {
// NOTE(review): the signature line is not present here; presumably this is
// the body of active_element() - confirm.
// Returns the top of the element stack, or nullptr when the stack is empty.
1885 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1886 }
1887
1891 std::basic_string<T, TR, AX> replace_entities(_In_reads_or_z_opt_(num_chars) const T* input, _In_ size_t num_chars) const
1892 {
1893 _Assume_(input || !num_chars);
1894 const size_t num_entities = m_entities.size();
1895 const T* source = m_source.data();
1896 std::basic_string<T, TR, AX> output;
1897 for (size_t i = 0; i < num_chars && input[i];) {
1898 if (input[i] == '%') {
1899 for (size_t j = 0; j < num_entities; j++) {
1900 auto& e = m_entities[j];
1901 size_t entity_size = e->name.size();
1902 if (i + entity_size + 1 < num_chars &&
1903 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1904 input[i + entity_size + 1] == ';')
1905 {
1906 output += e->value;
1907 i += entity_size + 2;
1908 goto next_char;
1909 }
1910 }
1911 throw std::runtime_error("undefined entity");
1912 }
1913 output += input[i++];
1914 next_char:;
1915 }
1916 return output;
1917 }
1918
1919 protected:
// Accumulated source text; append() adds to it.
1920 std::basic_string<T, TR, AX> m_source;
// Document charset; starts as charset_id::system and append() updates it from
// a <meta http-equiv="Content-Type"> element.
1922 stdex::charset_id m_charset;
1923
1924 // Declaration parsing data
// Entities collected from entity declarations; used by replace_entities().
1932 std::vector<std::unique_ptr<entity<T, TR, AX>>> m_entities;
1933
1934 // Element parsing data
// Parsed sequences, in the order append()/finalize() produced them.
1936 sequence_store m_sequences;
// Started elements that are not closed yet; the back is the active element.
1937 std::vector<element_start*> m_element_stack;
1939 };
1940
// Token types.
1944 enum class token_t {
1945 root = 0, // Default type of the token base class
1946 complete, // Default type of text_token
1947 starting, // Type used by starting_token
1948 ending, // Type of the "</name>" text tokens created by parser::end_tokens()
1949 url, // Type used by url_token
1950 };
1951
// Upper bound on the size of a token tag in characters, including the zero
// terminator.
1955 constexpr size_t token_tag_max =
1956 sizeof(void*) * 2 // Memory address in hexadecimal
1957 + 2 // Leading and trailing delimiter (token_tag_start and token_tag_end)
1958 + 1; // Zero terminator
1959
// Character that opens a token tag (see token::append_tag and token::parse_tag).
1964 constexpr char token_tag_start = '\x12';
1965
// Character that closes a token tag.
1970 constexpr char token_tag_end = '\x13';
1971
1975 class token
1976 {
1977 protected:
// Constructs a token: stores the type, the associated sequence (may be
// nullptr) and the data value. The constructor sits in the protected section
// of the class.
1978 token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1979 type(_type),
1980 sequence(_sequence),
1981 data(_data)
1982 {}
1983
1984 template<class T, class TR, class AX>
1985 friend class parser;
1986
1987 public:
1988 virtual ~token() {} // make polymorphic
1989
// Appends this token's tag to a char string. The tag is token_tag_start, the
// address of this token in hexadecimal, then token_tag_end.
//
// str  String to append the tag to
//
// Returns the number of characters appended.
1997 template<class TR = std::char_traits<char>, class AX = std::allocator<char>>
1998 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str) const
1999 {
2000 size_t n = str.size();
2001 // Use %X instead of %p to omit leading zeros and save space.
2002 stdex::appendf(str, "%c%zX%c", stdex::locale_C, token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
2003 return str.size() - n;
2004 }
2005
// Appends this token's tag to a wchar_t string. The tag is token_tag_start,
// the address of this token in hexadecimal, then token_tag_end.
//
// str  String to append the tag to
//
// Returns the value returned by stdex::appendf.
// NOTE(review): presumably that is the number of characters appended; the
// char overload computes the count from the string size instead - confirm.
2013 template<class TR = std::char_traits<wchar_t>, class AX = std::allocator<wchar_t>>
2014 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str) const
2015 {
2016 // Use %X instead of %p to omit leading zeros and save space.
2017 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C, static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
2018 }
2019
2020 template<class T>
2021 static token* parse_tag(const T* str, size_t& offset)
2022 {
2023 if (str[offset] != static_cast<T>(token_tag_start))
2024 return nullptr;
2025
2026 // Locate tag end.
2027 size_t end;
2028 for (end = offset + 1; ; end++) {
2029 if (!str[end])
2030 return nullptr;
2031 if (str[end] == token_tag_end)
2032 break;
2033 }
2034
2035 // Parse hexadecimal token memory address.
2036 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
2037 if (!t)
2038 throw std::invalid_argument("null token");
2039 offset = end + 1;
2040 return t;
2041 }
2042
2043 public:
// Token type.
2044 token_t type;
// Value supplied to the constructor; its meaning is not defined by this class.
2046 uintptr_t data;
2047 };
2048
// Vector owning tokens through std::unique_ptr (type of parser::tokens()).
2049 using token_vector = std::vector<std::unique_ptr<token>>;
// List of non-owning token pointers.
2050 using token_list = std::list<token*>;
2051
// Bit flags combined in text_token::text_type.
2055 enum text_type_flag_t : uint32_t {
2056 has_tokens = 1 << 0, // Token text contains token tags; parser::link() resolves them
2057 has_text = 1 << 1, // Token text contains text that parser::link() escapes with escape_min
2058 is_title = 1 << 2,
2059 is_bullet = 1 << 3,
2060 };
2061
// Token carrying text.
2065 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2066 class text_token : public token
2067 {
2068 protected:
// Constructs the token; text is initialized from the first num_chars
// characters of _text and text_type from _text_type. The constructor is
// protected; parser<T, TR, AX> is declared a friend below.
2069 text_token(
2070 _In_ token_t type = token_t::complete,
2071 _In_reads_or_z_opt_(num_chars) const T* _text = nullptr, _In_ size_t num_chars = 0,
2072 _In_ uint32_t _text_type = 0,
2073 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2075 text(_text, num_chars),
2076 text_type(_text_type)
2077 {}
2078
2079 friend class parser<T, TR, AX>;
2080
2081 public:
// Token text; may contain tags of other tokens when text_type has has_tokens set.
2082 std::basic_string<T, TR, AX> text;
// Combination of text_type_flag_t flags.
2083 uint32_t text_type;
// Mapping entries recorded by the parser while it fills `text`.
2084 stdex::mapping_vector<size_t> mapping;
2085 };
2086
// Text token of type token_t::starting; adds an element name and an end
// sequence to text_token.
2090 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2091 class starting_token : public text_token<T, TR, AX>
2092 {
2093 protected:
// Constructor parameters: token text and its length, element name and its
// length, text type flags, the associated sequence, the end sequence and the
// data value. The base is always constructed with token_t::starting.
2095 _In_reads_or_z_opt_(num_chars_text) const T* _text = nullptr, _In_ size_t num_chars_text = 0,
2096 _In_reads_or_z_opt_(num_chars_name) const T* _name = nullptr, _In_ size_t num_chars_name = 0,
2097 _In_ uint32_t text_type = 0,
2098 _In_opt_ stdex::html::sequence* sequence = nullptr,
2099 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
2100 _In_ uintptr_t data = 0) :
2101 text_token(token_t::starting, _text, num_chars_text, text_type, sequence, data),
2102 name(_name, num_chars_name),
2103 end_sequence(_end_sequence)
2104 {}
2105
2106 friend class parser<T, TR, AX>;
2107
2108 public:
// Element name; parser::end_tokens() uses it to build the "</name>" text.
2109 std::basic_string<T, TR, AX> name;
2111 };
2112
// Encoding of the URL stored in a url_token; parser::link() uses it to pick
// how the URL is written back.
2116 enum class token_url_t {
2117 plain = 0, // URL is not using any particular encoding scheme (as-is)
2118 sgml, // URL is encoded using SGML entities
2119 css, // URL is encoded using CSS escaping scheme
2120 };
2121
// Token of type token_t::url carrying a URL and its encoding.
2125 template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
2126 class url_token : public token
2127 {
2128 protected:
// Constructs the token; url is initialized from the first num_chars
// characters of _url. The constructor is protected; parser<T, TR, AX> is
// declared a friend below.
2129 url_token(
2130 _In_reads_or_z_opt_(num_chars) const T* _url = nullptr, _In_ size_t num_chars = 0,
2131 token_url_t _encoding = token_url_t::plain,
2132 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
2133 token(token_t::url, sequence, data),
2134 url(_url, num_chars),
2135 encoding(_encoding)
2136 {}
2137
2138 friend class parser<T, TR, AX>;
2139
2140 public:
// The URL.
2141 std::basic_string<T, TR, AX> url;
// How the URL has to be encoded when parser::link() writes it back.
2142 token_url_t encoding;
2143 };
2144
// Tokens passed to parser::end_tokens()/start_tokens() as the new active set
// before this token's tag is written (see parser::append_inserted_tokens()).
2150 std::list<stdex::html::token*> active_tokens;
// Word index compared by append_inserted_tokens() with its word_index argument.
2151 size_t word_index;
2153 };
2154
// List of inserted tokens, as taken by parser::append_inserted_tokens().
2155 using inserted_token_list = std::list<inserted_token>;
2156
2157 template<class T, class TR, class AX>
2159 {
2160 public:
// Constructs a parser for a document.
//
// document      Document to parse
// url           URL text; stored in m_url with length stdex::strnlen(url, num_chars)
// num_chars     Number of characters in url
// parse_frames  Stored in m_parse_frames; when false, parsing throws on a
//               <frameset> element
// progress      Optional progress indicator
2161 parser(
2162 _In_ const document<T, TR, AX>& document,
2163 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
2164 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
2166 m_url(url, stdex::strnlen(url, num_chars)),
2167 m_parse_frames(parse_frames),
2169 m_source(nullptr)
2170 {}
2171
2176 {
// NOTE(review): the signature line is not present here; this body starts a
// parse of the whole document and forwards to parse(end, text_type).
// No tokens may exist yet.
2177 _Assume_(m_tokens.empty());
2178
// The progress range is the length of the document source.
2179 if (m_progress) {
2180 m_progress->set_range(0, m_document.source().size());
2181 m_progress->set(0);
2182 }
2183
2184 m_source = m_document.source().data();
// Parse up to the end of the document's sequences.
2186 return parse(m_document.m_sequences.end());
2187 }
2188
2195 static void link(_Inout_ std::basic_string<T, TR, AX>& source, _In_ const text_token<T, TR, AX>* t)
2196 {
2197 _Assume_(t);
2198 _Assume_(
2199 t->type == token_t::complete ||
2200 t->type == token_t::starting ||
2201 t->type == token_t::ending ||
2202 t->type == token_t::root);
2203
2204 if (t->text_type & has_tokens) {
2205 const T* root = t->text.data();
2206 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2207 _Assume_(root[i] != token_tag_end);
2208 const token* t2 = token::parse_tag(root, i);
2209 if (t2) {
2210 switch (t2->type) {
2211 case token_t::complete:
2212 case token_t::starting:
2213 case token_t::ending:
2214 case token_t::root:
2215 link(source, dynamic_cast<const text_token<T, TR, AX>*>(t2));
2216 break;
2217 case token_t::url: {
2218 auto t2_url = dynamic_cast<const url_token<T, TR, AX>*>(t2);
2219 switch (t2_url->encoding) {
2220 case token_url_t::plain:
2221 source += t2_url->url;
2222 break;
2223 case token_url_t::sgml:
2224 escape(source, t2_url->url.data(), t2_url->url.size());
2225 break;
2226 case token_url_t::css:
2227 css_escape(source, t2_url->url.data(), t2_url->url.size());
2228 break;
2229 default:
2230 throw std::invalid_argument("unsupported URL encoding");
2231 }
2232 break;
2233 }
2234 default:
2235 throw std::invalid_argument("unsupported token type");
2236 }
2237 }
2238 else if (t->text_type & has_text) {
2239 escape_min(source, root[i]);
2240 i++;
2241 }
2242 else
2243 source += root[i++];
2244 }
2245 }
2246 else if (t->text_type & has_text) {
2247 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2248 escape_min(source, t->text.data(), t->text.size());
2249 }
2250 else
2251 source += t->text;
2252 }
2253
2262 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2263 {
2264 for (; from != new_tokens.cend(); ++from) {
2265 auto t = *from;
2266 t->append_tag(source);
2267 active_tokens.push_back(t);
2268 }
2269 }
2270
// Ends the active tokens that are no longer wanted.
//
// active_tokens and new_tokens are compared from the front. From the first
// position where they differ (or where new_tokens runs out), the remaining
// active tokens are ended in reverse order: for each one an ending token with
// the text "</name>" is created, its tag is appended to source, and the token
// is removed from active_tokens.
//
// source         String to append the ending token tags to
// active_tokens  List of active tokens; ended tokens are erased
// new_tokens     List of tokens that should be active afterwards
//
// Returns the position in new_tokens of the first token that is not already
// active, i.e. where start_tokens() should continue.
2280 token_list::const_iterator end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2281 {
2282 // Skip matching tokens in active_tokens and new_tokens.
2283 token_list::const_iterator i1, i2;
2284 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2285 if (i2 == new_tokens.cend() || *i1 != *i2) {
2286 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2287 // End tokens not relevant anymore in reverse order of starting.
2288 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
// Every active token is expected to be a starting_token; its name is needed below.
2289 auto t1 = dynamic_cast<starting_token<T, TR, AX>*>(*(--i));
2290 _Assume_(t1 && t1->type == token_t::starting);
2291
2292 std::unique_ptr<text_token<T, TR, AX>> t2(new text_token<T, TR, AX>(token_t::ending));
2293 t2->text.reserve(t1->name.size() + 3);
2294 t2->text += '<';
2295 t2->text += '/';
2296 t2->text += t1->name;
2297 t2->text += '>';
2298 append_token(std::move(t2), source);
2299
2300 // Pop the active token.
// i1 marks the first mismatching token: once it has been ended and erased we
// are done. i1 is invalid after this erase and must not be used again.
2301 if (i1 == i) {
2302 active_tokens.erase(i);
2303 break;
2304 }
2305 active_tokens.erase(i);
2306 i = active_tokens.cend();
2307 }
2308 break;
2309 }
2310 }
2311 return i2;
2312 }
2313
2323 void append_inserted_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ inserted_token_list& inserted_tokens,
2324 _In_ size_t word_index, _In_ bool after_word,
2325 _Inout_ token_list& active_tokens)
2326 {
2327 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2328 auto& t = *i;
2329 _Assume_(t.token);
2330 if (t.word_index == word_index && t.after_word == after_word) {
2331 if (t.token->type != token_t::ending)
2332 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2333 t.token->append_tag(source);
2334 inserted_tokens.erase(i++);
2335 }
2336 else
2337 ++i;
2338 }
2339 }
2340
2347 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2348 {
2349 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2350 auto t2 = *i2;
2351 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2352 if (i1 == a.end()) {
2353 a.push_back(t2);
2354 break;
2355 }
2356 auto t1 = *i1;
2357 if (t1 == t2)
2358 break;
2359 }
2360 }
2361 }
2362
// Intended to convert a relative URL to an absolute one using m_url.
// Not implemented yet: rel is left unchanged in every case.
//
// rel  URL to convert
2366 void make_absolute_url(std::basic_string<T, TR, AX>& rel)
2367 {
2368 _Unreferenced_(rel);
2369
// Without a base URL there is nothing to resolve against.
2370 if (m_url.empty())
2371 return;
2372
2373 // TODO: Implement!
2374 }
2375
// Returns the tokens owned by this parser.
2379 const token_vector& tokens() const { return m_tokens; }
2380
2381 protected:
2389 template <class T_token>
2390 T_token* append_token(_Inout_ std::unique_ptr<T_token>&& token)
2391 {
2392 if (!token)
2393 return nullptr;
2394 auto t = token.get();
2395 m_tokens.push_back(std::move(token));
2396 return t;
2397 }
2398
2407 template <class T_token>
2408 size_t append_token(_Inout_ std::unique_ptr<T_token>&& token, _Inout_ std::basic_string<T, TR, AX>& source)
2409 {
2410 if (!token)
2411 return 0;
2412 size_t n = token->append_tag(source);
2413 m_tokens.push_back(std::move(token));
2414 return n;
2415 }
2416
2425 text_token<T, TR, AX>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2426 {
2428 std::unique_ptr<text_token<T, TR, AX>> token(new text_token<T, TR, AX>(
2429 token_t::complete,
2430 nullptr, 0,
2431 text_type,
2432 m_offset != end ? m_offset->get() : nullptr));
2433
2434 while (m_offset != end) {
2435 auto& s = *m_offset;
2436
2437 if (m_progress) {
2438 if (m_progress->cancel())
2439 throw stdex::user_cancelled();
2440 m_progress->set(s->interval.start);
2441 }
2442
2443 // No token_tag_start and token_tag_end chars, please.
2444 _Assume_(
2445 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_start)) == stdex::npos &&
2446 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<T>(token_tag_end)) == stdex::npos);
2447
2448 if (s->type == stdex::parser::html_sequence_t::text) {
2449 rel.from = s->interval.start;
2450 token->mapping.push_back(rel);
2451 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2452 rel.to = token->text.size();
2453 if (!(token->text_type & has_text) &&
2454 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2455 token->text_type |= has_text;
2456 ++m_offset;
2457 }
2458 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2459 const element* s_el = static_cast<const element*>(s.get());
2460 _Assume_(s_el);
2461 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2462 if (s_el->code == element_t::frameset && !m_parse_frames)
2463 throw std::invalid_argument("<frameset> detected");
2464
2465 {
2466 size_t offset = s->interval.start;
2467 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2468 new text_token<T, TR, AX>(token_t::complete, nullptr, 0, 0, s.get()) :
2469 new starting_token<T, TR, AX>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2470
2471 // Copy the tag contents, but mind any attributes containing localizable text.
2472 for (auto& a : s_el->attributes) {
2473 if (a.value.empty() ||
2474 stdex::isblank(m_source + a.value.start, a.value.size()))
2475 continue;
2476
2477 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
2478 t->text.append(m_source + offset, a.value.start - offset);
2479 std::unique_ptr<url_token<T, TR, AX>> t_url(new url_token<T, TR, AX>(
2480 nullptr, 0,
2481 token_url_t::sgml,
2482 s.get()));
2483 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2484 append_token(std::move(t_url), t->text);
2485 t->text_type |= has_tokens;
2486 offset = a.value.end;
2487 }
2488 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
2489 t->text.append(m_source + offset, a.value.start - offset);
2490 std::unique_ptr<text_token<T, TR, AX>> t_value(new text_token<T, TR, AX>(
2491 token_t::complete,
2492 nullptr, 0,
2493 has_text | is_title,
2494 s.get()));
2495 stdex::mapping<size_t> rel_value(a.value.start, 0);
2496 t_value->mapping.push_back(rel_value);
2497 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2498 append_token(std::move(t_value), t->text);
2499 t->text_type |= has_tokens;
2500 offset = a.value.end;
2501 }
2502 }
2503
2504 t->text.append(m_source + offset, s->interval.end - offset);
2505 rel.from = s->interval.start;
2506 token->mapping.push_back(rel);
2507 rel.to += append_token(std::move(t), token->text);
2508 token->text_type |= has_tokens;
2509 }
2510 ++m_offset;
2511
2512 if (s_el_start) {
2513 if (s_el_start->code == element_t::address ||
2514 s_el_start->code == element_t::code ||
2515 s_el_start->code == element_t::comment ||
2516 s_el_start->code == element_t::cite ||
2517 s_el_start->code == element_t::kbd ||
2518 s_el_start->code == element_t::samp ||
2519 s_el_start->code == element_t::script ||
2520 s_el_start->code == element_t::style)
2521 {
2522 // Non-localizable
2523 auto s_end = s_el_start->end;
2524 _Assume_(s_end);
2525
2526 if (s->interval.end < s_end->interval.start) {
2527 if (s_el_start->code != element_t::style) {
2528 rel.from = s->interval.start;
2529 token->mapping.push_back(rel);
2530 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2532 token_t::complete,
2533 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2534 0,
2535 m_offset->get()))),
2536 token->text);
2537 }
2538 else {
2539 // Partially parse CSS. It may contain URLs we need to make absolute.
2540 auto t = parse_css(s->interval.end, s_end->interval.start);
2541 _Assume_(t);
2542 rel.from = s->interval.start;
2543 token->mapping.push_back(rel);
2544 rel.to += t->append_tag(token->text);
2545 }
2546 token->text_type |= has_tokens;
2547 }
2548 while (m_offset != end && m_offset->get() != s_end)
2549 ++m_offset;
2550 }
2551 else if (element_traits::is_group(s_el_start->code)) {
2552 auto limit = m_offset;
2553 while (limit != end && limit->get() != s_el_start->end)
2554 ++limit;
2555 auto t = parse(limit,
2556 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2557 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2558 rel.from = s->interval.start;
2559 token->mapping.push_back(rel);
2560 rel.to += t->append_tag(token->text);
2561 token->text_type |= has_tokens;
2562 }
2563 }
2564 }
2565 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2566 rel.from = s->interval.start;
2567 token->mapping.push_back(rel);
2568 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2570 token_t::ending,
2571 m_source + s->interval.start, s->interval.size(),
2572 0,
2573 s.get()))),
2574 token->text);
2575 token->text_type |= has_tokens;
2576 ++m_offset;
2577 }
2578 else {
2579 // Declaration, instruction, (P)CDATA section, comment...
2580 rel.from = s->interval.start;
2581 token->mapping.push_back(rel);
2582 rel.to += append_token(std::move(std::unique_ptr<text_token<T, TR, AX>>(
2584 token_t::complete,
2585 m_source + s->interval.start, s->interval.size(),
2586 0,
2587 s.get()))),
2588 token->text);
2589 token->text_type |= has_tokens;
2590 ++m_offset;
2591 }
2592 }
2593
2594 return append_token(std::move(token));
2595 }
2596
2600 text_token<T, TR, AX>* parse_css(size_t start, size_t end)
2601 {
2602 stdex::interval<size_t> section, content;
2603 std::unique_ptr<text_token<T, TR, AX>> token(
2605 token_t::complete,
2606 nullptr, 0,
2607 0,
2608 m_offset->get()));
2609
2610 for (;;) {
2611 if (m_css_comment.match(m_source, start, end)) {
2612 token->text.append(m_source + start, m_css_comment.interval.end - start);
2613 start = m_css_comment.interval.end;
2614 }
2615 else if (m_css_cdo.match(m_source, start, end)) {
2616 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2617 start = m_css_cdo.interval.end;
2618 }
2619 else if (m_css_cdc.match(m_source, start, end)) {
2620 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2621 start = m_css_cdc.interval.end;
2622 }
2623 else if (
2624 m_css_import.match(m_source, start, end) && (section = m_css_import.interval, content = m_css_import.content, true) ||
2625 m_css_uri.match(m_source, start, end) && (section = m_css_uri.interval, content = m_css_uri.content, true))
2626 {
2627 std::unique_ptr<url_token<T, TR, AX>> t_url(
2629 nullptr, 0,
2630 token_url_t::css,
2631 m_offset->get()));
2632 css_unescape(t_url->url, m_source + content.start, content.size());
2633 token->text.append(m_source + start, content.start - start);
2634 append_token(std::move(t_url), token->text);
2635 token->text.append(m_source + content.end, section.end - content.end);
2636 token->text_type |= has_tokens;
2637 start = section.end;
2638 }
2639 else if (m_any_char.match(m_source, start, end)) {
2640 token->text.append(m_source + start, m_any_char.interval.end - start);
2641 start = m_any_char.interval.end;
2642 }
2643 else
2644 break;
2645 }
2646
2647 return append_token(std::move(token));
2648 }
2649
2650 protected:
2652 const stdex::sstring m_url;
2653 const bool m_parse_frames;
2655 const T* m_source;
2656 token_vector m_tokens;
2657 sequence_store::const_iterator m_offset;
2658
2659 // For detecting URLs in CSS
2667 };
2668 }
2669}
HTML comment.
Definition html.hpp:1553
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1562
HTML declaration.
Definition html.hpp:1535
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1545
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1546
HTML document.
Definition html.hpp:1602
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1928
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1875
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1642
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1925
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1926
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1927
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1922
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1936
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1883
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1921
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1937
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1853
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1891
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1865
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1938
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1932
void clear()
Empties document.
Definition html.hpp:1621
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1920
Ending tag of an HTML element </...>
Definition html.hpp:1515
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1527
element_start * start
Corresponding starting tag.
Definition html.hpp:1528
element_t code
Element code.
Definition html.hpp:1526
Starting tag of an HTML element <...>
Definition html.hpp:1499
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1508
HTML element <.../>
Definition html.hpp:1320
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1489
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1490
element_t code
Element code.
Definition html.hpp:1488
HTML instruction.
Definition html.hpp:1569
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1578
HTML parser.
Definition html.hpp:2159
token_vector m_tokens
HTML token storage.
Definition html.hpp:2656
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2323
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2425
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2652
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2175
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2651
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2280
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a creating a union.
Definition html.hpp:2347
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2600
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2262
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2195
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2390
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2657
const T * m_source
HTML source code.
Definition html.hpp:2655
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2654
const bool m_parse_frames
Parse frames.
Definition html.hpp:2653
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2366
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2408
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2379
Base class for HTML sequences.
Definition html.hpp:1301
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1304
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1303
sequence * parent
Parent sequence.
Definition html.hpp:1305
Token representing start HTML tag.
Definition html.hpp:2092
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2110
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2109
Token representing part of HTML text.
Definition html.hpp:2067
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2084
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2083
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2082
HTML token base class.
Definition html.hpp:1976
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2045
uintptr_t data
Any user-supplied data.
Definition html.hpp:2046
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2014
token_t type
Token type.
Definition html.hpp:2044
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:1998
HTML token representing a URL.
Definition html.hpp:2127
token_url_t encoding
URL encoding.
Definition html.hpp:2142
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2141
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:115
Test for given string.
Definition parser.hpp:830
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:685
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1090
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1004
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:922
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1014
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:810
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:973
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1031
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:941
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1220
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:958
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1275
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:856
static bool is_pre_exclusion(element_t code)
May element be a part of <PRE></PRE>?
Definition html.hpp:1049
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:907
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1074
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:889
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:832
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1120
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:691
HTML entity.
Definition html.hpp:1586
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1588
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1587
Inserted HTML token.
Definition html.hpp:2148
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2152
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2150
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:2151
token * token
Points to the token.
Definition html.hpp:2149
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
Tag attribute.
Definition parser.hpp:8038
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8040