stdex
Additional custom algorithms and utilities not covered by the C++ Standard Library
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "exception.hpp"
10#include "interval.hpp"
11#include "mapping.hpp"
12#include "parser.hpp"
13#include "progress.hpp"
14#include "sgml.hpp"
15#include "string.hpp"
16#include "system.hpp"
17#include "unicode.hpp"
18#include <exception>
19#include <list>
20#include <map>
21#include <memory>
22#include <stdexcept>
23#include <vector>
24
25#ifdef _WIN32
26#undef small
27#endif
28
29namespace stdex
30{
31 namespace html
32 {
/// Appends an HTML-escaped copy of a narrow string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<` and `>` are written as `&amp;`, `&semi;`, `&quot;`, `&#x27;`, `&lt;` and `&gt;`.
/// A character comparing equal to 0x00a0 is written as `&nbsp;`. Everything else is copied verbatim.
///
/// NOTE(review): where `char` is signed, no `char` value compares equal to 0x00a0, so the
/// `&nbsp;` branch cannot be taken on such platforms - confirm the intent.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char ch = src[pos];
        if (!ch)
            break;

        // Pick the replacement entity, if the character has one.
        const char* entity = nullptr;
        switch (ch) {
        case '&': entity = "&amp;"; break;
        case ';': entity = "&semi;"; break;
        case '\"': entity = "&quot;"; break;
        case '\'': entity = "&#x27;"; break;
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        case 0x00a0: entity = "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }

        if (entity)
            dst.append(entity);
        else
            dst.push_back(ch);
    }
}
59
/// Appends an HTML-escaped copy of a wide string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 (no-break space) are written as `&amp;`, `&semi;`,
/// `&quot;`, `&#x27;`, `&lt;`, `&gt;` and `&nbsp;`. Everything else is copied verbatim.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t ch = src[pos];
        if (!ch)
            break;

        // Pick the replacement entity, if the character has one.
        const wchar_t* entity = nullptr;
        switch (ch) {
        case L'&': entity = L"&amp;"; break;
        case L';': entity = L"&semi;"; break;
        case L'\"': entity = L"&quot;"; break;
        case L'\'': entity = L"&#x27;"; break;
        case L'<': entity = L"&lt;"; break;
        case L'>': entity = L"&gt;"; break;
        case L'\u00a0': entity = L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        default: break;
        }

        if (entity)
            dst.append(entity);
        else
            dst.push_back(ch);
    }
}
86
/// Appends a single narrow character to `dst`, escaping only what HTML text strictly requires.
///
/// `&`, `<` and `>` become `&amp;`, `&lt;` and `&gt;`; a character comparing equal to 0x00a0
/// becomes `&nbsp;`; anything else is appended unchanged.
///
/// NOTE(review): where `char` is signed, no `char` value compares equal to 0x00a0 - confirm the intent.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to escape
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(_Inout_ std::basic_string<char, _Traits, _Alloc>& dst, _In_ char chr)
{
    if (chr == '&')
        dst.append("&amp;");
    else if (chr == '<')
        dst.append("&lt;");
    else if (chr == '>')
        dst.append("&gt;");
    else if (chr == 0x00a0)
        dst.append("&nbsp;"); // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    else
        dst.push_back(chr);
}
104
/// Appends a single wide character to `dst`, escaping only what HTML text strictly requires.
///
/// `&`, `<`, `>` and U+00A0 become `&amp;`, `&lt;`, `&gt;` and `&nbsp;`; anything else is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to escape
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst, _In_ wchar_t chr)
{
    if (chr == L'&')
        dst.append(L"&amp;");
    else if (chr == L'<')
        dst.append(L"&lt;");
    else if (chr == L'>')
        dst.append(L"&gt;");
    else if (chr == L'\u00a0')
        dst.append(L"&nbsp;"); // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
    else
        dst.push_back(chr);
}
122
/// Appends a minimally HTML-escaped copy of a narrow string to `dst`.
///
/// Only `&`, `<`, `>` and a character comparing equal to 0x00a0 are replaced
/// (`&amp;`, `&lt;`, `&gt;`, `&nbsp;`); quotes and semicolons are copied as they are.
///
/// NOTE(review): where `char` is signed, no `char` value compares equal to 0x00a0 - confirm the intent.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char ch = src[pos];
        if (!ch)
            break;

        if (ch == '&')
            dst.append("&amp;");
        else if (ch == '<')
            dst.append("&lt;");
        else if (ch == '>')
            dst.append("&gt;");
        else if (ch == 0x00a0)
            dst.append("&nbsp;"); // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        else
            dst.push_back(ch);
    }
}
146
/// Appends a minimally HTML-escaped copy of a wide string to `dst`.
///
/// Only `&`, `<`, `>` and U+00A0 are replaced (`&amp;`, `&lt;`, `&gt;`, `&nbsp;`);
/// quotes and semicolons are copied as they are.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t ch = src[pos];
        if (!ch)
            break;

        if (ch == L'&')
            dst.append(L"&amp;");
        else if (ch == L'<')
            dst.append(L"&lt;");
        else if (ch == L'>')
            dst.append(L"&gt;");
        else if (ch == L'\u00a0')
            dst.append(L"&nbsp;"); // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
        else
            dst.push_back(ch);
    }
}
170
/// Appends the URL-decoded form of `src` to `dst`.
///
/// `+` is decoded as a space and `%XX` (two hexadecimal digits, either case) as the byte with
/// that value. A `%` that is not followed by two hexadecimal digits is copied literally together
/// with whatever followed it.
///
/// The two characters after `%` are only examined while they lie inside the first `num_chars`
/// characters, so a counted (non zero-terminated) buffer ending in `%` or `%X` is never read
/// past its end.
///
/// \param[in,out] dst        String the decoded text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);

    // Value of a hexadecimal digit, or -1 when `c` is not one (this includes the zero terminator).
    const auto hex_value = [](char c) -> int {
        if ('0' <= c && c <= '9') return c - '0';
        if ('A' <= c && c <= 'F') return c - 'A' + 10;
        if ('a' <= c && c <= 'f') return c - 'a' + 10;
        return -1;
    };

    for (size_t i = 0; i < num_chars && src[i];) {
        switch (src[i]) {
        case '+':
            dst += ' '; i++;
            break;

        case '%': {
            i++;

            // High nibble: must exist within the counted range and be a hex digit.
            const int hi = i < num_chars ? hex_value(src[i]) : -1;
            if (hi < 0) { dst += '%'; continue; }
            i++;

            // Low nibble: same rule; on failure emit the `%` and the digit already consumed.
            const int lo = i < num_chars ? hex_value(src[i]) : -1;
            if (lo < 0) { dst += '%'; dst += src[i - 1]; continue; }
            i++;

            dst += static_cast<char>(static_cast<uint8_t>((hi << 4) | lo));
            break;
        }

        default:
            dst += src[i++];
        }
    }
}
212
/// Appends the URL-encoded form of `src` to `dst`.
///
/// A space is written as `+`. The characters `< > # % { } | \ ^ ~ [ ] ` ; / ? : @ = & $` and every
/// byte outside the range 0x21..0x7E are written as `%XX` with upper-case hexadecimal digits.
/// All remaining characters are copied unchanged.
///
/// \param[in,out] dst        String the encoded text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    const char* const hex_digits = "0123456789ABCDEF";

    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char ch = src[pos];
        if (!ch)
            break;
        if (ch == ' ') {
            dst += "+";
            continue;
        }

        const uint8_t octet = static_cast<uint8_t>(ch);

        // Anything that is not a printable ASCII character gets percent-encoded...
        bool encode = !(0x20 < octet && octet < 0x7f);

        // ...and so do the printable characters listed here.
        switch (ch) {
        case '<': case '>': case '#': case '%':
        case '{': case '}': case '|': case '\\':
        case '^': case '~': case '[': case ']':
        case '`': case ';': case '/': case '?':
        case ':': case '@': case '=': case '&':
        case '$':
            encode = true;
            break;
        default:
            break;
        }

        if (encode) {
            dst += '%';
            dst += hex_digits[(octet & 0xf0) >> 4];
            dst += hex_digits[octet & 0x0f];
        }
        else
            dst += ch;
    }
}
263
/// Appends the CSS-unescaped form of `src` to `dst`.
///
/// Recognised escapes:
/// - `\n`, `\r`, `\t` produce line feed, carriage return and tab;
/// - a backslash directly followed by a line feed is a line continuation and produces nothing;
/// - a backslash followed by one to six hexadecimal digits produces the character with that code;
///   one space after the digits is consumed as the escape terminator;
/// - a backslash followed by any other character produces that character.
///
/// A backslash that is the very last character of the input (either by `num_chars` or because
/// the zero terminator follows) has nothing to escape; it is dropped and decoding stops there.
///
/// \param[in,out] dst        String the decoded text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_unescape(
    _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const _Elem* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t i = 0; i < num_chars && src[i];) {
        if (src[i] != '\\') {
            dst += src[i++];
            continue;
        }

        // Step over the backslash.
        i++;

        // Dangling backslash: without this check the loop would either never advance
        // (counted input) or append the terminator and run past it (zero-terminated input).
        if (i >= num_chars || !src[i])
            break;

        switch (src[i]) {
        // Classic escapes
        case 'n': dst += '\n'; i++; break;
        case 'r': dst += '\r'; i++; break;
        case 't': dst += '\t'; i++; break;

        // `\` at the end of the line
        case '\n': i++; break;

        // `\nnnn` escape
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case 'A': case 'a':
        case 'B': case 'b':
        case 'C': case 'c':
        case 'D': case 'd':
        case 'E': case 'e':
        case 'F': case 'f': {
            wchar_t chr = 0;
            // At most six hexadecimal digits, never beyond the counted input.
            const size_t end = num_chars - i < 6 ? num_chars : i + 6;

            for (; i < end; ++i) {
                if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
                else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
                else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
                else break;
            }

            dst += static_cast<_Elem>(chr);

            // Skip the space that terminates the escape. It is consumed after a full
            // six-digit escape as well, hence the check against `num_chars`, not `end`.
            if (i < num_chars && src[i] == ' ')
                i++;
            break;
        }

        default: dst += src[i++];
        }
    }
}
333
/// Appends the CSS-escaped form of a narrow string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and single quote are written as
/// `\\`, `\n`, `\r`, `\t`, `\"` and `\'`. All other characters are copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void css_escape(
    _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const char ch = src[pos];
        if (!ch)
            break;

        // Character to put after the backslash, or zero when no escaping is needed.
        char escaped = 0;
        switch (ch) {
        case '\\': escaped = '\\'; break;
        case '\n': escaped = 'n'; break;
        case '\r': escaped = 'r'; break;
        case '\t': escaped = 't'; break;
        case '\"': escaped = '\"'; break;
        case '\'': escaped = '\''; break;
        default: break;
        }

        if (escaped) {
            dst.push_back('\\');
            dst.push_back(escaped);
        }
        else
            dst.push_back(ch);
    }
}
359
/// Appends the CSS-escaped form of a wide string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and single quote are written as
/// `\\`, `\n`, `\r`, `\t`, `\"` and `\'`. All other characters are copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is zero
/// \param[in]     num_chars  Maximum number of characters to read; reading also stops at the first zero terminator
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void css_escape(
    _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
    _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
    _Assume_(src || !num_chars);
    for (size_t pos = 0; pos < num_chars; ++pos) {
        const wchar_t ch = src[pos];
        if (!ch)
            break;

        // Character to put after the backslash, or zero when no escaping is needed.
        wchar_t escaped = 0;
        switch (ch) {
        case L'\\': escaped = L'\\'; break;
        case L'\n': escaped = L'n'; break;
        case L'\r': escaped = L'r'; break;
        case L'\t': escaped = L't'; break;
        case L'\"': escaped = L'\"'; break;
        case L'\'': escaped = L'\''; break;
        default: break;
        }

        if (escaped) {
            dst.push_back(L'\\');
            dst.push_back(escaped);
        }
        else
            dst.push_back(ch);
    }
}
385
/// Codes identifying HTML elements.
///
/// Named elements are numbered consecutively in alphabetical order, starting with `a` = 1
/// and ending with `xmp`. Keep this order: other code indexes tables by these values.
enum class element_t {
    // Special codes
    unknown = -1,   ///< Element with a name that is not in the list below
    PCDATA = -2,    ///< Pseudo-element used for parsed character data
    CDATA = -3,     ///< Pseudo-element used for unparsed character data
    empty = 0,      ///< No element

    // Named elements (`a` gets value 1)
    a,
    abbr,
    acronym,
    address,
    applet,
    area,
    b,
    base,
    basefont,
    bdo,
    bgsound,        ///< Microsoft Specific
    big,
    blink,          ///< Microsoft Specific
    blockquote,
    body,
    br,
    button,
    caption,
    center,
    cite,
    code,
    col,
    colgroup,
    comment,        ///< Microsoft Specific
    dd,
    del,
    dfn,
    dir,
    div,
    dl,
    dt,
    em,
    embed,          ///< Microsoft Specific
    fieldset,
    font,
    form,
    frame,
    frameset,
    h1,
    h2,
    h3,
    h4,
    h5,
    h6,
    head,
    hr,
    html,
    i,
    iframe,
    img,
    input,
    ins,
    isindex,
    kbd,
    label,
    legend,
    li,
    link,
    listing,        ///< Microsoft Specific
    map,
    marquee,        ///< Microsoft Specific
    menu,
    meta,
    nextid,         ///< Microsoft Specific
    nobr,           ///< Microsoft Specific
    noembed,        ///< Microsoft Specific
    noframes,
    noscript,
    object,
    ol,
    optgroup,
    option,
    p,
    param,
    plaintext,      ///< Microsoft Specific
    pre,
    q,
    rt,             ///< Microsoft Specific
    ruby,           ///< Microsoft Specific
    s,
    samp,
    script,
    select,
    small,
    span,
    strike,
    strong,
    style,
    sub,
    sup,
    table,
    tbody,
    td,
    textarea,
    tfoot,
    th,
    thead,
    title,
    tr,
    tt,
    u,
    ul,
    var,
    wbr,            ///< Microsoft Specific
    xmp,            ///< Microsoft Specific
};
501
/// How an element is terminated in the document.
enum class element_span_t {
    needs_end = 0,      ///< Element requires an explicit end tag
    end_optional = 1,   ///< End tag may be omitted
    immediate = 2,      ///< Element ends right at its start tag
};
510
515 {
521 static inline element_span_t span(_In_ element_t code)
522 {
523 static element_span_t lookup[] = {
524 element_span_t::needs_end, // a
525 element_span_t::needs_end, // abbr
526 element_span_t::needs_end, // acronym
527 element_span_t::needs_end, // address
528 element_span_t::needs_end, // applet
529 element_span_t::immediate, // area
530 element_span_t::needs_end, // b
531 element_span_t::immediate, // base
532 element_span_t::immediate, // basefont
533 element_span_t::needs_end, // bdo
534 element_span_t::immediate, // bgsound
535 element_span_t::needs_end, // big
536 element_span_t::needs_end, // blink
537 element_span_t::needs_end, // blockquote
538 element_span_t::end_optional, // body
539 element_span_t::immediate, // br
540 element_span_t::needs_end, // button
541 element_span_t::needs_end, // caption
542 element_span_t::needs_end, // center
543 element_span_t::needs_end, // cite
544 element_span_t::needs_end, // code
545 element_span_t::immediate, // col
546 element_span_t::end_optional, // colgroup
547 element_span_t::needs_end, // comment
548 element_span_t::end_optional, // dd
549 element_span_t::needs_end, // del
550 element_span_t::needs_end, // dfn
551 element_span_t::needs_end, // dir
552 element_span_t::needs_end, // div
553 element_span_t::needs_end, // dl
554 element_span_t::end_optional, // dt
555 element_span_t::needs_end, // em
556 element_span_t::immediate, // embed
557 element_span_t::needs_end, // fieldset
558 element_span_t::needs_end, // font
559 element_span_t::needs_end, // form
560 element_span_t::immediate, // frame
561 element_span_t::needs_end, // frameset
562 element_span_t::needs_end, // h1
563 element_span_t::needs_end, // h2
564 element_span_t::needs_end, // h3
565 element_span_t::needs_end, // h4
566 element_span_t::needs_end, // h5
567 element_span_t::needs_end, // h6
568 element_span_t::end_optional, // head
569 element_span_t::immediate, // hr
570 element_span_t::end_optional, // html
571 element_span_t::needs_end, // i
572 element_span_t::needs_end, // iframe
573 element_span_t::immediate, // img
574 element_span_t::immediate, // input
575 element_span_t::needs_end, // ins
576 element_span_t::immediate, // isindex
577 element_span_t::needs_end, // kbd
578 element_span_t::needs_end, // label
579 element_span_t::needs_end, // legend
580 element_span_t::end_optional, // li
581 element_span_t::immediate, // link
582 element_span_t::needs_end, // listing
583 element_span_t::needs_end, // map
584 element_span_t::needs_end, // marquee
585 element_span_t::needs_end, // menu
586 element_span_t::immediate, // meta
587 element_span_t::immediate, // nextid
588 element_span_t::needs_end, // nobr
589 element_span_t::needs_end, // noembed
590 element_span_t::needs_end, // noframes
591 element_span_t::needs_end, // noscript
592 element_span_t::needs_end, // object
593 element_span_t::needs_end, // ol
594 element_span_t::needs_end, // optgroup
595 element_span_t::end_optional, // option
596 element_span_t::end_optional, // p
597 element_span_t::immediate, // param
598 element_span_t::end_optional, // plaintext
599 element_span_t::needs_end, // pre
600 element_span_t::needs_end, // q
601 element_span_t::immediate, // rt
602 element_span_t::needs_end, // ruby
603 element_span_t::needs_end, // s
604 element_span_t::needs_end, // samp
605 element_span_t::needs_end, // script
606 element_span_t::needs_end, // select
607 element_span_t::needs_end, // small
608 element_span_t::needs_end, // span
609 element_span_t::needs_end, // strike
610 element_span_t::needs_end, // strong
611 element_span_t::needs_end, // style
612 element_span_t::needs_end, // sub
613 element_span_t::needs_end, // sup
614 element_span_t::needs_end, // table
615 element_span_t::end_optional, // tbody
616 element_span_t::end_optional, // td
617 element_span_t::needs_end, // textarea
618 element_span_t::end_optional, // tfoot
619 element_span_t::end_optional, // th
620 element_span_t::end_optional, // thead
621 element_span_t::needs_end, // title
622 element_span_t::end_optional, // tr
623 element_span_t::needs_end, // tt
624 element_span_t::needs_end, // u
625 element_span_t::needs_end, // ul
626 element_span_t::needs_end, // var
627 element_span_t::immediate, // wbr
628 element_span_t::needs_end, // xmp
629 };
630 return element_t::a <= code && code <= element_t::xmp ?
631 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
632 element_span_t::needs_end;
633 }
634
640 static inline bool is_fontstyle(_In_ element_t code)
641 {
642 switch (code) {
643 case element_t::tt:
644 case element_t::i:
645 case element_t::b:
646 case element_t::u:
647 case element_t::s:
648 case element_t::strike:
649 case element_t::blink:
650 case element_t::big:
651 case element_t::small:
652 return true;
653 };
654 return false;
655 }
656
662 static inline bool is_phrase(_In_ element_t code)
663 {
664 switch (code) {
665 case element_t::em:
666 case element_t::strong:
667 case element_t::dfn:
668 case element_t::code:
669 case element_t::samp:
670 case element_t::kbd:
671 case element_t::var:
672 case element_t::cite:
673 case element_t::abbr:
674 case element_t::acronym:
675 case element_t::xmp:
676 return true;
677 };
678 return false;
679 }
680
686 static inline bool is_special(_In_ element_t code)
687 {
688 switch (code) {
689 case element_t::a:
690 case element_t::img:
691 case element_t::applet:
692 case element_t::object:
693 case element_t::embed:
694 case element_t::font:
695 case element_t::basefont:
696 case element_t::br:
697 case element_t::wbr:
698 case element_t::rt:
699 case element_t::script:
700 case element_t::map:
701 case element_t::q:
702 case element_t::sub:
703 case element_t::sup:
704 case element_t::ruby:
705 case element_t::span:
706 case element_t::bdo:
707 case element_t::iframe:
708 case element_t::nobr:
709 return true;
710 };
711 return false;
712 }
713
719 static inline bool is_formctrl(_In_ element_t code)
720 {
721 switch (code) {
722 case element_t::input:
723 case element_t::select:
724 case element_t::textarea:
725 case element_t::label:
726 case element_t::button:
727 return true;
728 };
729 return false;
730 }
731
737 static inline bool is_inline(_In_ element_t code)
738 {
739 return
740 code == element_t::PCDATA ||
741 is_fontstyle(code) ||
742 is_phrase(code) ||
743 is_special(code) ||
744 is_formctrl(code);
745 }
746
752 static inline bool is_heading(_In_ element_t code)
753 {
754 switch (code) {
755 case element_t::h1:
756 case element_t::h2:
757 case element_t::h3:
758 case element_t::h4:
759 case element_t::h5:
760 case element_t::h6:
761 return true;
762 };
763 return false;
764 }
765
771 static inline bool is_list(_In_ element_t code)
772 {
773 switch (code) {
774 case element_t::ul:
775 case element_t::ol:
776 case element_t::dir:
777 case element_t::menu:
778 return true;
779 };
780 return false;
781 }
782
788 static inline bool is_preformatted(_In_ element_t code)
789 {
790 switch (code) {
791 case element_t::pre:
792 case element_t::listing:
793 return true;
794 }
795 return false;
796 }
797
803 static inline bool is_block(_In_ element_t code)
804 {
805 if (is_heading(code) ||
806 is_list(code) ||
807 is_preformatted(code)) return true;
808 switch (code) {
809 case element_t::p:
810 case element_t::dl:
811 case element_t::div:
812 case element_t::center:
813 case element_t::marquee:
814 case element_t::noscript:
815 case element_t::noframes:
816 case element_t::noembed:
817 case element_t::blockquote:
818 case element_t::form:
819 case element_t::isindex:
820 case element_t::hr:
821 case element_t::table:
822 case element_t::fieldset:
823 case element_t::address:
824 return true;
825 };
826 return false;
827 }
828
834 static inline bool is_flow(_In_ element_t code)
835 {
836 return is_block(code) || is_inline(code);
837 }
838
844 static inline bool is_head_content(_In_ element_t code)
845 {
846 switch (code) {
847 case element_t::title:
848 case element_t::isindex:
849 case element_t::base:
850 case element_t::nextid:
851 return true;
852 };
853 return false;
854 }
855
861 static inline bool is_head_misc(_In_ element_t code)
862 {
863 switch (code) {
864 case element_t::script:
865 case element_t::style:
866 case element_t::meta:
867 case element_t::link:
868 case element_t::object:
869 return true;
870 };
871 return false;
872 }
873
879 static inline bool is_pre_exclusion(_In_ element_t code)
880 {
881 switch (code) {
882 case element_t::img:
883 case element_t::object:
884 case element_t::applet:
885 case element_t::embed:
886 case element_t::big:
887 case element_t::small:
888 case element_t::sub:
889 case element_t::sup:
890 case element_t::ruby:
891 case element_t::font:
892 case element_t::basefont:
893 case element_t::nobr:
894 return true;
895 };
896 return false;
897 }
898
904 static inline bool is_html_content(_In_ element_t code)
905 {
906 switch (code) {
907 case element_t::head:
908 case element_t::body:
909 case element_t::frameset:
910 return true;
911 };
912 return false;
913 }
914
920 static inline bool is_group(_In_ element_t code)
921 {
922 if (is_block(code) ||
923 is_html_content(code) ||
924 is_head_content(code)) return true;
925 switch (code) {
926 case element_t::col:
927 case element_t::colgroup:
928 case element_t::dd:
929 case element_t::dir:
930 case element_t::dt:
931 case element_t::frame:
932 case element_t::iframe:
933 case element_t::legend:
934 case element_t::td:
935 case element_t::th:
936 case element_t::tr:
937 return true;
938 };
939 return false;
940 }
941
950 static inline bool may_contain(_In_ element_t parent, _In_ element_t child)
951 {
952 if (child == element_t::unknown || child == element_t::comment)
953 return true;
954 if (is_fontstyle(parent) || is_phrase(parent))
955 return is_inline(child);
956 if (is_heading(parent))
957 return is_inline(child);
958
959 switch (parent) {
960 case element_t::a: return is_inline(child) && child != element_t::a;
961 case element_t::address: return is_inline(child) || child == element_t::p;
962 case element_t::applet: return is_flow(child) || child == element_t::param;
963 case element_t::area: return false;
964 case element_t::base: return false;
965 case element_t::basefont: return false;
966 case element_t::bdo: return is_inline(child);
967 case element_t::blockquote: return is_flow(child);
968 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
969 case element_t::br: return false;
970 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
971 case element_t::caption: return is_inline(child);
972 case element_t::center: return is_flow(child);
973 case element_t::col: return false;
974 case element_t::colgroup: return child == element_t::col;
975 case element_t::comment: return child == element_t::CDATA;
976 case element_t::dd: return is_flow(child);
977 case element_t::del: return is_flow(child);
978 case element_t::dir: return child == element_t::li;
979 case element_t::div: return is_flow(child);
980 case element_t::dl: return child == element_t::dt || child == element_t::dd;
981 case element_t::dt: return is_inline(child);
982 case element_t::embed: return is_flow(child) || child == element_t::param;
983 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
984 case element_t::font: return is_inline(child);
985 case element_t::form: return is_flow(child) && child != element_t::form;
986 case element_t::frame: return false;
987 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
988 case element_t::head: return is_head_content(child) || is_head_misc(child);
989 case element_t::hr: return false;
990 case element_t::html: return is_html_content(child);
991 case element_t::iframe: return is_flow(child);
992 case element_t::img: return false;
993 case element_t::input: return false;
994 case element_t::ins: return is_flow(child);
995 case element_t::isindex: return false;
996 case element_t::label: return is_inline(child) && child != element_t::label;
997 case element_t::legend: return is_inline(child);
998 case element_t::li: return is_flow(child);
999 case element_t::link: return false;
1000 case element_t::listing: return child == element_t::CDATA;
1001 case element_t::map: return is_block(child) || child == element_t::area;
1002 case element_t::marquee: return is_flow(child);
1003 case element_t::menu: return child == element_t::li;
1004 case element_t::meta: return false;
1005 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1006 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1007 case element_t::noscript: return is_flow(child);
1008 case element_t::noembed: return is_flow(child);
1009 case element_t::object: return is_flow(child) || child == element_t::param;
1010 case element_t::ol: return child == element_t::li;
1011 case element_t::optgroup: return child == element_t::option;
1012 case element_t::option: return child == element_t::PCDATA;
1013 case element_t::p: return is_inline(child);
1014 case element_t::param: return false;
1015 case element_t::plaintext: return is_flow(child);
1016 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1017 case element_t::q: return is_inline(child);
1018 case element_t::rt: return false;
1019 case element_t::ruby: return is_inline(child);
1020 case element_t::script: return child == element_t::CDATA;
1021 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1022 case element_t::span: return is_inline(child);
1023 case element_t::style: return child == element_t::CDATA;
1024 case element_t::sub: return is_inline(child);
1025 case element_t::sup: return is_inline(child);
1026 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1027 case element_t::tbody: return child == element_t::tr;
1028 case element_t::td: return is_flow(child);
1029 case element_t::textarea: return child == element_t::PCDATA;
1030 case element_t::tfoot: return child == element_t::tr;
1031 case element_t::th: return is_flow(child);
1032 case element_t::thead: return child == element_t::tr;
1033 case element_t::title: return child == element_t::PCDATA;
1034 case element_t::tr: return child == element_t::td || child == element_t::th;
1035 case element_t::ul: return child == element_t::li;
1036 case element_t::wbr: return false;
1037 case element_t::unknown: return true;
1038 }
1039 return false;
1040 }
1041
1049 template <class T>
1050 static inline bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1051 {
1052 _Assume_(attr_name || !num_chars);
1053 switch (code) {
1054 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1055 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1056 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1057 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1058 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1059 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1060 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1061 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1062 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1063 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX);
1064 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1065 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) ||
1066 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1067 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX);
1068 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1069 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1070 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX);
1071 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1072 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1073 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) ||
1074 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1075 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1076 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1077 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) ||
1078 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) ||
1079 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1080 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1081 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX);
1082 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) ||
1083 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) ||
1084 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) ||
1085 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) ||
1086 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) ||
1087 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX);
1088 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX);
1089 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX);
1090 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1091 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1092 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX);
1093 }
1094 return false;
1095 }
1096
1104 template <class T>
1105 static inline bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1106 {
1107 _Assume_(attr_name || !num_chars);
1108 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX))
1109 return true;
1110 switch (code) {
1111 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1112 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1113 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1114 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1115 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX);
1116 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX);
1117 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1118 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX);
1119 }
1120 return false;
1121 }
1122 };
1123
1124 class sequence;
1125 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1126
1131 {
1132 public:
1133 stdex::parser::html_sequence_t type;
1136
1137 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1138 type(_type),
1139 interval(start, end),
1140 parent(_parent)
1141 {}
1142
1143 virtual ~sequence() {} // make polymorphic
1144 };
1145
1149 class element : public sequence
1150 {
1151 public:
// Constructs an element from a parsed tag.
//
// tag:    Parsed tag; its `name` and `attributes` are moved into this object
// src:    Source text the tag intervals refer to; used to look up the element code from the tag name
// parent: Parent sequence (nullptr by default)
//
// The base class is initialized from tag.type and tag.interval, and `code` is computed from
// tag.name, before `name` and `attributes` are moved out of the tag.
1152 template <class T>
1153 inline element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1154 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1155 code(element_code(src + tag.name.start, tag.name.size())),
1156 name(std::move(tag.name)),
1157 attributes(std::move(tag.attributes))
1158 {}
1159
1160 template <class T>
1161 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1162 {
1163 static const struct {
1164 const char* name;
1165 element_t code;
1166 } mapping[] = {
1167 { "a", element_t::a, },
1168 { "abbr", element_t::abbr, },
1169 { "acronym", element_t::acronym, },
1170 { "address", element_t::address, },
1171 { "applet", element_t::applet, },
1172 { "area", element_t::area, },
1173 { "b", element_t::b, },
1174 { "base", element_t::base, },
1175 { "basefont", element_t::basefont, },
1176 { "bdo", element_t::bdo, },
1177 { "bgsound", element_t::bgsound, },
1178 { "big", element_t::big, },
1179 { "blink", element_t::blink, },
1180 { "blockquote", element_t::blockquote, },
1181 { "body", element_t::body, },
1182 { "br", element_t::br, },
1183 { "button", element_t::button, },
1184 { "caption", element_t::caption, },
1185 { "center", element_t::center, },
1186 { "cite", element_t::cite, },
1187 { "code", element_t::code, },
1188 { "col", element_t::col, },
1189 { "colgroup", element_t::colgroup, },
1190 { "comment", element_t::comment, },
1191 { "dd", element_t::dd, },
1192 { "del", element_t::del, },
1193 { "dfn", element_t::dfn, },
1194 { "dir", element_t::dir, },
1195 { "div", element_t::div, },
1196 { "dl", element_t::dl, },
1197 { "dt", element_t::dt, },
1198 { "em", element_t::em, },
1199 { "embed", element_t::embed, },
1200 { "fieldset", element_t::fieldset, },
1201 { "font", element_t::font, },
1202 { "form", element_t::form, },
1203 { "frame", element_t::frame, },
1204 { "frameset", element_t::frameset, },
1205 { "h1", element_t::h1, },
1206 { "h2", element_t::h2, },
1207 { "h3", element_t::h3, },
1208 { "h4", element_t::h4, },
1209 { "h5", element_t::h5, },
1210 { "h6", element_t::h6, },
1211 { "head", element_t::head, },
1212 { "hr", element_t::hr, },
1213 { "html", element_t::html, },
1214 { "i", element_t::i, },
1215 { "iframe", element_t::iframe, },
1216 { "img", element_t::img, },
1217 { "input", element_t::input, },
1218 { "ins", element_t::ins, },
1219 { "isindex", element_t::isindex, },
1220 { "kbd", element_t::kbd, },
1221 { "label", element_t::label, },
1222 { "legend", element_t::legend, },
1223 { "li", element_t::li, },
1224 { "link", element_t::link, },
1225 { "listing", element_t::listing, },
1226 { "map", element_t::map, },
1227 { "marquee", element_t::marquee, },
1228 { "menu", element_t::menu, },
1229 { "meta", element_t::meta, },
1230 { "nextid", element_t::nextid, },
1231 { "nobr", element_t::nobr, },
1232 { "noembed", element_t::noembed, },
1233 { "noframes", element_t::noframes, },
1234 { "noscript", element_t::noscript, },
1235 { "object", element_t::object, },
1236 { "ol", element_t::ol, },
1237 { "optgroup", element_t::optgroup, },
1238 { "option", element_t::option, },
1239 { "p", element_t::p, },
1240 { "param", element_t::param, },
1241 { "plaintext", element_t::plaintext, },
1242 { "pre", element_t::pre, },
1243 { "q", element_t::q, },
1244 { "rt", element_t::rt, },
1245 { "ruby", element_t::ruby, },
1246 { "s", element_t::s, },
1247 { "samp", element_t::samp, },
1248 { "script", element_t::script, },
1249 { "select", element_t::select, },
1250 { "small", element_t::small, },
1251 { "span", element_t::span, },
1252 { "strike", element_t::strike, },
1253 { "strong", element_t::strong, },
1254 { "style", element_t::style, },
1255 { "sub", element_t::sub, },
1256 { "sup", element_t::sup, },
1257 { "table", element_t::table, },
1258 { "tbody", element_t::tbody, },
1259 { "td", element_t::td, },
1260 { "textarea", element_t::textarea, },
1261 { "tfoot", element_t::tfoot, },
1262 { "th", element_t::th, },
1263 { "thead", element_t::thead, },
1264 { "title", element_t::title, },
1265 { "tr", element_t::tr, },
1266 { "tt", element_t::tt, },
1267 { "u", element_t::u, },
1268 { "ul", element_t::ul, },
1269 { "var", element_t::var, },
1270 { "wbr", element_t::wbr, },
1271 { "xmp", element_t::xmp, },
1272 };
1273#ifdef _DEBUG
1274 // The mapping table MUST be sorted and all names in lowercase.
1275 for (size_t i = 1; i < _countof(mapping); i++)
1276 _Assume_(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1277 for (size_t i = 0; i < _countof(mapping); i++) {
1278 for (size_t j = 0; mapping[i].name[j]; j++)
1279 _Assume_(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
1280 }
1281#endif
1282 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1283 size_t m = (i + j) / 2;
1284 int r = 0;
1285 for (size_t i1 = 0, i2 = 0;;) {
1286 if (!mapping[m].name[i1]) {
1287 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1288 break;
1289 }
1290 if (i2 >= num_chars || !name[i2]) {
1291 r = 1;
1292 break;
1293 }
1294
1295 auto chr = static_cast<char>(stdex::tolower(name[i2++]));
1296 if (mapping[m].name[i1] > chr) {
1297 r = 1;
1298 break;
1299 }
1300 if (mapping[m].name[i1] < chr) {
1301 r = -1;
1302 break;
1303 }
1304 i1++;
1305 }
1306
1307 if (r < 0)
1308 i = m + 1;
1309 else if (r > 0)
1310 j = m;
1311 else
1312 return mapping[m].code;
1313 }
1314 return element_t::unknown;
1315 }
1316
1317 public:
1318 element_t code;
1320 std::vector<stdex::parser::html_attribute> attributes;
1321 };
1322
1323 class element_end;
1324
// Element created from a starting tag; in addition to the element data it keeps a pointer
// (`end`) to the sequence that ends it.
// NOTE(review): the declaration of the `end` member is not visible in this listing.
1328 class element_start : public element
1329 {
1330 public:
// tag:    Parsed tag, forwarded to the element constructor
// src:    Source text the tag intervals refer to
// parent: Parent sequence (nullptr by default)
// _end:   Ending sequence (nullptr by default)
1331 template <class T>
1332 inline element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1333 element(std::move(tag), src, parent),
1334 end(_end)
1335 {}
1336
1337 public:
1339 };
1340
// Sequence created from an ending tag; keeps the element code, the tag name and a pointer
// (`start`) to the matching starting element.
// NOTE(review): the declarations of the `name` and `start` members are not visible in this listing.
1344 class element_end : public sequence
1345 {
1346 public:
// tag:    Parsed tag; its `name` is moved into this object
// src:    Source text the tag intervals refer to; used to look up the element code
// parent: Parent sequence (nullptr by default)
// _start: Matching starting element (nullptr by default)
1347 template <class T>
1348 inline element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1349 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1350 code(element::element_code(src + tag.name.start, tag.name.size())),
1351 name(std::move(tag.name)),
1352 start(_start)
1353 {}
1354
1355 public:
1356 element_t code; // Element code, looked up from the tag name
1359 };
1360
// Sequence created from a declaration tag; keeps the tag name and attributes.
// NOTE(review): the declaration of the `name` member is not visible in this listing.
1364 class declaration : public sequence
1365 {
1366 public:
// tag:    Parsed tag; its `name` and `attributes` are moved into this object
// parent: Parent sequence (nullptr by default)
1367 template <class T>
1368 inline declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1369 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1370 name(std::move(tag.name)),
1371 attributes(std::move(tag.attributes))
1372 {}
1373
1374 public:
1376 std::vector<stdex::parser::html_attribute> attributes; // Attributes moved from the parsed tag
1377 };
1378
// Sequence created from a comment tag; `content` takes over the tag's `name` interval.
// NOTE(review): the declaration of the `content` member is not visible in this listing.
1382 class comment : public sequence
1383 {
1384 public:
// tag:    Parsed tag; its `name` is moved into `content`
// parent: Parent sequence (nullptr by default)
1385 template <class T>
1386 inline comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1387 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1388 content(std::move(tag.name))
1389 {}
1390
1391 public:
1393 };
1394
// Sequence created from an instruction tag; `content` takes over the tag's `name` interval.
// NOTE(review): the declaration of the `content` member is not visible in this listing.
1398 class instruction : public sequence
1399 {
1400 public:
// tag:    Parsed tag; its `name` is moved into `content`
// parent: Parent sequence (nullptr by default)
1401 template <class T>
1402 inline instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1403 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1404 content(std::move(tag.name))
1405 {}
1406
1407 public:
1409 };
1410
// Entity definition: a name paired with its replacement text.
// NOTE(review): the declaration of the `name` member used elsewhere in this file is not visible in this listing.
1414 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1415 struct entity
1416 {
1418 std::basic_string<_Elem, _Traits, _Alloc> value; // Replacement text of the entity
1419 };
1420
1424 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1425 class parser;
1426
1430 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1432 {
1433 public:
// Constructs an empty document: nothing parsed yet, charset set to the system charset,
// and not inside a CDATA/RCDATA section.
// NOTE(review): some member initializers are not visible in this listing.
1434 document() :
1435 m_num_parsed(0),
1436 m_charset(stdex::charset_id::system),
1437
1438 // Declaration parsing data
1441 m_is_cdata(false),
1442 m_is_rcdata(false),
1443
1444 // Element parsing data
1446 {}
1447
// Resets the document: discards the source text, parsed sequences, entities and the
// stack of open elements, and restores the parsing state set by the constructor.
// NOTE(review): one statement of the "Declaration parsing data" group is not visible in this listing.
1451 void clear()
1452 {
1453 m_source.clear();
1454 m_num_parsed = 0;
1455 m_charset = stdex::charset_id::system;
1456
1457 // Declaration parsing data
1459 m_is_cdata = m_is_rcdata = false;
1460 m_entities.clear();
1461
1462 // Element parsing data
1463 m_sequences.clear();
1464
1465 m_element_stack.clear();
1466 m_is_special_element = false;
1467 }
1468
// Appends source text to the document and parses as much of it as possible into sequences.
//
// source:    Text to append
// num_chars: Maximum number of characters in `source` (the text may also be zero-terminated)
//
// Parsing resumes at m_num_parsed. Text between recognized constructs is emitted as `text`
// sequences once the next construct is recognized; trailing unparsed text stays pending.
// Throws std::invalid_argument on an unknown tag type; replace_entities() may throw
// std::runtime_error on an undefined entity.
//
// NOTE(review): several original lines (some statements and branch guards) are missing from
// this listing; the comments below describe only what is visible.
1472 void append(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars = SIZE_MAX)
1473 {
1474 _Assume_(source || !num_chars);
1475 m_source.append(source, stdex::strnlen(source, num_chars));
// From here on, work on the accumulated text so that all offsets refer to the whole document.
1476 source = m_source.data();
1477 num_chars = m_source.size();
1478
1479 for (size_t i = m_num_parsed; i < num_chars;) {
// Inside a CDATA/RCDATA section only the section end is recognized.
1480 if (m_is_cdata || m_is_rcdata) {
1481 if (m_condition_end.match(source, i, num_chars)) {
1482 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1483 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1484 m_num_parsed, i,
1485 active_element()))));
1486 m_is_cdata = m_is_rcdata = false;
1487 i = m_num_parsed = m_condition_end.interval.end;
1488 continue;
1489 }
1490 goto next_char;
1491 }
1492
// NOTE(review): the guard opening this branch is missing from the listing.
1494 if (m_condition_end.match(source, i, num_chars)) {
1496 i = m_num_parsed = m_condition_end.interval.end;
1497 continue;
1498 }
1499 goto next_char;
1500 }
1501
// End of a condition: flush pending text, then skip the condition end marker.
1502 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1503 if (m_num_parsed < i)
1504 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1505
1507 i = m_num_parsed = m_condition_end.interval.end;
1508 continue;
1509 }
1510
// Start of a condition: the condition keyword is taken after entity replacement.
// NOTE(review): the statements of the IGNORE and default branches are missing from the listing.
1511 if (m_condition_start.match(source, i, num_chars)) {
1512 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1513 if (!stdex::strcmp(condition_src.c_str(), "CDATA"))
1514 m_is_cdata = true;
1515 else if (!stdex::strcmp(condition_src.c_str(), "RCDATA"))
1516 m_is_rcdata = true;
1519 else if (!stdex::strcmp(condition_src.c_str(), "IGNORE"))
1521 else
1523
1524 i = m_num_parsed = m_condition_start.interval.end;
1525 continue;
1526 }
1527
// NOTE(review): the guard opening this branch is missing from the listing; presumably it
// tests m_is_special_element - confirm. Within the branch, only the ending tag of the
// active element is recognized; everything else is skipped as text.
1529 auto parent = active_element();
1530 _Assume_(parent);
1531 if (m_tag.match(source, i, num_chars) &&
1532 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1533 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1534 {
1535 if (m_num_parsed < i)
1536 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1537 i = m_num_parsed = m_tag.interval.end;
1538 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1539 parent->end = e.get();
1540 m_sequences.push_back(std::move(e));
1541 m_element_stack.pop_back();
1542 m_is_special_element = false;
1543 continue;
1544 }
1545 goto next_char;
1546 }
1547
// A tag: flush pending text, then create the sequence matching the tag type.
1548 if (m_tag.match(source, i, num_chars)) {
1549 if (m_num_parsed < i)
1550 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1551 i = m_num_parsed = m_tag.interval.end;
1552
1553 switch (m_tag.type) {
1554 case stdex::parser::html_sequence_t::element:
1555 case stdex::parser::html_sequence_t::element_start: {
1556 std::unique_ptr<element> e(
1557 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1558 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1559 nullptr);
1560
1561 // Does this tag end any of the started elements?
// Walk the stack from the innermost element outwards; every element that may not
// contain the new one is ended by it and removed from the stack.
1562 for (size_t j = m_element_stack.size(); j--; ) {
1563 auto starting_tag = m_element_stack[j];
1564 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1565 if (element_traits::may_contain(starting_tag->code, e->code)) {
1566 e->parent = starting_tag;
1567 break;
1568 }
1569 e->parent = starting_tag->parent;
1570 starting_tag->end = e.get();
1571 m_element_stack.resize(j);
1572 }
1573
// A starting tag with an immediate span ends itself; any other becomes the active element.
1574 if (e->type == stdex::parser::html_sequence_t::element_start) {
1575 auto e_start = static_cast<element_start*>(e.get());
1576 if (element_traits::span(e->code) == element_span_t::immediate)
1577 e_start->end = e.get();
1578 else {
1579 m_element_stack.push_back(e_start);
1580 switch (e->code) {
1581 case element_t::code:
1582 case element_t::comment:
1583 case element_t::script:
1584 case element_t::style:
1585 m_is_special_element = true;
1586 break;
1587 }
1588 }
1589 }
1590
// Charset detection, performed only while the charset is still the system default.
1591 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1592 bool is_content_type = false;
1593 stdex::parser::html_attribute* content_attr = nullptr;
1594 for (auto& attr : e->attributes) {
1595 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) &&
1596 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX))
1597 is_content_type = true;
1598 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX))
1599 content_attr = &attr;
1600 }
1601 if (is_content_type && content_attr) {
1602 // <meta http-equiv="Content-Type" content="..."> found.
// NOTE(review): the declaration of `content` is missing from the listing.
1604 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1605 content.charset)
1606 {
// Copy the charset name into a narrow string, one character at a time.
1607 std::string str;
1608 str.reserve(content.charset.size());
1609 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1610 str.push_back(static_cast<char>(source[j]));
1611 m_charset = stdex::charset_from_name(str.c_str());
1612 }
1613 }
1614 }
1615
1616 m_sequences.push_back(std::move(e));
1617 break;
1618 }
1619 case stdex::parser::html_sequence_t::element_end: {
1620 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1621
// Find the innermost started element this tag ends: same code, or for unknown
// elements the same name (case-insensitive). Elements above it are dropped from the stack.
1622 for (size_t j = m_element_stack.size(); j--; ) {
1623 auto starting_tag = m_element_stack[j];
1624 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1625 if (starting_tag->code == e->code ||
1626 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size()))
1627 {
1628 e->start = starting_tag;
1629 e->parent = starting_tag->parent;
1630 starting_tag->end = e.get();
1631 m_element_stack.resize(j);
1632 break;
1633 }
1634 }
1635
1636 m_sequences.push_back(std::move(e));
1637 break;
1638 }
1639 case stdex::parser::html_sequence_t::declaration:
// An "entity" declaration with at least four parts: attribute [1] must be "%",
// [2] is the entity name, [3] the value (unless it is SYSTEM or PUBLIC).
1640 if (m_tag.attributes.size() > 3 &&
1641 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX))
1642 {
1643 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1644 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1645 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1646 {
1647 std::unique_ptr<entity<_Elem, _Traits, _Alloc>> e(new entity<_Elem, _Traits, _Alloc>());
1648 e->name = m_tag.attributes[2].name;
1649 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1650 m_entities.push_back(std::move(e));
1651 }
1652
1653 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1654 }
1655 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1656 break;
1657 case stdex::parser::html_sequence_t::comment:
1658 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1659 break;
1660 case stdex::parser::html_sequence_t::instruction:
1661 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1662 break;
1663 default:
1664 throw std::invalid_argument("unknown tag type");
1665 }
1666
1667 continue;
1668 }
1669
1670 next_char:
1671 if (m_any_char.match(source, i, num_chars)) {
1672 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1673 i = m_any_char.interval.end;
1674 }
1675 else
1676 break;
1677 }
1678 }
1679
// NOTE(review): the function header is missing from this listing; presumably this is the
// body of finalize(), which assign() calls - confirm against the full header.
// Emits any still unparsed trailing source as a text sequence, marks the whole source as
// parsed and empties the stack of open elements.
1684 {
1685 size_t i = m_source.size();
1686 if (m_num_parsed < i)
1687 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1688 m_num_parsed = i;
1689 m_element_stack.clear();
1690 }
1691
// Replaces the document content: clears the document, appends and parses the given source,
// then finalizes parsing.
//
// source:    Source text
// num_chars: Maximum number of characters in `source` (the text may also be zero-terminated)
1695 inline void assign(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars = SIZE_MAX)
1696 {
1697 clear();
1698 append(source, num_chars);
1699 finalize();
1700 }
1701
// Returns a read-only reference to the accumulated document source text.
1705 inline const std::basic_string<_Elem, _Traits, _Alloc>& source() const { return m_source; }
1706
1707 friend class parser<_Elem, _Traits, _Alloc>;
1708
1709 protected:
// NOTE(review): the function header is missing from this listing; presumably this is the
// body of active_element(), which append() calls - confirm against the full header.
// Returns the innermost open element (top of the element stack), or nullptr when the stack is empty.
1714 {
1715 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1716 }
1717
1721 std::basic_string<_Elem, _Traits, _Alloc> replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem* input, _In_ size_t num_chars) const
1722 {
1723 _Assume_(input || !num_chars);
1724 const size_t num_entities = m_entities.size();
1725 const _Elem* source = m_source.data();
1726 std::basic_string<_Elem, _Traits, _Alloc> output;
1727 for (size_t i = 0; i < num_chars && input[i];) {
1728 if (input[i] == '%') {
1729 for (size_t j = 0; j < num_entities; j++) {
1730 auto& e = m_entities[j];
1731 size_t entity_size = e->name.size();
1732 if (i + entity_size + 1 < num_chars &&
1733 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1734 input[i + entity_size + 1] == ';')
1735 {
1736 output += e->value;
1737 i += entity_size + 2;
1738 goto next_char;
1739 }
1740 }
1741 throw std::runtime_error("undefined entity");
1742 }
1743 output += input[i++];
1744 next_char:;
1745 }
1746 return output;
1747 }
1748
1749 protected:
1750 std::basic_string<_Elem, _Traits, _Alloc> m_source;
1752 stdex::charset_id m_charset;
1753
1754 // Declaration parsing data
1762 std::vector<std::unique_ptr<entity<_Elem, _Traits, _Alloc>>> m_entities;
1763
1764 // Element parsing data
1766 sequence_store m_sequences;
1767 std::vector<element_start*> m_element_stack;
1769 };
1770
// Token types, stored in token::type.
1774 enum class token_t {
1775 root = 0, // Default type of a token
1776 complete, // Default type of a text_token
1777 starting, // Type of a starting_token
1778 ending, // Type of the closing-tag tokens created by parser::end_tokens()
1779 url, // Type of a url_token
1780 };
1781
// Buffer size sufficient for a token tag as written by token::append_tag().
1785 constexpr size_t token_tag_max =
1786 sizeof(void*) * 2 // Memory address in hexadecimal
1787 + 2 // Leading and trailing delimiters (token_tag_start and token_tag_end)
1788 + 1; // Zero terminator
1789
// Character marking the start of a token tag embedded in text.
1794 constexpr char token_tag_start = '\x12';
1795
// Character marking the end of a token tag embedded in text.
1800 constexpr char token_tag_end = '\x13';
1801
// Base class of tokens. A token is referenced from text by a "tag": token_tag_start, the
// token's memory address in hexadecimal, token_tag_end.
// NOTE(review): the declaration of the `sequence` member initialized by the constructor is
// not visible in this listing.
1805 class token
1806 {
1807 protected:
// Protected constructor; the parser class template is a friend.
1808 inline token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1809 type(_type),
1810 sequence(_sequence),
1811 data(_data)
1812 {}
1813
1814 template<class _Elem, class _Traits, class _Alloc>
1815 friend class parser;
1816
1817 public:
1818 virtual ~token() {} // make polymorphic
1819
// Appends this token's tag to a narrow string.
// Returns the number of characters appended (growth of `str`).
1827 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
1828 inline size_t append_tag(_Inout_ std::basic_string<char, _Traits, _Alloc>& str) const
1829 {
1830 size_t n = str.size();
1831 // Use %X instead of %p to omit leading zeros and save space.
1832 stdex::appendf(str, "%c%zX%c", stdex::locale_C.get(), token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
1833 return str.size() - n;
1834 }
1835
// Appends this token's tag to a wide string.
// Returns whatever stdex::appendf returns - presumably the number of characters appended; confirm.
1843 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
1844 inline size_t append_tag(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& str) const
1845 {
1846 // Use %X instead of %p to omit leading zeros and save space.
1847 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C.get(), static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
1848 }
1849
// Parses a token tag at str[offset].
//
// str:    Zero-terminated text
// offset: Position to parse at; advanced past the tag on success, left unchanged otherwise
//
// Returns the token whose address is encoded in the tag, or nullptr when there is no
// complete tag at `offset`. Throws std::invalid_argument when the encoded address is null.
// The address is converted to a pointer without any validation.
1850 template<class T>
1851 static inline token* parse_tag(const T* str, size_t& offset)
1852 {
1853 if (str[offset] != static_cast<T>(token_tag_start))
1854 return nullptr;
1855
1856 // Locate tag end.
1857 size_t end;
1858 for (end = offset + 1; ; end++) {
1859 if (!str[end])
1860 return nullptr;
1861 if (str[end] == token_tag_end)
1862 break;
1863 }
1864
1865 // Parse hexadecimal token memory address.
1866 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
1867 if (!t)
1868 throw std::invalid_argument("null token");
1869 offset = end + 1;
1870 return t;
1871 }
1872
1873 public:
1874 token_t type; // Token type
1876 uintptr_t data; // Value passed to the constructor; not interpreted by this class
1877 };
1878
1879 using token_vector = std::vector<std::unique_ptr<token>>;
1880 using token_list = std::list<token*>;
1881
// Bit flags combined in text_token::text_type.
1885 enum text_type_flag_t : uint32_t {
1886 has_tokens = 1 << 0, // Text contains tags referencing other tokens
1887 has_text = 1 << 1, // Text contains content that parser::link() escapes on output
1888 is_title = 1 << 2, // Set by the parser on tokens built from localizable attribute values
1889 is_bullet = 1 << 3,
1890 };
1891
// Token carrying text, text type flags and a mapping.
// NOTE(review): the first mem-initializer of the constructor (the base class) is not
// visible in this listing.
1895 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1896 class text_token : public token
1897 {
1898 protected:
// Protected constructor; parser<_Elem, _Traits, _Alloc> is a friend.
// `text` is initialized from the first `num_chars` characters of `_text`.
1899 inline text_token(
1900 _In_ token_t type = token_t::complete,
1901 _In_reads_or_z_opt_(num_chars) const _Elem* _text = nullptr, _In_ size_t num_chars = 0,
1902 _In_ uint32_t _text_type = 0,
1903 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
1905 text(_text, num_chars),
1906 text_type(_text_type)
1907 {}
1908
1909 friend class parser<_Elem, _Traits, _Alloc>;
1910
1911 public:
1912 std::basic_string<_Elem, _Traits, _Alloc> text; // Token text; may contain tags of other tokens
1913 uint32_t text_type; // Combination of text_type_flag_t flags
1914 stdex::mapping_vector<size_t> mapping; // Mappings recorded while the text is built by the parser
1915 };
1916
// Text token of type token_t::starting; additionally keeps the element name and the
// sequence that ends the element.
// NOTE(review): the declaration of the `end_sequence` member is not visible in this listing.
1920 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1921 class starting_token : public text_token<_Elem, _Traits, _Alloc>
1922 {
1923 protected:
// Protected constructor; parser<_Elem, _Traits, _Alloc> is a friend.
// `name` is initialized from the first `num_chars_name` characters of `_name`.
1924 inline starting_token(
1925 _In_reads_or_z_opt_(num_chars_text) const _Elem* _text = nullptr, _In_ size_t num_chars_text = 0,
1926 _In_reads_or_z_opt_(num_chars_name) const _Elem* _name = nullptr, _In_ size_t num_chars_name = 0,
1927 _In_ uint32_t text_type = 0,
1928 _In_opt_ stdex::html::sequence* sequence = nullptr,
1929 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
1930 _In_ uintptr_t data = 0) :
1931 text_token(token_t::starting, _text, num_chars_text, text_type, sequence, data),
1932 name(_name, num_chars_name),
1933 end_sequence(_end_sequence)
1934 {}
1935
1936 friend class parser<_Elem, _Traits, _Alloc>;
1937
1938 public:
1939 std::basic_string<_Elem, _Traits, _Alloc> name; // Element name; used to build the closing tag in parser::end_tokens()
1941 };
1942
// Encoding applied to a URL when parser::link() writes it back to the output.
1946 enum class token_url_t {
1947 plain = 0, // URL is not using any particular encoding scheme (as-is)
1948 sgml, // URL is encoded using SGML entities
1949 css, // URL is encoded using CSS escaping scheme
1950 };
1951
// Token of type token_t::url, carrying a URL and the encoding to use when writing it out.
1955 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1956 class url_token : public token
1957 {
1958 protected:
// Protected constructor; parser<_Elem, _Traits, _Alloc> is a friend.
//
// _url, num_chars: URL text; `url` is initialized from the first `num_chars` characters
// _encoding:       Output encoding of the URL
// sequence, data:  Forwarded to the token base class
1959 inline url_token(
1960 _In_reads_or_z_opt_(num_chars) const _Elem* _url = nullptr, _In_ size_t num_chars = 0,
1961 token_url_t _encoding = token_url_t::plain,
1962 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
1963 token(token_t::url, sequence, data),
1964 url(_url, num_chars),
1965 encoding(_encoding)
1966 {}
1967
1968 friend class parser<_Elem, _Traits, _Alloc>;
1969
1970 public:
1971 std::basic_string<_Elem, _Traits, _Alloc> url; // URL text
1972 token_url_t encoding; // Encoding used when the URL is written to the output
1973 };
1974
1980 std::list<stdex::html::token*> active_tokens;
1981 size_t word_index;
1983 };
1984
1985 using inserted_token_list = std::list<inserted_token>;
1986
1987 template<class _Elem, class _Traits, class _Alloc>
1989 {
1990 public:
// Constructs the parser.
//
// url, num_chars: URL text; the stored copy stops at a zero terminator or `num_chars` characters
// parse_frames:   Stored in m_parse_frames; when false, parsing a <frameset> throws
// progress:       Optional progress indicator
//
// NOTE(review): the first parameter and several mem-initializers are not visible in this listing.
1991 inline parser(
1993 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
1994 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
1996 m_url(url, stdex::strnlen(url, num_chars)),
1997 m_parse_frames(parse_frames),
1999 m_source(nullptr)
2000 {}
2001
// NOTE(review): the function header and one statement are missing from this listing;
// presumably this is the body of the public parse() entry point - confirm.
// Expects no tokens to exist yet, initializes the progress indicator (if any) to the size of
// the document source, then parses the document's sequences up to their end.
2006 {
2007 _Assume_(m_tokens.empty());
2008
2009 if (m_progress) {
2010 m_progress->set_range(0, m_document.source().size());
2011 m_progress->set(0);
2012 }
2013
2014 m_source = m_document.source().data();
2016 return parse(m_document.m_sequences.end());
2017 }
2018
2025 static void link(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _In_ const text_token<_Elem, _Traits, _Alloc>* t)
2026 {
2027 _Assume_(t);
2028 _Assume_(
2029 t->type == token_t::complete ||
2030 t->type == token_t::starting ||
2031 t->type == token_t::ending ||
2032 t->type == token_t::root);
2033
2034 if (t->text_type & has_tokens) {
2035 const _Elem* root = t->text.data();
2036 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2037 _Assume_(root[i] != token_tag_end);
2038 const token* t2 = token::parse_tag(root, i);
2039 if (t2) {
2040 switch (t2->type) {
2041 case token_t::complete:
2042 case token_t::starting:
2043 case token_t::ending:
2044 case token_t::root:
2045 link(source, dynamic_cast<const text_token<_Elem, _Traits, _Alloc>*>(t2));
2046 break;
2047 case token_t::url: {
2048 auto t2_url = dynamic_cast<const url_token<_Elem, _Traits, _Alloc>*>(t2);
2049 switch (t2_url->encoding) {
2050 case token_url_t::plain:
2051 source += t2_url->url;
2052 break;
2053 case token_url_t::sgml:
2054 escape(source, t2_url->url.data(), t2_url->url.size());
2055 break;
2056 case token_url_t::css:
2057 css_escape(source, t2_url->url.data(), t2_url->url.size());
2058 break;
2059 default:
2060 throw std::invalid_argument("unsupported URL encoding");
2061 }
2062 break;
2063 }
2064 default:
2065 throw std::invalid_argument("unsupported token type");
2066 }
2067 }
2068 else if (t->text_type & has_text) {
2069 escape_min(source, root[i]);
2070 i++;
2071 }
2072 else
2073 source += root[i++];
2074 }
2075 }
2076 else if (t->text_type & has_text) {
2077 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2078 escape_min(source, t->text.data(), t->text.size());
2079 }
2080 else
2081 source += t->text;
2082 }
2083
2092 static void start_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2093 {
2094 for (; from != new_tokens.cend(); ++from) {
2095 auto t = *from;
2096 t->append_tag(source);
2097 active_tokens.push_back(t);
2098 }
2099 }
2100
// Ends the active tokens that are no longer part of new_tokens.
//
// source:        String the tags of the generated ending tokens are appended to
// active_tokens: Currently active tokens; the ended ones are removed
// new_tokens:    Tokens that should be active
//
// The two lists are compared from the front. At the first mismatch (or when new_tokens runs
// out) every active token from the end of the list back to the mismatch position gets a
// "</name>" ending token created, registered with the parser and its tag appended to source.
//
// Returns an iterator into new_tokens: the first token not matched against active_tokens.
2110 token_list::const_iterator end_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2111 {
2112 // Skip matching tokens in active_tokens and new_tokens.
2113 token_list::const_iterator i1, i2;
2114 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2115 if (i2 == new_tokens.cend() || *i1 != *i2) {
2116 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2117 // End tokens not relevant anymore in reverse order of starting.
2118 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2119 auto t1 = dynamic_cast<starting_token<_Elem, _Traits, _Alloc>*>(*(--i));
2120 _Assume_(t1 && t1->type == token_t::starting);
2121
// Build the closing tag "</name>" as a new ending token.
2122 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t2(new text_token<_Elem, _Traits, _Alloc>(token_t::ending));
2123 t2->text.reserve(t1->name.size() + 3);
2124 t2->text += '<';
2125 t2->text += '/';
2126 t2->text += t1->name;
2127 t2->text += '>';
2128 append_token(std::move(t2), source);
2129
2130 // Pop the active token.
// i1 must be compared before erasing: once the mismatch position itself is erased, stop.
2131 if (i1 == i) {
2132 active_tokens.erase(i);
2133 break;
2134 }
2135 active_tokens.erase(i);
2136 i = active_tokens.cend();
2137 }
// i1 has been erased above; leave the outer loop without touching it again.
2138 break;
2139 }
2140 }
2141 return i2;
2142 }
2143
2153 void append_inserted_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ inserted_token_list& inserted_tokens,
2154 _In_ size_t word_index, _In_ bool after_word,
2155 _Inout_ token_list& active_tokens)
2156 {
2157 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2158 auto& t = *i;
2159 _Assume_(t.token);
2160 if (t.word_index == word_index && t.after_word == after_word) {
2161 if (t.token->type != token_t::ending)
2162 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2163 t.token->append_tag(source);
2164 inserted_tokens.erase(i++);
2165 }
2166 else
2167 ++i;
2168 }
2169 }
2170
2177 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2178 {
2179 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2180 auto t2 = *i2;
2181 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2182 if (i1 == a.end()) {
2183 a.push_back(t2);
2184 break;
2185 }
2186 auto t1 = *i1;
2187 if (t1 == t2)
2188 break;
2189 }
2190 }
2191 }
2192
// Intended to convert a relative URL to an absolute one using the parser's base URL.
// Not implemented yet: `rel` is currently never modified, whether or not a base URL is set.
//
// rel: URL to convert (in/out)
2196 void make_absolute_url(std::basic_string<_Elem, _Traits, _Alloc>& rel)
2197 {
2198 _Unreferenced_(rel);
2199
// Without a base URL there is nothing to resolve against.
2200 if (m_url.empty())
2201 return;
2202
2203 // TODO: Implement!
2204 }
2205
// Returns a read-only reference to the collection of tokens owned by the parser.
2209 inline const token_vector& tokens() const { return m_tokens; }
2210
2211 protected:
2219 template <class T>
2220 inline T* append_token(_Inout_ std::unique_ptr<T>&& token)
2221 {
2222 if (!token)
2223 return nullptr;
2224 auto t = token.get();
2225 m_tokens.push_back(std::move(token));
2226 return t;
2227 }
2228
2237 template <class T>
2238 inline size_t append_token(_Inout_ std::unique_ptr<T>&& token, _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source)
2239 {
2240 if (!token)
2241 return 0;
2242 size_t n = token->append_tag(source);
2243 m_tokens.push_back(std::move(token));
2244 return n;
2245 }
2246
2255 text_token<_Elem, _Traits, _Alloc>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2256 {
2258 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(new text_token<_Elem, _Traits, _Alloc>(
2259 token_t::complete,
2260 nullptr, 0,
2261 text_type,
2262 m_offset != end ? m_offset->get() : nullptr));
2263
2264 while (m_offset != end) {
2265 auto& s = *m_offset;
2266
2267 if (m_progress) {
2268 if (m_progress->cancel())
2269 throw stdex::user_cancelled();
2270 m_progress->set(s->interval.start);
2271 }
2272
2273 // No token_tag_start and token_tag_end chars, please.
2274 _Assume_(
2275 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_start)) == stdex::npos &&
2276 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_end)) == stdex::npos);
2277
2278 if (s->type == stdex::parser::html_sequence_t::text) {
2279 rel.from = s->interval.start;
2280 token->mapping.push_back(rel);
2281 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2282 rel.to = token->text.size();
2283 if (!(token->text_type & has_text) &&
2284 !stdex::isblank(m_source + s->interval.start, s->interval.size()))
2285 token->text_type |= has_text;
2286 ++m_offset;
2287 }
2288 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2289 const element* s_el = static_cast<const element*>(s.get());
2290 _Assume_(s_el);
2291 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2292 if (s_el->code == element_t::frameset && !m_parse_frames)
2293 throw std::invalid_argument("<frameset> detected");
2294
2295 {
2296 size_t offset = s->interval.start;
2297 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2298 new text_token<_Elem, _Traits, _Alloc>(token_t::complete, nullptr, 0, 0, s.get()) :
2299 new starting_token<_Elem, _Traits, _Alloc>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2300
2301 // Copy the tag contents, but mind any attributes containing localizable text.
2302 for (auto& a : s_el->attributes) {
2303 if (a.value.empty() ||
2304 stdex::isblank(m_source + a.value.start, a.value.size()))
2305 continue;
2306
2307 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
2308 t->text.append(m_source + offset, a.value.start - offset);
2309 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(new url_token<_Elem, _Traits, _Alloc>(
2310 nullptr, 0,
2311 token_url_t::sgml,
2312 s.get()));
2313 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2314 append_token(std::move(t_url), t->text);
2315 t->text_type |= has_tokens;
2316 offset = a.value.end;
2317 }
2318 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
2319 t->text.append(m_source + offset, a.value.start - offset);
2320 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t_value(new text_token<_Elem, _Traits, _Alloc>(
2321 token_t::complete,
2322 nullptr, 0,
2323 has_text | is_title,
2324 s.get()));
2325 stdex::mapping<size_t> rel_value(a.value.start, 0);
2326 t_value->mapping.push_back(rel_value);
2327 stdex::sgml2strcat(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2328 append_token(std::move(t_value), t->text);
2329 t->text_type |= has_tokens;
2330 offset = a.value.end;
2331 }
2332 }
2333
2334 t->text.append(m_source + offset, s->interval.end - offset);
2335 rel.from = s->interval.start;
2336 token->mapping.push_back(rel);
2337 rel.to += append_token(std::move(t), token->text);
2338 token->text_type |= has_tokens;
2339 }
2340 ++m_offset;
2341
2342 if (s_el_start) {
2343 if (s_el_start->code == element_t::address ||
2344 s_el_start->code == element_t::code ||
2345 s_el_start->code == element_t::comment ||
2346 s_el_start->code == element_t::cite ||
2347 s_el_start->code == element_t::kbd ||
2348 s_el_start->code == element_t::samp ||
2349 s_el_start->code == element_t::script ||
2350 s_el_start->code == element_t::style)
2351 {
2352 // Non-localizable
2353 auto s_end = s_el_start->end;
2354 _Assume_(s_end);
2355
2356 if (s->interval.end < s_end->interval.start) {
2357 if (s_el_start->code != element_t::style) {
2358 rel.from = s->interval.start;
2359 token->mapping.push_back(rel);
2360 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2362 token_t::complete,
2363 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2364 0,
2365 m_offset->get()))),
2366 token->text);
2367 }
2368 else {
2369 // Partially parse CSS. It may contain URLs we need to make absolute.
2370 auto t = parse_css(s->interval.end, s_end->interval.start);
2371 _Assume_(t);
2372 rel.from = s->interval.start;
2373 token->mapping.push_back(rel);
2374 rel.to += t->append_tag(token->text);
2375 }
2376 token->text_type |= has_tokens;
2377 }
2378 while (m_offset != end && m_offset->get() != s_end)
2379 ++m_offset;
2380 }
2381 else if (element_traits::is_group(s_el_start->code)) {
2382 auto limit = m_offset;
2383 while (limit != end && limit->get() != s_el_start->end)
2384 ++limit;
2385 auto t = parse(limit,
2386 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2387 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2388 rel.from = s->interval.start;
2389 token->mapping.push_back(rel);
2390 rel.to += t->append_tag(token->text);
2391 token->text_type |= has_tokens;
2392 }
2393 }
2394 }
2395 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2396 rel.from = s->interval.start;
2397 token->mapping.push_back(rel);
2398 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2400 token_t::ending,
2401 m_source + s->interval.start, s->interval.size(),
2402 0,
2403 s.get()))),
2404 token->text);
2405 token->text_type |= has_tokens;
2406 ++m_offset;
2407 }
2408 else {
2409 // Declaration, instruction, (P)CDATA section, comment...
2410 rel.from = s->interval.start;
2411 token->mapping.push_back(rel);
2412 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2414 token_t::complete,
2415 m_source + s->interval.start, s->interval.size(),
2416 0,
2417 s.get()))),
2418 token->text);
2419 token->text_type |= has_tokens;
2420 ++m_offset;
2421 }
2422 }
2423
2424 return append_token(std::move(token));
2425 }
2426
// Body of parse_css(size_t start, size_t end).
// Walks m_source from `start` towards `end`, copying the CSS into a new complete text token
// and replacing every URL found by the import/URI matchers with a separate URL token.
// Returns the text token after it has been handed over to append_token().
2431 {
// section: whole matched construct; content: the URL part inside it (see the import/URI branch below).
2432 stdex::interval<size_t> section, content;
2433 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(
2435 token_t::complete,
2436 nullptr, 0,
2437 0,
2438 m_offset->get()));
2439
// Try each matcher at the current position; the first one that matches consumes input by advancing `start`.
2440 for (;;) {
// The next three branches (presumably CSS comment, CDO and CDC tokens - matcher definitions are not visible here)
// copy the text from `start` through the end of the match verbatim.
2441 if (m_css_comment.match(m_source, start, end)) {
2442 token->text.append(m_source + start, m_css_comment.interval.end - start);
2443 start = m_css_comment.interval.end;
2444 }
2445 else if (m_css_cdo.match(m_source, start, end)) {
2446 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2447 start = m_css_cdo.interval.end;
2448 }
2449 else if (m_css_cdc.match(m_source, start, end)) {
2450 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2451 start = m_css_cdc.interval.end;
2452 }
// Import or URI match: the comma expression stores the matcher's interval/content into section/content
// and evaluates to true, so the condition succeeds exactly when the corresponding match() did.
2453 else if (
2454 m_css_import.match(m_source, start, end) && (section = m_css_import.interval, content = m_css_import.content, true) ||
2455 m_css_uri.match(m_source, start, end) && (section = m_css_uri.interval, content = m_css_uri.content, true))
2456 {
2457 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(
2459 nullptr, 0,
2460 token_url_t::css,
2461 m_offset->get()));
// Store the CSS-unescaped URL in the URL token.
2462 css_unescape(t_url->url, m_source + content.start, content.size());
// Copy the text preceding the URL content, ...
2463 token->text.append(m_source + start, content.start - start);
// ... register the URL token and append its tag in place of the URL, ...
2464 append_token(std::move(t_url), token->text);
// ... then copy the remainder of the matched construct that follows the URL content.
2465 token->text.append(m_source + content.end, section.end - content.end);
2466 token->text_type |= has_tokens;
2467 start = section.end;
2468 }
// Fallback: copy whatever m_any_char matched verbatim.
2469 else if (m_any_char.match(m_source, start, end)) {
2470 token->text.append(m_source + start, m_any_char.interval.end - start);
2471 start = m_any_char.interval.end;
2472 }
// No matcher succeeded (presumably `start` has reached `end`): stop.
2473 else
2474 break;
2475 }
2476
// Hand the finished text token over to the token collection and return it.
2477 return append_token(std::move(token));
2478 }
2479
2480 protected:
2482 const stdex::sys_string m_url;
2483 const bool m_parse_frames;
2485 const _Elem* m_source;
2486 token_vector m_tokens;
2487 sequence_store::const_iterator m_offset;
2488
2489 // For detecting URLs in CSS
2497 };
2498 }
2499}
HTML comment.
Definition html.hpp:1383
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1392
HTML declaration.
Definition html.hpp:1365
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1375
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1376
HTML document.
Definition html.hpp:1432
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1767
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1683
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1752
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1758
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1768
void append(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML source code by chunks.
Definition html.hpp:1472
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1766
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1756
void assign(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML document source code.
Definition html.hpp:1695
std::vector< std::unique_ptr< entity< _Elem, _Traits, _Alloc > > > m_entities
Array of entities.
Definition html.hpp:1762
std::basic_string< _Elem, _Traits, _Alloc > m_source
Document HTML source code.
Definition html.hpp:1750
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1713
std::basic_string< _Elem, _Traits, _Alloc > replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1721
void clear()
Empties document.
Definition html.hpp:1451
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1755
const std::basic_string< _Elem, _Traits, _Alloc > & source() const
Returns document HTML source code.
Definition html.hpp:1705
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1751
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1757
Ending tag of an HTML element </...>
Definition html.hpp:1345
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1357
element_start * start
Corresponding starting tag.
Definition html.hpp:1358
element_t code
Element code.
Definition html.hpp:1356
Starting tag of an HTML element <...>
Definition html.hpp:1329
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1338
HTML element <.../>
Definition html.hpp:1150
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1319
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1320
element_t code
Element code.
Definition html.hpp:1318
HTML instruction.
Definition html.hpp:1399
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1408
HTML parser.
Definition html.hpp:1989
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2484
text_token< _Elem, _Traits, _Alloc > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2430
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2177
token_list::const_iterator end_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2110
static void link(std::basic_string< _Elem, _Traits, _Alloc > &source, const text_token< _Elem, _Traits, _Alloc > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2025
text_token< _Elem, _Traits, _Alloc > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2255
const _Elem * m_source
HTML source code.
Definition html.hpp:2485
token_vector m_tokens
HTML token storage.
Definition html.hpp:2486
text_token< _Elem, _Traits, _Alloc > * parse()
Parses HTML document.
Definition html.hpp:2005
const document< _Elem, _Traits, _Alloc > & m_document
Document being analyzed.
Definition html.hpp:2481
void make_absolute_url(std::basic_string< _Elem, _Traits, _Alloc > &rel)
Converts URL to absolute.
Definition html.hpp:2196
size_t append_token(std::unique_ptr< T > &&token, std::basic_string< _Elem, _Traits, _Alloc > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2238
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2209
const stdex::sys_string m_url
Absolute document URL.
Definition html.hpp:2482
const bool m_parse_frames
Parse frames.
Definition html.hpp:2483
static void start_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2092
T * append_token(std::unique_ptr< T > &&token)
Adds token to the collection.
Definition html.hpp:2220
void append_inserted_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2153
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2487
Base class for HTML sequences.
Definition html.hpp:1131
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1134
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1133
sequence * parent
Parent sequence.
Definition html.hpp:1135
Token representing start HTML tag.
Definition html.hpp:1922
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:1940
std::basic_string< _Elem, _Traits, _Alloc > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:1939
Token representing part of HTML text.
Definition html.hpp:1897
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:1914
std::basic_string< _Elem, _Traits, _Alloc > text
Token text.
Definition html.hpp:1912
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:1913
HTML token base class.
Definition html.hpp:1806
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:1875
size_t append_tag(std::basic_string< char, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1828
uintptr_t data
Any user-supplied data.
Definition html.hpp:1876
token_t type
Token type.
Definition html.hpp:1874
size_t append_tag(std::basic_string< wchar_t, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1844
HTML token representing a URL.
Definition html.hpp:1957
token_url_t encoding
URL encoding.
Definition html.hpp:1972
std::basic_string< _Elem, _Traits, _Alloc > url
URL.
Definition html.hpp:1971
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7833
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7748
std::vector< html_attribute > attributes
tag attributes
Definition parser.hpp:8355
html_sequence_t type
tag type
Definition parser.hpp:8353
stdex::interval< size_t > name
tag name position in source
Definition parser.hpp:8354
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:172
Test for given string.
Definition parser.hpp:818
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:515
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:920
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:834
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:752
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:844
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:640
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:803
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:861
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:771
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1050
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:788
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1105
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:686
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:879
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:737
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:904
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:719
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:662
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:950
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:521
HTML entity.
Definition html.hpp:1416
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1417
std::basic_string< _Elem, _Traits, _Alloc > value
Entity value.
Definition html.hpp:1418
Inserted HTML token.
Definition html.hpp:1978
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:1982
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:1980
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:1981
token * token
Points to the token.
Definition html.hpp:1979
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
mapping()
Constructs a zero to zero mapping.
Definition mapping.hpp:24
Tag attribute.
Definition parser.hpp:8127
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8129