stdex
Additional custom algorithms and algorithms not covered by Standard C++
Loading...
Searching...
No Matches
html.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2016-2023 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "exception.hpp"
10#include "interval.hpp"
11#include "mapping.hpp"
12#include "parser.hpp"
13#include "progress.hpp"
14#include "sgml.hpp"
15#include "string.hpp"
16#include "system.hpp"
17#include "unicode.hpp"
18#include <exception>
19#include <list>
20#include <map>
21#include <memory>
22#include <stdexcept>
23#include <vector>
24
25#ifdef _WIN32
26#undef small
27#endif
28
29namespace stdex
30{
31 namespace html
32 {
40 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
41 inline void escape(
42 _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
43 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
44 {
45 _Assume_(src || !num_chars);
46 for (size_t i = 0; i < num_chars && src[i]; ++i) {
47 switch (src[i]) {
48 case '&': dst += "&amp;"; break;
49 case ';': dst += "&semi;"; break;
50 case '\"': dst += "&quot;"; break;
51 case '\'': dst += "&#x27;"; break;
52 case '<': dst += "&lt;"; break;
53 case '>': dst += "&gt;"; break;
54 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
55 default: dst += src[i]; break;
56 }
57 }
58 }
59
67 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
68 inline void escape(
69 _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
70 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
71 {
72 _Assume_(src || !num_chars);
73 for (size_t i = 0; i < num_chars && src[i]; ++i) {
74 switch (src[i]) {
75 case L'&': dst += L"&amp;"; break;
76 case L';': dst += L"&semi;"; break;
77 case L'\"': dst += L"&quot;"; break;
78 case L'\'': dst += L"&#x27;"; break;
79 case L'<': dst += L"&lt;"; break;
80 case L'>': dst += L"&gt;"; break;
81 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
82 default: dst += src[i]; break;
83 }
84 }
85 }
86
93 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
94 inline void escape_min(_Inout_ std::basic_string<char, _Traits, _Alloc>& dst, _In_ char chr)
95 {
96 switch (chr) {
97 case '&': dst += "&amp;"; break;
98 case '<': dst += "&lt;"; break;
99 case '>': dst += "&gt;"; break;
100 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
101 default: dst += chr; break;
102 }
103 }
104
111 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
112 inline void escape_min(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst, _In_ wchar_t chr)
113 {
114 switch (chr) {
115 case L'&': dst += L"&amp;"; break;
116 case L'<': dst += L"&lt;"; break;
117 case L'>': dst += L"&gt;"; break;
118 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
119 default: dst += chr; break;
120 }
121 }
122
130 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
131 inline void escape_min(
132 _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
133 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
134 {
135 _Assume_(src || !num_chars);
136 for (size_t i = 0; i < num_chars && src[i]; ++i) {
137 switch (src[i]) {
138 case '&': dst += "&amp;"; break;
139 case '<': dst += "&lt;"; break;
140 case '>': dst += "&gt;"; break;
141 case 0x00a0: dst += "&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
142 default: dst += src[i]; break;
143 }
144 }
145 }
146
154 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
155 inline void escape_min(
156 _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
157 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
158 {
159 _Assume_(src || !num_chars);
160 for (size_t i = 0; i < num_chars && src[i]; ++i) {
161 switch (src[i]) {
162 case L'&': dst += L"&amp;"; break;
163 case L'<': dst += L"&lt;"; break;
164 case L'>': dst += L"&gt;"; break;
165 case L'\u00a0': dst += L"&nbsp;"; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space.
166 default: dst += src[i]; break;
167 }
168 }
169 }
170
178 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
179 inline void url_unescape(
180 _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
181 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
182 {
183 _Assume_(src || !num_chars);
184 for (size_t i = 0; i < num_chars && src[i];) {
185 switch (src[i]) {
186 case '+':
187 dst += ' '; i++;
188 break;
189
190 case '%': {
191 i++;
192
193 uint8_t chr;
194 if ('0' <= src[i] && src[i] <= '9') chr = (src[i++] - '0') << 4;
195 else if ('A' <= src[i] && src[i] <= 'F') chr = (src[i++] - 'A' + 10) << 4;
196 else if ('a' <= src[i] && src[i] <= 'f') chr = (src[i++] - 'a' + 10) << 4;
197 else { dst += '%'; continue; }
198 if ('0' <= src[i] && src[i] <= '9') chr |= (src[i++] - '0');
199 else if ('A' <= src[i] && src[i] <= 'F') chr |= (src[i++] - 'A' + 10);
200 else if ('a' <= src[i] && src[i] <= 'f') chr |= (src[i++] - 'a' + 10);
201 else { dst += '%'; dst += src[i - 1]; continue; }
202
203 dst += static_cast<char>(chr);
204 break;
205 }
206
207 default:
208 dst += src[i++];
209 }
210 }
211 }
212
220 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
221 inline void url_escape(
222 _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
223 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
224 {
225 _Assume_(src || !num_chars);
226 for (size_t i = 0; i < num_chars && src[i]; ++i) {
227 switch (src[i]) {
228 case ' ': dst += "+"; break;
229 case '<': dst += "%3C"; break;
230 case '>': dst += "%3E"; break;
231 case '#': dst += "%23"; break;
232 case '%': dst += "%25"; break;
233 case '{': dst += "%7B"; break;
234 case '}': dst += "%7D"; break;
235 case '|': dst += "%7C"; break;
236 case '\\': dst += "%5C"; break;
237 case '^': dst += "%5E"; break;
238 case '~': dst += "%7E"; break;
239 case '[': dst += "%5B"; break;
240 case ']': dst += "%5D"; break;
241 case '`': dst += "%60"; break;
242 case ';': dst += "%3B"; break;
243 case '/': dst += "%2F"; break;
244 case '?': dst += "%3F"; break;
245 case ':': dst += "%3A"; break;
246 case '@': dst += "%40"; break;
247 case '=': dst += "%3D"; break;
248 case '&': dst += "%26"; break;
249 case '$': dst += "%24"; break;
250 default:
251 if (0x20 < static_cast<uint8_t>(src[i]) && static_cast<uint8_t>(src[i]) < 0x7f)
252 dst += src[i];
253 else {
254 dst += '%';
255 uint8_t n = (static_cast<uint8_t>(src[i]) & 0xf0) >> 4;
256 dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
257 n = ((uint8_t)src[i] & 0x0f);
258 dst += n < 10 ? static_cast<char>('0' + n) : static_cast<char>('A' + n - 10);
259 }
260 }
261 }
262 }
263
271 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
272 inline void css_unescape(
273 _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
274 _In_reads_or_z_opt_(num_chars) const _Elem* src, _In_ size_t num_chars = SIZE_MAX)
275 {
276 _Assume_(src || !num_chars);
277 for (size_t i = 0; i < num_chars && src[i];) {
278 if (src[i] != '\\')
279 dst += src[i++];
280 else if (i + 1 < num_chars) {
281 i++;
282
283 switch (src[i]) {
284 // Classic escapes
285 case 'n': dst += '\n'; i++; break;
286 case 'r': dst += '\r'; i++; break;
287 case 't': dst += '\t'; i++; break;
288
289 // `\` at the end of the line
290 case '\n': i++; break;
291
292 // `\nnnn` escape
293 case '0':
294 case '1':
295 case '2':
296 case '3':
297 case '4':
298 case '5':
299 case '6':
300 case '7':
301 case '8':
302 case '9':
303 case 'A': case 'a':
304 case 'B': case 'b':
305 case 'C': case 'c':
306 case 'D': case 'd':
307 case 'E': case 'e':
308 case 'F': case 'f': {
309 wchar_t chr = 0;
310 size_t end = std::min(num_chars, i + 6);
311
312 for (; i < end; ++i) {
313 if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
314 else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
315 else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
316 else break;
317 }
318
319 dst += static_cast<_Elem>(chr);
320
321 if (i < end && src[i] == ' ') {
322 // Skip space after `\nnnn`.
323 i++;
324 }
325 break;
326 }
327
328 default: dst += src[i++];
329 }
330 }
331 }
332 }
333
341 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
342 inline void css_escape(
343 _Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
344 _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
345 {
346 _Assume_(src || !num_chars);
347 for (size_t i = 0; i < num_chars && src[i]; ++i) {
348 switch (src[i]) {
349 case '\\': dst += "\\\\"; break;
350 case '\n': dst += "\\n"; break;
351 case '\r': dst += "\\r"; break;
352 case '\t': dst += "\\t"; break;
353 case '\"': dst += "\\\""; break;
354 case '\'': dst += "\\'"; break;
355 default: dst += src[i]; break;
356 }
357 }
358 }
359
367 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
368 inline void css_escape(
369 _Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
370 _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
371 {
372 _Assume_(src || !num_chars);
373 for (size_t i = 0; i < num_chars && src[i]; ++i) {
374 switch (src[i]) {
375 case L'\\': dst += L"\\\\"; break;
376 case L'\n': dst += L"\\n"; break;
377 case L'\r': dst += L"\\r"; break;
378 case L'\t': dst += L"\\t"; break;
379 case L'\"': dst += L"\\\""; break;
380 case L'\'': dst += L"\\'"; break;
381 default: dst += src[i]; break;
382 }
383 }
384 }
385
///
/// HTML element codes.
///
/// Named elements are numbered consecutively in alphabetical order starting
/// with `a` = 1. Code in this file indexes and range-checks on that order
/// (`a`..`xmp`), so new elements must keep the sequence contiguous.
///
enum class element_t : int {
    // Special, non-element codes
    CDATA = -3,
    PCDATA = -2,
    unknown = -1,

    empty = 0,

    // Elements in alphabetical order
    a,
    abbr,
    acronym,
    address,
    applet,
    area,
    b,
    base,
    basefont,
    bdo,
    bgsound,    // Microsoft Specific
    big,
    blink,      // Microsoft Specific
    blockquote,
    body,
    br,
    button,
    caption,
    center,
    cite,
    code,
    col,
    colgroup,
    comment,    // Microsoft Specific
    dd,
    del,
    dfn,
    dir,
    div,
    dl,
    dt,
    em,
    embed,      // Microsoft Specific
    fieldset,
    font,
    form,
    frame,
    frameset,
    h1,
    h2,
    h3,
    h4,
    h5,
    h6,
    head,
    hr,
    html,
    i,
    iframe,
    img,
    input,
    ins,
    isindex,
    kbd,
    label,
    legend,
    li,
    link,
    listing,    // Microsoft Specific
    map,
    marquee,    // Microsoft Specific
    menu,
    meta,
    nextid,     // Microsoft Specific
    nobr,       // Microsoft Specific
    noembed,    // Microsoft Specific
    noframes,
    noscript,
    object,
    ol,
    optgroup,
    option,
    p,
    param,
    plaintext,  // Microsoft Specific
    pre,
    q,
    rt,         // Microsoft Specific
    ruby,       // Microsoft Specific
    s,
    samp,
    script,
    select,
    small,
    span,
    strike,
    strong,
    style,
    sub,
    sup,
    table,
    tbody,
    td,
    textarea,
    tfoot,
    th,
    thead,
    title,
    tr,
    tt,
    u,
    ul,
    var,
    wbr,        // Microsoft Specific
    xmp,        // Microsoft Specific
};
501
///
/// How an element's extent is delimited; values are returned by `span()`.
///
enum class element_span_t : int {
    needs_end = 0,      ///< First enumerator; also what `span()` returns for codes outside `a`..`xmp`
    end_optional = 1,
    immediate = 2,
};
510
515 {
521 static inline element_span_t span(_In_ element_t code)
522 {
523 static element_span_t lookup[] = {
524 element_span_t::needs_end, // a
525 element_span_t::needs_end, // abbr
526 element_span_t::needs_end, // acronym
527 element_span_t::needs_end, // address
528 element_span_t::needs_end, // applet
529 element_span_t::immediate, // area
530 element_span_t::needs_end, // b
531 element_span_t::immediate, // base
532 element_span_t::immediate, // basefont
533 element_span_t::needs_end, // bdo
534 element_span_t::immediate, // bgsound
535 element_span_t::needs_end, // big
536 element_span_t::needs_end, // blink
537 element_span_t::needs_end, // blockquote
538 element_span_t::end_optional, // body
539 element_span_t::immediate, // br
540 element_span_t::needs_end, // button
541 element_span_t::needs_end, // caption
542 element_span_t::needs_end, // center
543 element_span_t::needs_end, // cite
544 element_span_t::needs_end, // code
545 element_span_t::immediate, // col
546 element_span_t::end_optional, // colgroup
547 element_span_t::needs_end, // comment
548 element_span_t::end_optional, // dd
549 element_span_t::needs_end, // del
550 element_span_t::needs_end, // dfn
551 element_span_t::needs_end, // dir
552 element_span_t::needs_end, // div
553 element_span_t::needs_end, // dl
554 element_span_t::end_optional, // dt
555 element_span_t::needs_end, // em
556 element_span_t::immediate, // embed
557 element_span_t::needs_end, // fieldset
558 element_span_t::needs_end, // font
559 element_span_t::needs_end, // form
560 element_span_t::immediate, // frame
561 element_span_t::needs_end, // frameset
562 element_span_t::needs_end, // h1
563 element_span_t::needs_end, // h2
564 element_span_t::needs_end, // h3
565 element_span_t::needs_end, // h4
566 element_span_t::needs_end, // h5
567 element_span_t::needs_end, // h6
568 element_span_t::end_optional, // head
569 element_span_t::immediate, // hr
570 element_span_t::end_optional, // html
571 element_span_t::needs_end, // i
572 element_span_t::needs_end, // iframe
573 element_span_t::immediate, // img
574 element_span_t::immediate, // input
575 element_span_t::needs_end, // ins
576 element_span_t::immediate, // isindex
577 element_span_t::needs_end, // kbd
578 element_span_t::needs_end, // label
579 element_span_t::needs_end, // legend
580 element_span_t::end_optional, // li
581 element_span_t::immediate, // link
582 element_span_t::needs_end, // listing
583 element_span_t::needs_end, // map
584 element_span_t::needs_end, // marquee
585 element_span_t::needs_end, // menu
586 element_span_t::immediate, // meta
587 element_span_t::immediate, // nextid
588 element_span_t::needs_end, // nobr
589 element_span_t::needs_end, // noembed
590 element_span_t::needs_end, // noframes
591 element_span_t::needs_end, // noscript
592 element_span_t::needs_end, // object
593 element_span_t::needs_end, // ol
594 element_span_t::needs_end, // optgroup
595 element_span_t::end_optional, // option
596 element_span_t::end_optional, // p
597 element_span_t::immediate, // param
598 element_span_t::end_optional, // plaintext
599 element_span_t::needs_end, // pre
600 element_span_t::needs_end, // q
601 element_span_t::immediate, // rt
602 element_span_t::needs_end, // ruby
603 element_span_t::needs_end, // s
604 element_span_t::needs_end, // samp
605 element_span_t::needs_end, // script
606 element_span_t::needs_end, // select
607 element_span_t::needs_end, // small
608 element_span_t::needs_end, // span
609 element_span_t::needs_end, // strike
610 element_span_t::needs_end, // strong
611 element_span_t::needs_end, // style
612 element_span_t::needs_end, // sub
613 element_span_t::needs_end, // sup
614 element_span_t::needs_end, // table
615 element_span_t::end_optional, // tbody
616 element_span_t::end_optional, // td
617 element_span_t::needs_end, // textarea
618 element_span_t::end_optional, // tfoot
619 element_span_t::end_optional, // th
620 element_span_t::end_optional, // thead
621 element_span_t::needs_end, // title
622 element_span_t::end_optional, // tr
623 element_span_t::needs_end, // tt
624 element_span_t::needs_end, // u
625 element_span_t::needs_end, // ul
626 element_span_t::needs_end, // var
627 element_span_t::immediate, // wbr
628 element_span_t::needs_end, // xmp
629 };
630 return element_t::a <= code && code <= element_t::xmp ?
631 lookup[static_cast<size_t>(code) - static_cast<size_t>(element_t::a)] :
632 element_span_t::needs_end;
633 }
634
640 static inline bool is_fontstyle(_In_ element_t code)
641 {
642 switch (code) {
643 case element_t::tt:
644 case element_t::i:
645 case element_t::b:
646 case element_t::u:
647 case element_t::s:
648 case element_t::strike:
649 case element_t::blink:
650 case element_t::big:
651 case element_t::small:
652 return true;
653 };
654 return false;
655 }
656
662 static inline bool is_phrase(_In_ element_t code)
663 {
664 switch (code) {
665 case element_t::em:
666 case element_t::strong:
667 case element_t::dfn:
668 case element_t::code:
669 case element_t::samp:
670 case element_t::kbd:
671 case element_t::var:
672 case element_t::cite:
673 case element_t::abbr:
674 case element_t::acronym:
675 case element_t::xmp:
676 return true;
677 };
678 return false;
679 }
680
686 static inline bool is_special(_In_ element_t code)
687 {
688 switch (code) {
689 case element_t::a:
690 case element_t::img:
691 case element_t::applet:
692 case element_t::object:
693 case element_t::embed:
694 case element_t::font:
695 case element_t::basefont:
696 case element_t::br:
697 case element_t::wbr:
698 case element_t::rt:
699 case element_t::script:
700 case element_t::map:
701 case element_t::q:
702 case element_t::sub:
703 case element_t::sup:
704 case element_t::ruby:
705 case element_t::span:
706 case element_t::bdo:
707 case element_t::iframe:
708 case element_t::nobr:
709 return true;
710 };
711 return false;
712 }
713
719 static inline bool is_formctrl(_In_ element_t code)
720 {
721 switch (code) {
722 case element_t::input:
723 case element_t::select:
724 case element_t::textarea:
725 case element_t::label:
726 case element_t::button:
727 return true;
728 };
729 return false;
730 }
731
737 static inline bool is_inline(_In_ element_t code)
738 {
739 return
740 code == element_t::PCDATA ||
741 is_fontstyle(code) ||
742 is_phrase(code) ||
743 is_special(code) ||
744 is_formctrl(code);
745 }
746
752 static inline bool is_heading(_In_ element_t code)
753 {
754 switch (code) {
755 case element_t::h1:
756 case element_t::h2:
757 case element_t::h3:
758 case element_t::h4:
759 case element_t::h5:
760 case element_t::h6:
761 return true;
762 };
763 return false;
764 }
765
771 static inline bool is_list(_In_ element_t code)
772 {
773 switch (code) {
774 case element_t::ul:
775 case element_t::ol:
776 case element_t::dir:
777 case element_t::menu:
778 return true;
779 };
780 return false;
781 }
782
788 static inline bool is_preformatted(_In_ element_t code)
789 {
790 switch (code) {
791 case element_t::pre:
792 case element_t::listing:
793 return true;
794 }
795 return false;
796 }
797
803 static inline bool is_block(_In_ element_t code)
804 {
805 if (is_heading(code) ||
806 is_list(code) ||
807 is_preformatted(code)) return true;
808 switch (code) {
809 case element_t::p:
810 case element_t::dl:
811 case element_t::div:
812 case element_t::center:
813 case element_t::marquee:
814 case element_t::noscript:
815 case element_t::noframes:
816 case element_t::noembed:
817 case element_t::blockquote:
818 case element_t::form:
819 case element_t::isindex:
820 case element_t::hr:
821 case element_t::table:
822 case element_t::fieldset:
823 case element_t::address:
824 return true;
825 };
826 return false;
827 }
828
834 static inline bool is_flow(_In_ element_t code)
835 {
836 return is_block(code) || is_inline(code);
837 }
838
844 static inline bool is_head_content(_In_ element_t code)
845 {
846 switch (code) {
847 case element_t::title:
848 case element_t::isindex:
849 case element_t::base:
850 case element_t::nextid:
851 return true;
852 };
853 return false;
854 }
855
861 static inline bool is_head_misc(_In_ element_t code)
862 {
863 switch (code) {
864 case element_t::script:
865 case element_t::style:
866 case element_t::meta:
867 case element_t::link:
868 case element_t::object:
869 return true;
870 };
871 return false;
872 }
873
879 static inline bool is_pre_exclusion(_In_ element_t code)
880 {
881 switch (code) {
882 case element_t::img:
883 case element_t::object:
884 case element_t::applet:
885 case element_t::embed:
886 case element_t::big:
887 case element_t::small:
888 case element_t::sub:
889 case element_t::sup:
890 case element_t::ruby:
891 case element_t::font:
892 case element_t::basefont:
893 case element_t::nobr:
894 return true;
895 };
896 return false;
897 }
898
904 static inline bool is_html_content(_In_ element_t code)
905 {
906 switch (code) {
907 case element_t::head:
908 case element_t::body:
909 case element_t::frameset:
910 return true;
911 };
912 return false;
913 }
914
920 static inline bool is_group(_In_ element_t code)
921 {
922 if (is_block(code) ||
923 is_html_content(code) ||
924 is_head_content(code)) return true;
925 switch (code) {
926 case element_t::col:
927 case element_t::colgroup:
928 case element_t::dd:
929 case element_t::dir:
930 case element_t::dt:
931 case element_t::frame:
932 case element_t::iframe:
933 case element_t::legend:
934 case element_t::td:
935 case element_t::th:
936 case element_t::tr:
937 return true;
938 };
939 return false;
940 }
941
950 static inline bool may_contain(_In_ element_t parent, _In_ element_t child)
951 {
952 if (child == element_t::unknown || child == element_t::comment)
953 return true;
954 if (is_fontstyle(parent) || is_phrase(parent))
955 return is_inline(child);
956 if (is_heading(parent))
957 return is_inline(child);
958
959 switch (parent) {
960 case element_t::a: return is_inline(child) && child != element_t::a;
961 case element_t::address: return is_inline(child) || child == element_t::p;
962 case element_t::applet: return is_flow(child) || child == element_t::param;
963 case element_t::area: return false;
964 case element_t::base: return false;
965 case element_t::basefont: return false;
966 case element_t::bdo: return is_inline(child);
967 case element_t::blockquote: return is_flow(child);
968 case element_t::body: return is_flow(child) || child == element_t::ins || child == element_t::del;
969 case element_t::br: return false;
970 case element_t::button: return is_flow(child) && !is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
971 case element_t::caption: return is_inline(child);
972 case element_t::center: return is_flow(child);
973 case element_t::col: return false;
974 case element_t::colgroup: return child == element_t::col;
975 case element_t::comment: return child == element_t::CDATA;
976 case element_t::dd: return is_flow(child);
977 case element_t::del: return is_flow(child);
978 case element_t::dir: return child == element_t::li;
979 case element_t::div: return is_flow(child);
980 case element_t::dl: return child == element_t::dt || child == element_t::dd;
981 case element_t::dt: return is_inline(child);
982 case element_t::embed: return is_flow(child) || child == element_t::param;
983 case element_t::fieldset: return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
984 case element_t::font: return is_inline(child);
985 case element_t::form: return is_flow(child) && child != element_t::form;
986 case element_t::frame: return false;
987 case element_t::frameset: return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
988 case element_t::head: return is_head_content(child) || is_head_misc(child);
989 case element_t::hr: return false;
990 case element_t::html: return is_html_content(child);
991 case element_t::iframe: return is_flow(child);
992 case element_t::img: return false;
993 case element_t::input: return false;
994 case element_t::ins: return is_flow(child);
995 case element_t::isindex: return false;
996 case element_t::label: return is_inline(child) && child != element_t::label;
997 case element_t::legend: return is_inline(child);
998 case element_t::li: return is_flow(child);
999 case element_t::link: return false;
1000 case element_t::listing: return child == element_t::CDATA;
1001 case element_t::map: return is_block(child) || child == element_t::area;
1002 case element_t::marquee: return is_flow(child);
1003 case element_t::menu: return child == element_t::li;
1004 case element_t::meta: return false;
1005 case element_t::nobr: return is_inline(child) || child == element_t::wbr;
1006 case element_t::noframes: return (is_flow(child) || child == element_t::body) && child != element_t::noframes;
1007 case element_t::noscript: return is_flow(child);
1008 case element_t::noembed: return is_flow(child);
1009 case element_t::object: return is_flow(child) || child == element_t::param;
1010 case element_t::ol: return child == element_t::li;
1011 case element_t::optgroup: return child == element_t::option;
1012 case element_t::option: return child == element_t::PCDATA;
1013 case element_t::p: return is_inline(child);
1014 case element_t::param: return false;
1015 case element_t::plaintext: return is_flow(child);
1016 case element_t::pre: return is_inline(child) && !is_pre_exclusion(child);
1017 case element_t::q: return is_inline(child);
1018 case element_t::rt: return false;
1019 case element_t::ruby: return is_inline(child);
1020 case element_t::script: return child == element_t::CDATA;
1021 case element_t::select: return child == element_t::optgroup || child == element_t::option;
1022 case element_t::span: return is_inline(child);
1023 case element_t::style: return child == element_t::CDATA;
1024 case element_t::sub: return is_inline(child);
1025 case element_t::sup: return is_inline(child);
1026 case element_t::table: return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1027 case element_t::tbody: return child == element_t::tr;
1028 case element_t::td: return is_flow(child);
1029 case element_t::textarea: return child == element_t::PCDATA;
1030 case element_t::tfoot: return child == element_t::tr;
1031 case element_t::th: return is_flow(child);
1032 case element_t::thead: return child == element_t::tr;
1033 case element_t::title: return child == element_t::PCDATA;
1034 case element_t::tr: return child == element_t::td || child == element_t::th;
1035 case element_t::ul: return child == element_t::li;
1036 case element_t::wbr: return false;
1037 case element_t::unknown: return true;
1038 }
1039 return false;
1040 }
1041
1049 template <class T>
1050 static inline bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
1051 {
1052 _Assume_(attr_name || !num_chars);
1053 switch (code) {
1054 case element_t::a: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX, stdex::std_locale_C);
1055 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX, stdex::std_locale_C) ||
1056 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX, stdex::std_locale_C) ||
1057 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1058 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX, stdex::std_locale_C);
1059 case element_t::base: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX, stdex::std_locale_C);
1060 case element_t::bgsound: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1061 case element_t::blockquote: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX, stdex::std_locale_C);
1062 case element_t::body: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX, stdex::std_locale_C);
1063 case element_t::comment: return !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX, stdex::std_locale_C);
1064 case element_t::del: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX, stdex::std_locale_C);
1065 case element_t::embed: return !stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX, stdex::std_locale_C) ||
1066 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1067 case element_t::form: return !stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX, stdex::std_locale_C);
1068 case element_t::frame: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX, stdex::std_locale_C) ||
1069 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1070 case element_t::head: return !stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX, stdex::std_locale_C);
1071 case element_t::iframe: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX, stdex::std_locale_C) ||
1072 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1073 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX, stdex::std_locale_C) ||
1074 !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX, stdex::std_locale_C) ||
1075 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C) ||
1076 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX, stdex::std_locale_C);
1077 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX, stdex::std_locale_C) ||
1078 !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C) ||
1079 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX, stdex::std_locale_C);
1080 case element_t::ins: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX, stdex::std_locale_C);
1081 case element_t::link: return !stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX, stdex::std_locale_C);
1082 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX, stdex::std_locale_C) ||
1083 !stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX, stdex::std_locale_C) ||
1084 !stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX, stdex::std_locale_C) ||
1085 !stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX, stdex::std_locale_C) ||
1086 !stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX, stdex::std_locale_C) ||
1087 !stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX, stdex::std_locale_C);
1088 case element_t::q: return !stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX, stdex::std_locale_C);
1089 case element_t::script: return !stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX, stdex::std_locale_C);
1090 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX, stdex::std_locale_C);
1091 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX, stdex::std_locale_C);
1092 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX, stdex::std_locale_C);
1093 }
1094 return false;
1095 }
1096
1104 template <class T>
1105 static inline bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
1106 {
1107 _Assume_(attr_name || !num_chars);
1108 if (!stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX, stdex::std_locale_C))
1109 return true;
1110 switch (code) {
1111 case element_t::applet: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX, stdex::std_locale_C);
1112 case element_t::area: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX, stdex::std_locale_C);
1113 case element_t::img: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX, stdex::std_locale_C);
1114 case element_t::input: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX, stdex::std_locale_C);
1115 case element_t::object: return !stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX, stdex::std_locale_C);
1116 case element_t::table: return !stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX, stdex::std_locale_C);
1117 case element_t::td: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX, stdex::std_locale_C);
1118 case element_t::th: return !stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX, stdex::std_locale_C);
1119 }
1120 return false;
1121 }
1122 };
1123
1124 class sequence;
1125 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1126
1131 {
1132 public:
1133 stdex::parser::html_sequence_t type;
1136
1137 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
1138 type(_type),
1139 interval(start, end),
1140 parent(_parent)
1141 {}
1142
1143 virtual ~sequence() {} // make polymorphic
1144 };
1145
1149 class element : public sequence
1150 {
1151 public:
1152 template <class T>
1153 inline element(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
1154 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1155 code(element_code(src + tag.name.start, tag.name.size())),
1156 name(std::move(tag.name)),
1157 attributes(std::move(tag.attributes))
1158 {}
1159
1160 template <class T>
1161 static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
1162 {
1163 static const struct {
1164 const char* name;
1165 element_t code;
1166 } mapping[] = {
1167 { "a", element_t::a, },
1168 { "abbr", element_t::abbr, },
1169 { "acronym", element_t::acronym, },
1170 { "address", element_t::address, },
1171 { "applet", element_t::applet, },
1172 { "area", element_t::area, },
1173 { "b", element_t::b, },
1174 { "base", element_t::base, },
1175 { "basefont", element_t::basefont, },
1176 { "bdo", element_t::bdo, },
1177 { "bgsound", element_t::bgsound, },
1178 { "big", element_t::big, },
1179 { "blink", element_t::blink, },
1180 { "blockquote", element_t::blockquote, },
1181 { "body", element_t::body, },
1182 { "br", element_t::br, },
1183 { "button", element_t::button, },
1184 { "caption", element_t::caption, },
1185 { "center", element_t::center, },
1186 { "cite", element_t::cite, },
1187 { "code", element_t::code, },
1188 { "col", element_t::col, },
1189 { "colgroup", element_t::colgroup, },
1190 { "comment", element_t::comment, },
1191 { "dd", element_t::dd, },
1192 { "del", element_t::del, },
1193 { "dfn", element_t::dfn, },
1194 { "dir", element_t::dir, },
1195 { "div", element_t::div, },
1196 { "dl", element_t::dl, },
1197 { "dt", element_t::dt, },
1198 { "em", element_t::em, },
1199 { "embed", element_t::embed, },
1200 { "fieldset", element_t::fieldset, },
1201 { "font", element_t::font, },
1202 { "form", element_t::form, },
1203 { "frame", element_t::frame, },
1204 { "frameset", element_t::frameset, },
1205 { "h1", element_t::h1, },
1206 { "h2", element_t::h2, },
1207 { "h3", element_t::h3, },
1208 { "h4", element_t::h4, },
1209 { "h5", element_t::h5, },
1210 { "h6", element_t::h6, },
1211 { "head", element_t::head, },
1212 { "hr", element_t::hr, },
1213 { "html", element_t::html, },
1214 { "i", element_t::i, },
1215 { "iframe", element_t::iframe, },
1216 { "img", element_t::img, },
1217 { "input", element_t::input, },
1218 { "ins", element_t::ins, },
1219 { "isindex", element_t::isindex, },
1220 { "kbd", element_t::kbd, },
1221 { "label", element_t::label, },
1222 { "legend", element_t::legend, },
1223 { "li", element_t::li, },
1224 { "link", element_t::link, },
1225 { "listing", element_t::listing, },
1226 { "map", element_t::map, },
1227 { "marquee", element_t::marquee, },
1228 { "menu", element_t::menu, },
1229 { "meta", element_t::meta, },
1230 { "nextid", element_t::nextid, },
1231 { "nobr", element_t::nobr, },
1232 { "noembed", element_t::noembed, },
1233 { "noframes", element_t::noframes, },
1234 { "noscript", element_t::noscript, },
1235 { "object", element_t::object, },
1236 { "ol", element_t::ol, },
1237 { "optgroup", element_t::optgroup, },
1238 { "option", element_t::option, },
1239 { "p", element_t::p, },
1240 { "param", element_t::param, },
1241 { "plaintext", element_t::plaintext, },
1242 { "pre", element_t::pre, },
1243 { "q", element_t::q, },
1244 { "rt", element_t::rt, },
1245 { "ruby", element_t::ruby, },
1246 { "s", element_t::s, },
1247 { "samp", element_t::samp, },
1248 { "script", element_t::script, },
1249 { "select", element_t::select, },
1250 { "small", element_t::small, },
1251 { "span", element_t::span, },
1252 { "strike", element_t::strike, },
1253 { "strong", element_t::strong, },
1254 { "style", element_t::style, },
1255 { "sub", element_t::sub, },
1256 { "sup", element_t::sup, },
1257 { "table", element_t::table, },
1258 { "tbody", element_t::tbody, },
1259 { "td", element_t::td, },
1260 { "textarea", element_t::textarea, },
1261 { "tfoot", element_t::tfoot, },
1262 { "th", element_t::th, },
1263 { "thead", element_t::thead, },
1264 { "title", element_t::title, },
1265 { "tr", element_t::tr, },
1266 { "tt", element_t::tt, },
1267 { "u", element_t::u, },
1268 { "ul", element_t::ul, },
1269 { "var", element_t::var, },
1270 { "wbr", element_t::wbr, },
1271 { "xmp", element_t::xmp, },
1272 };
1273#ifdef _DEBUG
1274 // The mapping table MUST be sorted and all names in lowercase.
1275 for (size_t i = 1; i < _countof(mapping); i++)
1276 _Assume_(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
1277 const auto& ctype = std::use_facet<std::ctype<char>>(stdex::std_locale_C);
1278 for (size_t i = 0; i < _countof(mapping); i++) {
1279 for (size_t j = 0; mapping[i].name[j]; j++)
1280 _Assume_(ctype.is(ctype.lower | ctype.digit, mapping[i].name[j]));
1281 }
1282#endif
1283 const auto& ctypeT = std::use_facet<std::ctype<T>>(stdex::std_locale_C);
1284 for (size_t i = 0, j = _countof(mapping); i < j; ) {
1285 size_t m = (i + j) / 2;
1286 int r = 0;
1287 for (size_t i1 = 0, i2 = 0;;) {
1288 if (!mapping[m].name[i1]) {
1289 r = i2 >= num_chars || !name[i2] ? 0 : -1;
1290 break;
1291 }
1292 if (i2 >= num_chars || !name[i2]) {
1293 r = 1;
1294 break;
1295 }
1296
1297 auto chr = static_cast<char>(ctypeT.tolower(name[i2++]));
1298 if (mapping[m].name[i1] > chr) {
1299 r = 1;
1300 break;
1301 }
1302 if (mapping[m].name[i1] < chr) {
1303 r = -1;
1304 break;
1305 }
1306 i1++;
1307 }
1308
1309 if (r < 0)
1310 i = m + 1;
1311 else if (r > 0)
1312 j = m;
1313 else
1314 return mapping[m].code;
1315 }
1316 return element_t::unknown;
1317 }
1318
1319 public:
1320 element_t code;
1322 std::vector<stdex::parser::html_attribute> attributes;
1323 };
1324
1325 class element_end;
1326
1330 class element_start : public element
1331 {
1332 public:
1333 template <class T>
1334 inline element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
1335 element(std::move(tag), src, parent),
1336 end(_end)
1337 {}
1338
1339 public:
1341 };
1342
1346 class element_end : public sequence
1347 {
1348 public:
1349 template <class T>
1350 inline element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
1351 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1352 code(element::element_code(src + tag.name.start, tag.name.size())),
1353 name(std::move(tag.name)),
1354 start(_start)
1355 {}
1356
1357 public:
1358 element_t code;
1361 };
1362
1366 class declaration : public sequence
1367 {
1368 public:
1369 template <class T>
1370 inline declaration(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1371 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1372 name(std::move(tag.name)),
1373 attributes(std::move(tag.attributes))
1374 {}
1375
1376 public:
1378 std::vector<stdex::parser::html_attribute> attributes;
1379 };
1380
1384 class comment : public sequence
1385 {
1386 public:
1387 template <class T>
1388 inline comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1389 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1390 content(std::move(tag.name))
1391 {}
1392
1393 public:
1395 };
1396
1400 class instruction : public sequence
1401 {
1402 public:
1403 template <class T>
1404 inline instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
1405 sequence(tag.type, tag.interval.start, tag.interval.end, parent),
1406 content(std::move(tag.name))
1407 {}
1408
1409 public:
1411 };
1412
1416 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1417 struct entity
1418 {
1420 std::basic_string<_Elem, _Traits, _Alloc> value;
1421 };
1422
1426 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1427 class parser;
1428
1432 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1434 {
1435 public:
1436 document() :
1437 m_num_parsed(0),
1438 m_charset(stdex::charset_id::system),
1439
1440 // Declaration parsing data
1443 m_is_cdata(false),
1444 m_is_rcdata(false),
1445
1446 // Element parsing data
1448 {}
1449
1453 void clear()
1454 {
1455 m_source.clear();
1456 m_num_parsed = 0;
1457 m_charset = stdex::charset_id::system;
1458
1459 // Declaration parsing data
1461 m_is_cdata = m_is_rcdata = false;
1462 m_entities.clear();
1463
1464 // Element parsing data
1465 m_sequences.clear();
1466
1467 m_element_stack.clear();
1468 m_is_special_element = false;
1469 }
1470
1474 void append(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars = SIZE_MAX)
1475 {
1476 _Assume_(source || !num_chars);
1477 m_source.append(source, stdex::strnlen(source, num_chars));
1478 source = m_source.data();
1479 num_chars = m_source.size();
1480
1481 for (size_t i = m_num_parsed; i < num_chars;) {
1482 if (m_is_cdata || m_is_rcdata) {
1483 if (m_condition_end.match(source, i, num_chars)) {
1484 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(
1485 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1486 m_num_parsed, i,
1487 active_element()))));
1488 m_is_cdata = m_is_rcdata = false;
1489 i = m_num_parsed = m_condition_end.interval.end;
1490 continue;
1491 }
1492 goto next_char;
1493 }
1494
1496 if (m_condition_end.match(source, i, num_chars)) {
1498 i = m_num_parsed = m_condition_end.interval.end;
1499 continue;
1500 }
1501 goto next_char;
1502 }
1503
1504 if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
1505 if (m_num_parsed < i)
1506 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1507
1509 i = m_num_parsed = m_condition_end.interval.end;
1510 continue;
1511 }
1512
1513 if (m_condition_start.match(source, i, num_chars)) {
1514 auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
1515 if (!stdex::strcmp(condition_src.c_str(), "CDATA"))
1516 m_is_cdata = true;
1517 else if (!stdex::strcmp(condition_src.c_str(), "RCDATA"))
1518 m_is_rcdata = true;
1521 else if (!stdex::strcmp(condition_src.c_str(), "IGNORE"))
1523 else
1525
1526 i = m_num_parsed = m_condition_start.interval.end;
1527 continue;
1528 }
1529
1531 auto parent = active_element();
1532 _Assume_(parent);
1533 if (m_tag.match(source, i, num_chars) &&
1534 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1535 element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
1536 {
1537 if (m_num_parsed < i)
1538 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
1539 i = m_num_parsed = m_tag.interval.end;
1540 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, parent->parent, parent));
1541 parent->end = e.get();
1542 m_sequences.push_back(std::move(e));
1543 m_element_stack.pop_back();
1544 m_is_special_element = false;
1545 continue;
1546 }
1547 goto next_char;
1548 }
1549
1550 if (m_tag.match(source, i, num_chars)) {
1551 if (m_num_parsed < i)
1552 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1553 i = m_num_parsed = m_tag.interval.end;
1554
1555 switch (m_tag.type) {
1556 case stdex::parser::html_sequence_t::element:
1557 case stdex::parser::html_sequence_t::element_start: {
1558 std::unique_ptr<element> e(
1559 m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
1560 m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
1561 nullptr);
1562
1563 // Does this tag end any of the started elements?
1564 for (size_t j = m_element_stack.size(); j--; ) {
1565 auto starting_tag = m_element_stack[j];
1566 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1567 if (element_traits::may_contain(starting_tag->code, e->code)) {
1568 e->parent = starting_tag;
1569 break;
1570 }
1571 e->parent = starting_tag->parent;
1572 starting_tag->end = e.get();
1573 m_element_stack.resize(j);
1574 }
1575
1576 if (e->type == stdex::parser::html_sequence_t::element_start) {
1577 auto e_start = static_cast<element_start*>(e.get());
1578 if (element_traits::span(e->code) == element_span_t::immediate)
1579 e_start->end = e.get();
1580 else {
1581 m_element_stack.push_back(e_start);
1582 switch (e->code) {
1583 case element_t::code:
1584 case element_t::comment:
1585 case element_t::script:
1586 case element_t::style:
1587 m_is_special_element = true;
1588 break;
1589 }
1590 }
1591 }
1592
1593 if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
1594 bool is_content_type = false;
1595 stdex::parser::html_attribute* content_attr = nullptr;
1596 for (auto& attr : e->attributes) {
1597 if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX, stdex::std_locale_C) &&
1598 !stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX, stdex::std_locale_C))
1599 is_content_type = true;
1600 else if (!stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX, stdex::std_locale_C))
1601 content_attr = &attr;
1602 }
1603 if (is_content_type && content_attr) {
1604 // <meta http-equiv="Content-Type" content="..."> found.
1606 if (content.match(source, content_attr->value.start, content_attr->value.end) &&
1607 content.charset)
1608 {
1609 std::string str;
1610 str.reserve(content.charset.size());
1611 for (size_t j = content.charset.start; j < content.charset.end; ++j)
1612 str.push_back(static_cast<char>(source[j]));
1613 m_charset = stdex::charset_from_name(str.c_str());
1614 }
1615 }
1616 }
1617
1618 m_sequences.push_back(std::move(e));
1619 break;
1620 }
1621 case stdex::parser::html_sequence_t::element_end: {
1622 std::unique_ptr<element_end> e(new element_end(std::move(m_tag), source, active_element()));
1623
1624 for (size_t j = m_element_stack.size(); j--; ) {
1625 auto starting_tag = m_element_stack[j];
1626 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1627 if (starting_tag->code == e->code ||
1628 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size(), stdex::std_locale_C))
1629 {
1630 e->start = starting_tag;
1631 e->parent = starting_tag->parent;
1632 starting_tag->end = e.get();
1633 m_element_stack.resize(j);
1634 break;
1635 }
1636 }
1637
1638 m_sequences.push_back(std::move(e));
1639 break;
1640 }
1641 case stdex::parser::html_sequence_t::declaration:
1642 if (m_tag.attributes.size() > 3 &&
1643 !stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX, stdex::std_locale_C))
1644 {
1645 if (!stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) &&
1646 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
1647 stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
1648 {
1649 std::unique_ptr<entity<_Elem, _Traits, _Alloc>> e(new entity<_Elem, _Traits, _Alloc>());
1650 e->name = m_tag.attributes[2].name;
1651 e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1652 m_entities.push_back(std::move(e));
1653 }
1654
1655 // TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
1656 }
1657 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new declaration(std::move(m_tag), active_element()))));
1658 break;
1659 case stdex::parser::html_sequence_t::comment:
1660 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new comment(std::move(m_tag), active_element()))));
1661 break;
1662 case stdex::parser::html_sequence_t::instruction:
1663 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new instruction(std::move(m_tag), active_element()))));
1664 break;
1665 default:
1666 throw std::invalid_argument("unknown tag type");
1667 }
1668
1669 continue;
1670 }
1671
1672 next_char:
1673 if (m_any_char.match(source, i, num_chars)) {
1674 // Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
1675 i = m_any_char.interval.end;
1676 }
1677 else
1678 break;
1679 }
1680 }
1681
1686 {
1687 size_t i = m_source.size();
1688 if (m_num_parsed < i)
1689 m_sequences.push_back(std::move(std::unique_ptr<sequence>(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
1690 m_num_parsed = i;
1691 m_element_stack.clear();
1692 }
1693
1697 inline void assign(_In_reads_or_z_opt_(num_chars) const _Elem* source, _In_ size_t num_chars = SIZE_MAX)
1698 {
1699 clear();
1700 append(source, num_chars);
1701 finalize();
1702 }
1703
1707 inline const std::basic_string<_Elem, _Traits, _Alloc>& source() const { return m_source; }
1708
1709 friend class parser<_Elem, _Traits, _Alloc>;
1710
1711 protected:
1716 {
1717 return m_element_stack.empty() ? nullptr : m_element_stack.back();
1718 }
1719
1723 std::basic_string<_Elem, _Traits, _Alloc> replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem* input, _In_ size_t num_chars) const
1724 {
1725 _Assume_(input || !num_chars);
1726 const size_t num_entities = m_entities.size();
1727 const _Elem* source = m_source.data();
1728 std::basic_string<_Elem, _Traits, _Alloc> output;
1729 for (size_t i = 0; i < num_chars && input[i];) {
1730 if (input[i] == '%') {
1731 for (size_t j = 0; j < num_entities; j++) {
1732 auto& e = m_entities[j];
1733 size_t entity_size = e->name.size();
1734 if (i + entity_size + 1 < num_chars &&
1735 !stdex::strncmp(input + i + 1, source + e->name.start, entity_size) &&
1736 input[i + entity_size + 1] == ';')
1737 {
1738 output += e->value;
1739 i += entity_size + 2;
1740 goto next_char;
1741 }
1742 }
1743 throw std::runtime_error("undefined entity");
1744 }
1745 output += input[i++];
1746 next_char:;
1747 }
1748 return output;
1749 }
1750
1751 protected:
1752 std::basic_string<_Elem, _Traits, _Alloc> m_source;
1754 stdex::charset_id m_charset;
1755
1756 // Declaration parsing data
1764 std::vector<std::unique_ptr<entity<_Elem, _Traits, _Alloc>>> m_entities;
1765
1766 // Element parsing data
1768 sequence_store m_sequences;
1769 std::vector<element_start*> m_element_stack;
1771 };
1772
1776 enum class token_t {
1777 root = 0,
1778 complete,
1779 starting,
1780 ending,
1781 url,
1782 };
1783
1787 constexpr size_t token_tag_max =
1788 sizeof(void*) * 2 // Memory address in hexadecimal
1789 + 2 // Leading and trailing parenthesis
1790 + 1; // Zero terminator
1791
1796 constexpr char token_tag_start = '\x12';
1797
1802 constexpr char token_tag_end = '\x13';
1803
1807 class token
1808 {
1809 protected:
1810 inline token(_In_ token_t _type = token_t::root, _In_opt_ sequence* _sequence = nullptr, _In_ uintptr_t _data = 0) :
1811 type(_type),
1812 sequence(_sequence),
1813 data(_data)
1814 {}
1815
1816 template<class _Elem, class _Traits, class _Alloc>
1817 friend class parser;
1818
1819 public:
1820 virtual ~token() {} // make polymorphic
1821
1829 template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
1830 inline size_t append_tag(_Inout_ std::basic_string<char, _Traits, _Alloc>& str) const
1831 {
1832 size_t n = str.size();
1833 // Use %X instead of %p to ommit leading zeros and save space.
1834 stdex::appendf(str, "%c%zX%c", stdex::locale_C.get(), token_tag_start, reinterpret_cast<uintptr_t>(this), token_tag_end);
1835 return str.size() - n;
1836 }
1837
1845 template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
1846 inline size_t append_tag(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& str) const
1847 {
1848 // Use %X instead of %p to ommit leading zeros and save space.
1849 return stdex::appendf(str, L"%c%zX%c", stdex::locale_C.get(), static_cast<wchar_t>(token_tag_start), reinterpret_cast<uintptr_t>(this), static_cast<wchar_t>(token_tag_end));
1850 }
1851
1852 template<class T>
1853 static inline token* parse_tag(const T* str, size_t& offset)
1854 {
1855 if (str[offset] != static_cast<T>(token_tag_start))
1856 return nullptr;
1857
1858 // Locate tag end.
1859 size_t end;
1860 for (end = offset + 1; ; end++) {
1861 if (!str[end])
1862 return nullptr;
1863 if (str[end] == token_tag_end)
1864 break;
1865 }
1866
1867 // Parse hexadecimal token memory address.
1868 token* t = reinterpret_cast<token*>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1, nullptr, 16));
1869 if (!t)
1870 throw std::invalid_argument("null token");
1871 offset = end + 1;
1872 return t;
1873 }
1874
1875 public:
1876 token_t type;
1878 uintptr_t data;
1879 };
1880
1881 using token_vector = std::vector<std::unique_ptr<token>>;
1882 using token_list = std::list<token*>;
1883
1887 enum text_type_flag_t : uint32_t {
1888 has_tokens = 1 << 0,
1889 has_text = 1 << 1,
1890 is_title = 1 << 2,
1891 is_bullet = 1 << 3,
1892 };
1893
1897 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1898 class text_token : public token
1899 {
1900 protected:
1901 inline text_token(
1902 _In_ token_t type = token_t::complete,
1903 _In_reads_or_z_opt_(num_chars) const _Elem* _text = nullptr, _In_ size_t num_chars = 0,
1904 _In_ uint32_t _text_type = 0,
1905 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
1907 text(_text, num_chars),
1908 text_type(_text_type)
1909 {}
1910
1911 friend class parser<_Elem, _Traits, _Alloc>;
1912
1913 public:
1914 std::basic_string<_Elem, _Traits, _Alloc> text;
1915 uint32_t text_type;
1916 stdex::mapping_vector<size_t> mapping;
1917 };
1918
1922 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1923 class starting_token : public text_token<_Elem, _Traits, _Alloc>
1924 {
1925 protected:
1926 inline starting_token(
1927 _In_reads_or_z_opt_(num_chars_text) const _Elem* _text = nullptr, _In_ size_t num_chars_text = 0,
1928 _In_reads_or_z_opt_(num_chars_name) const _Elem* _name = nullptr, _In_ size_t num_chars_name = 0,
1929 _In_ uint32_t text_type = 0,
1930 _In_opt_ stdex::html::sequence* sequence = nullptr,
1931 _In_opt_ stdex::html::sequence* _end_sequence = nullptr,
1932 _In_ uintptr_t data = 0) :
1933 text_token(token_t::starting, _text, num_chars_text, text_type, sequence, data),
1934 name(_name, num_chars_name),
1935 end_sequence(_end_sequence)
1936 {}
1937
1938 friend class parser<_Elem, _Traits, _Alloc>;
1939
1940 public:
1941 std::basic_string<_Elem, _Traits, _Alloc> name;
1943 };
1944
1948 enum class token_url_t {
1949 plain = 0, // URL is not using any particular encoding scheme (as-is)
1950 sgml, // URL is encoded using SGML entities
1951 css, // URL is encoded using CSS escaping scheme
1952 };
1953
1957 template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
1958 class url_token : public token
1959 {
1960 protected:
1961 inline url_token(
1962 _In_reads_or_z_opt_(num_chars) const _Elem* _url = nullptr, _In_ size_t num_chars = 0,
1963 token_url_t _encoding = token_url_t::plain,
1964 _In_opt_ stdex::html::sequence* sequence = nullptr, _In_ uintptr_t data = 0) :
1965 token(token_t::url, sequence, data),
1966 url(_url, num_chars),
1967 encoding(_encoding)
1968 {}
1969
1970 friend class parser<_Elem, _Traits, _Alloc>;
1971
1972 public:
1973 std::basic_string<_Elem, _Traits, _Alloc> url;
1974 token_url_t encoding;
1975 };
1976
1982 std::list<stdex::html::token*> active_tokens;
1983 size_t word_index;
1985 };
1986
1987 using inserted_token_list = std::list<inserted_token>;
1988
1989 template<class _Elem, class _Traits, class _Alloc>
1991 {
1992 public:
1993 inline parser(
1995 _In_reads_or_z_opt_(num_chars) const stdex::schar_t* url = nullptr, _In_ size_t num_chars = 0,
1996 _In_ bool parse_frames = false, _In_ stdex::progress<size_t>* progress = nullptr) :
1998 m_url(url, stdex::strnlen(url, num_chars)),
1999 m_parse_frames(parse_frames),
2001 m_source(nullptr)
2002 {}
2003
2008 {
2009 _Assume_(m_tokens.empty());
2010
2011 if (m_progress) {
2012 m_progress->set_range(0, m_document.source().size());
2013 m_progress->set(0);
2014 }
2015
2016 m_source = m_document.source().data();
2018 return parse(m_document.m_sequences.end());
2019 }
2020
2027 static void link(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _In_ const text_token<_Elem, _Traits, _Alloc>* t)
2028 {
2029 _Assume_(t);
2030 _Assume_(
2031 t->type == token_t::complete ||
2032 t->type == token_t::starting ||
2033 t->type == token_t::ending ||
2034 t->type == token_t::root);
2035
2036 if (t->text_type & has_tokens) {
2037 const _Elem* root = t->text.data();
2038 for (size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2039 _Assume_(root[i] != token_tag_end);
2040 const token* t2 = token::parse_tag(root, i);
2041 if (t2) {
2042 switch (t2->type) {
2043 case token_t::complete:
2044 case token_t::starting:
2045 case token_t::ending:
2046 case token_t::root:
2047 link(source, dynamic_cast<const text_token<_Elem, _Traits, _Alloc>*>(t2));
2048 break;
2049 case token_t::url: {
2050 auto t2_url = dynamic_cast<const url_token<_Elem, _Traits, _Alloc>*>(t2);
2051 switch (t2_url->encoding) {
2052 case token_url_t::plain:
2053 source += t2_url->url;
2054 break;
2055 case token_url_t::sgml:
2056 escape(source, t2_url->url.data(), t2_url->url.size());
2057 break;
2058 case token_url_t::css:
2059 css_escape(source, t2_url->url.data(), t2_url->url.size());
2060 break;
2061 default:
2062 throw std::invalid_argument("unsupported URL encoding");
2063 }
2064 break;
2065 }
2066 default:
2067 throw std::invalid_argument("unsupported token type");
2068 }
2069 }
2070 else if (t->text_type & has_text) {
2071 escape_min(source, root[i]);
2072 i++;
2073 }
2074 else
2075 source += root[i++];
2076 }
2077 }
2078 else if (t->text_type & has_text) {
2079 // Token contains no references to other tokens. But, it does contain text that requires escaping.
2080 escape_min(source, t->text.data(), t->text.size());
2081 }
2082 else
2083 source += t->text;
2084 }
2085
2094 static void start_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens, _In_ token_list::const_iterator from)
2095 {
2096 for (; from != new_tokens.cend(); ++from) {
2097 auto t = *from;
2098 t->append_tag(source);
2099 active_tokens.push_back(t);
2100 }
2101 }
2102
2112 token_list::const_iterator end_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_ const token_list& new_tokens)
2113 {
2114 // Skip matching tokens in active_tokens and new_tokens.
2115 token_list::const_iterator i1, i2;
2116 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2117 if (i2 == new_tokens.cend() || *i1 != *i2) {
2118 // Got two tokens, where lists don't match anymore, or new_tokens list is out.
2119 // End tokens not relevant anymore in reverse order of starting.
2120 for (auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2121 auto t1 = dynamic_cast<starting_token<_Elem, _Traits, _Alloc>*>(*(--i));
2122 _Assume_(t1 && t1->type == token_t::starting);
2123
2124 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t2(new text_token<_Elem, _Traits, _Alloc>(token_t::ending));
2125 t2->text.reserve(t1->name.size() + 3);
2126 t2->text += '<';
2127 t2->text += '/';
2128 t2->text += t1->name;
2129 t2->text += '>';
2130 append_token(std::move(t2), source);
2131
2132 // Pop the active token.
2133 if (i1 == i) {
2134 active_tokens.erase(i);
2135 break;
2136 }
2137 active_tokens.erase(i);
2138 i = active_tokens.cend();
2139 }
2140 break;
2141 }
2142 }
2143 return i2;
2144 }
2145
2155 void append_inserted_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ inserted_token_list& inserted_tokens,
2156 _In_ size_t word_index, _In_ bool after_word,
2157 _Inout_ token_list& active_tokens)
2158 {
2159 for (auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2160 auto& t = *i;
2161 _Assume_(t.token);
2162 if (t.word_index == word_index && t.after_word == after_word) {
2163 if (t.token->type != token_t::ending)
2164 start_tokens(source, active_tokens, t.active_tokens, end_tokens(source, active_tokens, t.active_tokens));
2165 t.token->append_tag(source);
2166 inserted_tokens.erase(i++);
2167 }
2168 else
2169 ++i;
2170 }
2171 }
2172
2179 static void merge(_Inout_ token_list& a, _In_ const token_list& b)
2180 {
2181 for (auto i2 = b.begin(); i2 != b.end(); ++i2) {
2182 auto t2 = *i2;
2183 for (auto i1 = a.begin(); i1 != a.end(); ++i1) {
2184 if (i1 == a.end()) {
2185 a.push_back(t2);
2186 break;
2187 }
2188 auto t1 = *i1;
2189 if (t1 == t2)
2190 break;
2191 }
2192 }
2193 }
2194
2198 void make_absolute_url(std::basic_string<_Elem, _Traits, _Alloc>& rel)
2199 {
2200 _Unreferenced_(rel);
2201
2202 if (m_url.empty())
2203 return;
2204
2205 // TODO: Implement!
2206 }
2207
2211 inline const token_vector& tokens() const { return m_tokens; }
2212
2213 protected:
2221 template <class T>
2222 inline T* append_token(_Inout_ std::unique_ptr<T>&& token)
2223 {
2224 if (!token)
2225 return nullptr;
2226 auto t = token.get();
2227 m_tokens.push_back(std::move(token));
2228 return t;
2229 }
2230
2239 template <class T>
2240 inline size_t append_token(_Inout_ std::unique_ptr<T>&& token, _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source)
2241 {
2242 if (!token)
2243 return 0;
2244 size_t n = token->append_tag(source);
2245 m_tokens.push_back(std::move(token));
2246 return n;
2247 }
2248
/// Recursively parses HTML document.
///
/// Consumes sequences from m_offset up to (not including) `end` and collects them into one text token:
/// text sequences are converted from SGML and appended to the token text, while every other sequence
/// is stored as a separate token whose tag is appended to the text in its place.
///
/// \param[in] end        Sequence to stop parsing at
/// \param[in] text_type  Initial text_type flags of the resulting token
///
/// \returns Pointer to the resulting text token; the token itself is stored in m_tokens
///
/// \throws stdex::user_cancelled  when the progress indicator reports cancellation
/// \throws std::invalid_argument  when a <frameset> element is found and m_parse_frames is false
2257 text_token<_Elem, _Traits, _Alloc>* parse(_In_ const sequence_store::const_iterator& end, _In_ uint32_t text_type = 0)
2258 {
// NOTE(review): `rel` used below is not declared anywhere in this listing (source line 2259 is absent) -
// presumably a stdex::mapping<size_t> local; confirm against the full source.
2260 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(new text_token<_Elem, _Traits, _Alloc>(
2261 token_t::complete,
2262 nullptr, 0,
2263 text_type,
2264 m_offset->get()));
2265
2266 while (m_offset != end) {
2267 auto& s = *m_offset;
2268
// Give the user a chance to abort, then report how far into the source we are.
2269 if (m_progress) {
2270 if (m_progress->cancel())
2271 throw stdex::user_cancelled();
2272 m_progress->set(s->interval.start);
2273 }
2274
2275 // No token_tag_start and token_tag_end chars, please.
2276 _Assume_(
2277 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_start)) == stdex::npos &&
2278 stdex::strnchr(m_source + s->interval.start, s->interval.size(), static_cast<_Elem>(token_tag_end)) == stdex::npos);
2279
2280 if (s->type == stdex::parser::html_sequence_t::text) {
// Plain text: record the source-to-text mapping and append the SGML-decoded text.
2281 rel.from = s->interval.start;
2282 token->mapping.push_back(rel);
2283 stdex::sgml2strcat(token->text, m_source + s->interval.start, s->interval.size(), 0, rel, &token->mapping);
2284 rel.to = token->text.size();
// Flag the token as having text as soon as one sequence is not blank per stdex::isblank.
2285 if (!(token->text_type & has_text) &&
2286 !stdex::isblank(m_source + s->interval.start, s->interval.size(), stdex::std_locale_C))
2287 token->text_type |= has_text;
2288 ++m_offset;
2289 }
2290 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2291 const element* s_el = static_cast<const element*>(s.get());
2292 _Assume_(s_el);
// s_el_start stays nullptr for sequences of type element; it is set for element_start only.
2293 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ? static_cast<const element_start*>(s.get()) : nullptr;
2294 if (s_el->code == element_t::frameset && !m_parse_frames)
2295 throw std::invalid_argument("<frameset> detected");
2296
2297 {
// offset tracks how much of the tag source has been copied into t->text so far.
2298 size_t offset = s->interval.start;
// Elements and immediate-span start tags become complete text tokens; any other start tag becomes
// a starting token constructed with the element name and the element's ending sequence.
2299 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t(s->type == stdex::parser::html_sequence_t::element || element_traits::span(s_el_start->code) == element_span_t::immediate ?
2300 new text_token<_Elem, _Traits, _Alloc>(token_t::complete, nullptr, 0, 0, s.get()) :
2301 new starting_token<_Elem, _Traits, _Alloc>(nullptr, 0, m_source + s_el_start->name.start, s_el_start->name.size(), 0, s.get(), s_el_start->end));
2302
2303 // Copy the tag contents, but mind any attributes containing localizable text.
2304 for (auto& a : s_el->attributes) {
// Attributes with an empty or blank value are left inside the verbatim tag copy.
2305 if (a.value.empty() ||
2306 stdex::isblank(m_source + a.value.start, a.value.size(), stdex::std_locale_C))
2307 continue;
2308
2309 if (element_traits::is_uri(s_el->code, m_source + a.name.start, a.name.size())) {
// URI attribute: copy the tag source up to the value, then put a URL token tag in place of the value.
2310 t->text.append(m_source + offset, a.value.start - offset);
2311 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(new url_token<_Elem, _Traits, _Alloc>(
2312 nullptr, 0,
2313 token_url_t::sgml,
2314 s.get()));
2315 stdex::sgml2strcat(t_url->url, m_source + a.value.start, a.value.size());
2316 append_token(std::move(t_url), t->text);
2317 t->text_type |= has_tokens;
2318 offset = a.value.end;
2319 }
2320 else if (element_traits::is_localizable(s_el->code, m_source + a.name.start, a.name.size())) {
// Localizable attribute: same substitution, using a text token flagged has_text | is_title
// that carries its own source-to-text mapping.
2321 t->text.append(m_source + offset, a.value.start - offset);
2322 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t_value(new text_token<_Elem, _Traits, _Alloc>(
2323 token_t::complete,
2324 nullptr, 0,
2325 has_text | is_title,
2326 s.get()));
2327 stdex::mapping<size_t> rel_value(a.value.start, 0);
2328 t_value->mapping.push_back(rel_value);
2329 stdex::sgml2strcpy(t_value->text, m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2330 append_token(std::move(t_value), t->text);
2331 t->text_type |= has_tokens;
2332 offset = a.value.end;
2333 }
2334 }
2335
// Copy whatever remains of the tag source, then store the tag token and append its tag to our text.
2336 t->text.append(m_source + offset, s->interval.end - offset);
2337 rel.from = s->interval.start;
2338 token->mapping.push_back(rel);
2339 rel.to += append_token(std::move(t), token->text);
2340 token->text_type |= has_tokens;
2341 }
2342 ++m_offset;
2343
2344 if (s_el_start) {
2345 if (s_el_start->code == element_t::address ||
2346 s_el_start->code == element_t::code ||
2347 s_el_start->code == element_t::comment ||
2348 s_el_start->code == element_t::cite ||
2349 s_el_start->code == element_t::kbd ||
2350 s_el_start->code == element_t::samp ||
2351 s_el_start->code == element_t::script ||
2352 s_el_start->code == element_t::style)
2353 {
2354 // Non-localizable
2355 auto s_end = s_el_start->end;
2356 _Assume_(s_end);
2357
// Only when there is any content between the start tag and the ending sequence.
2358 if (s->interval.end < s_end->interval.start) {
2359 if (s_el_start->code != element_t::style) {
// The whole content is kept verbatim as a single complete token.
2360 rel.from = s->interval.start;
2361 token->mapping.push_back(rel);
// NOTE(review): the `new text_token<...>(` line (source line 2363) is absent from this listing; the same
// applies to the two similar append_token() calls further below (source lines 2401 and 2415).
2362 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2364 token_t::complete,
2365 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2366 0,
2367 m_offset->get()))),
2368 token->text);
2369 }
2370 else {
2371 // Partially parse CSS. It may contain URLs we need to make absolute.
2372 auto t = parse_css(s->interval.end, s_end->interval.start);
2373 _Assume_(t);
2374 rel.from = s->interval.start;
2375 token->mapping.push_back(rel);
2376 rel.to += t->append_tag(token->text);
2377 }
2378 token->text_type |= has_tokens;
2379 }
// Skip the sequences inside the element: advance to its ending sequence, or to `end` if that comes first.
2380 while (m_offset != end && m_offset->get() != s_end)
2381 ++m_offset;
2382 }
2383 else if (element_traits::is_group(s_el_start->code)) {
// Group element: locate its ending sequence (bounded by `end`) and parse the content recursively
// into a token of its own, flagged as title and/or bullet depending on the element.
2384 auto limit = m_offset;
2385 while (limit != end && limit->get() != s_el_start->end)
2386 ++limit;
2387 auto t = parse(limit,
2388 (element_traits::is_heading(s_el_start->code) || s_el_start->code == element_t::dt || s_el_start->code == element_t::title ? is_title : 0) |
2389 (element_traits::is_list(s_el_start->code) ? is_bullet : 0));
2390 rel.from = s->interval.start;
2391 token->mapping.push_back(rel);
2392 rel.to += t->append_tag(token->text);
2393 token->text_type |= has_tokens;
2394 }
2395 }
2396 }
2397 else if (s->type == stdex::parser::html_sequence_t::element_end) {
// Ending tag: stored as an ending token holding the tag source.
2398 rel.from = s->interval.start;
2399 token->mapping.push_back(rel);
2400 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2402 token_t::ending,
2403 m_source + s->interval.start, s->interval.size(),
2404 0,
2405 s.get()))),
2406 token->text);
2407 token->text_type |= has_tokens;
2408 ++m_offset;
2409 }
2410 else {
2411 // Declaration, instruction, (P)CDATA section, comment...
2412 rel.from = s->interval.start;
2413 token->mapping.push_back(rel);
2414 rel.to += append_token(std::move(std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>(
2416 token_t::complete,
2417 m_source + s->interval.start, s->interval.size(),
2418 0,
2419 s.get()))),
2420 token->text);
2421 token->text_type |= has_tokens;
2422 ++m_offset;
2423 }
2424 }
2425
// Hand the collected token over to m_tokens and return a raw pointer to it.
2426 return append_token(std::move(token));
2427 }
2428
2433 {
// Parses CSS.
//
// NOTE(review): the signature line is absent from this listing; `start` and `end` used below are
// presumably the size_t parameters of parse_css(size_t start, size_t end) - confirm against the full source.
//
// Walks m_source from `start`, copying the CSS into a new text token. Comments, CDO and CDC matches are
// copied verbatim; for m_css_import/m_css_uri matches the matched content is replaced by a URL token tag.
// Returns the token after storing it in m_tokens via append_token().
2434 stdex::interval<size_t> section, content;
// NOTE(review): the `new text_token<...>(` line (source line 2436) is absent from this listing.
2435 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> token(
2437 token_t::complete,
2438 nullptr, 0,
2439 0,
2440 m_offset->get()));
2441
2442 for (;;) {
// The matchers are tried in a fixed order; the first one that matches at `start` wins.
2443 if (m_css_comment.match(m_source, start, end)) {
2444 token->text.append(m_source + start, m_css_comment.interval.end - start);
2445 start = m_css_comment.interval.end;
2446 }
2447 else if (m_css_cdo.match(m_source, start, end)) {
2448 token->text.append(m_source + start, m_css_cdo.interval.end - start);
2449 start = m_css_cdo.interval.end;
2450 }
2451 else if (m_css_cdc.match(m_source, start, end)) {
2452 token->text.append(m_source + start, m_css_cdc.interval.end - start);
2453 start = m_css_cdc.interval.end;
2454 }
2455 else if (
// The comma expressions capture the intervals of whichever matcher succeeded and always yield true.
2456 m_css_import.match(m_source, start, end) && (section = m_css_import.interval, content = m_css_import.content, true) ||
2457 m_css_uri.match(m_source, start, end) && (section = m_css_uri.interval, content = m_css_uri.content, true))
2458 {
// NOTE(review): the `new url_token<...>(` line (source line 2460) is absent from this listing.
2459 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(
2461 nullptr, 0,
2462 token_url_t::css,
2463 m_offset->get()));
// Keep the CSS-unescaped content in the token; in the text, the content is replaced by the token tag
// while the source before and after it (within the matched section) is copied verbatim.
2464 css_unescape(t_url->url, m_source + content.start, content.size());
2465 token->text.append(m_source + start, content.start - start);
2466 append_token(std::move(t_url), token->text);
2467 token->text.append(m_source + content.end, section.end - content.end);
2468 token->text_type |= has_tokens;
2469 start = section.end;
2470 }
2471 else if (m_any_char.match(m_source, start, end)) {
// Anything else is copied as-is.
2472 token->text.append(m_source + start, m_any_char.interval.end - start);
2473 start = m_any_char.interval.end;
2474 }
2475 else
// Nothing matches any more: stop.
2476 break;
2477 }
2478
2479 return append_token(std::move(token));
2480 }
2481
2482 protected:
2484 const stdex::sys_string m_url; ///< Absolute document URL.
2485 const bool m_parse_frames; ///< Parse frames.
2487 const _Elem* m_source; ///< HTML source code.
2488 token_vector m_tokens; ///< HTML token storage.
2489 sequence_store::const_iterator m_offset; ///< Index of active section.
2490
2491 // For detecting URLs in CSS
2499 };
2500 }
2501}
HTML comment.
Definition html.hpp:1385
stdex::interval< size_t > content
Comment content position in source.
Definition html.hpp:1394
HTML declaration.
Definition html.hpp:1367
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1377
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1378
HTML document.
Definition html.hpp:1434
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1769
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1685
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1754
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1760
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1770
void append(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML source code by chunks.
Definition html.hpp:1474
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1768
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1758
void assign(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML document source code.
Definition html.hpp:1697
std::vector< std::unique_ptr< entity< _Elem, _Traits, _Alloc > > > m_entities
Array of entities.
Definition html.hpp:1764
std::basic_string< _Elem, _Traits, _Alloc > m_source
Document HTML source code.
Definition html.hpp:1752
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1715
std::basic_string< _Elem, _Traits, _Alloc > replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1723
void clear()
Empties document.
Definition html.hpp:1453
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1757
const std::basic_string< _Elem, _Traits, _Alloc > & source() const
Returns document HTML source code.
Definition html.hpp:1707
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1753
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1759
Ending tag of an HTML element </...>
Definition html.hpp:1347
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1359
element_start * start
Corresponding starting tag.
Definition html.hpp:1360
element_t code
Element code.
Definition html.hpp:1358
Starting tag of an HTML element <...>
Definition html.hpp:1331
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1340
HTML element <.../>
Definition html.hpp:1150
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1321
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1322
element_t code
Element code.
Definition html.hpp:1320
HTML instruction.
Definition html.hpp:1401
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1410
HTML parser.
Definition html.hpp:1991
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2486
text_token< _Elem, _Traits, _Alloc > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2432
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a creating a union.
Definition html.hpp:2179
token_list::const_iterator end_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2112
static void link(std::basic_string< _Elem, _Traits, _Alloc > &source, const text_token< _Elem, _Traits, _Alloc > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2027
text_token< _Elem, _Traits, _Alloc > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2257
const _Elem * m_source
HTML source code.
Definition html.hpp:2487
token_vector m_tokens
HTML token storage.
Definition html.hpp:2488
text_token< _Elem, _Traits, _Alloc > * parse()
Parses HTML document.
Definition html.hpp:2007
const document< _Elem, _Traits, _Alloc > & m_document
Document being analyzed.
Definition html.hpp:2483
void make_absolute_url(std::basic_string< _Elem, _Traits, _Alloc > &rel)
Converts URL to absolute.
Definition html.hpp:2198
size_t append_token(std::unique_ptr< T > &&token, std::basic_string< _Elem, _Traits, _Alloc > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2240
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2211
const stdex::sys_string m_url
Absolute document URL.
Definition html.hpp:2484
const bool m_parse_frames
Parse frames.
Definition html.hpp:2485
static void start_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2094
T * append_token(std::unique_ptr< T > &&token)
Adds token to the collection.
Definition html.hpp:2222
void append_inserted_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2155
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2489
Base class for HTML sequences.
Definition html.hpp:1131
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1134
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1133
sequence * parent
Parent sequence.
Definition html.hpp:1135
Token representing start HTML tag.
Definition html.hpp:1924
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:1942
std::basic_string< _Elem, _Traits, _Alloc > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:1941
Token representing part of HTML text.
Definition html.hpp:1899
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:1916
std::basic_string< _Elem, _Traits, _Alloc > text
Token text.
Definition html.hpp:1914
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:1915
HTML token base class.
Definition html.hpp:1808
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:1877
size_t append_tag(std::basic_string< char, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1830
uintptr_t data
Any user-supplied data.
Definition html.hpp:1878
token_t type
Token type.
Definition html.hpp:1876
size_t append_tag(std::basic_string< wchar_t, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1846
HTTP token representing an URL.
Definition html.hpp:1959
token_url_t encoding
URL encoding.
Definition html.hpp:1974
std::basic_string< _Elem, _Traits, _Alloc > url
URL.
Definition html.hpp:1973
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7835
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7750
std::vector< html_attribute > attributes
tag attributes
Definition parser.hpp:8357
html_sequence_t type
tag type
Definition parser.hpp:8355
stdex::interval< size_t > name
tag name position in source
Definition parser.hpp:8356
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:172
Test for given string.
Definition parser.hpp:818
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:515
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:920
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:834
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:752
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:844
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:640
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:803
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:861
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:771
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1050
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:788
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1105
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:686
static bool is_pre_exclusion(element_t code)
May element be a part of.
Definition html.hpp:879
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:737
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:904
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:719
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:662
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:950
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:521
HTML entity.
Definition html.hpp:1418
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1419
std::basic_string< _Elem, _Traits, _Alloc > value
Entity value.
Definition html.hpp:1420
Inserted HTML token.
Definition html.hpp:1980
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:1984
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:1982
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:1983
token * token
Points to the token.
Definition html.hpp:1981
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
mapping()
Constructs a zero to zero mapping.
Definition mapping.hpp:24
Tag attribute.
Definition parser.hpp:8129
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8131