9#include "exception.hpp"
10#include "interval.hpp"
13#include "progress.hpp"
/// Appends an HTML-escaped copy of a narrow string to dst.
///
/// Characters that carry meaning in HTML markup are replaced by character
/// references; every other character is appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case ';': dst += "&semi;"; break;
		case '\"': dst += "&quot;"; break;
		// Numeric reference: unlike &apos; it is also valid in HTML 4.
		case '\'': dst += "&#x27;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// No-break space. Reachable only where plain char is unsigned: a signed
		// char never promotes to 0xa0.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends an HTML-escaped copy of a wide string to dst.
///
/// Characters that carry meaning in HTML markup are replaced by character
/// references; every other character is appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L';': dst += L"&semi;"; break;
		case L'\"': dst += L"&quot;"; break;
		// Numeric reference: unlike &apos; it is also valid in HTML 4.
		case L'\'': dst += L"&#x27;"; break;
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends an HTML-escaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source character array
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void escape(
	_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
	_In_ const _Elem (&src)[_Size])
{
	escape(dst, src, _Size);
}
/// Appends an HTML-escaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source string
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void escape(
	_Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
	escape(dst, src.data(), src.size());
}
/// Appends one narrow character to dst, escaping only what HTML text content requires.
///
/// '&', '<' and '>' (and the no-break space) are replaced by character
/// references; quotes and every other character are appended unchanged.
///
/// \param[in,out] dst  String the result is appended to
/// \param[in]     chr  Character to append
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(_Inout_ std::basic_string<char, _Traits, _Alloc>& dst, _In_ char chr)
{
	switch (chr) {
	case '&': dst += "&amp;"; break;
	case '<': dst += "&lt;"; break;
	case '>': dst += "&gt;"; break;
	// No-break space. Reachable only where plain char is unsigned: a signed
	// char never promotes to 0xa0.
	case 0x00a0: dst += "&nbsp;"; break;
	default: dst += chr; break;
	}
}
/// Appends one wide character to dst, escaping only what HTML text content requires.
///
/// '&', '<', '>' and the no-break space are replaced by character references;
/// quotes and every other character are appended unchanged.
///
/// \param[in,out] dst  String the result is appended to
/// \param[in]     chr  Character to append
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst, _In_ wchar_t chr)
{
	switch (chr) {
	case L'&': dst += L"&amp;"; break;
	case L'<': dst += L"&lt;"; break;
	case L'>': dst += L"&gt;"; break;
	case L'\u00a0': dst += L"&nbsp;"; break;
	default: dst += chr; break;
	}
}
/// Appends a minimally HTML-escaped copy of a narrow string to dst.
///
/// Only '&', '<', '>' (and the no-break space) are replaced by character
/// references; quotes and every other character are appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// No-break space. Reachable only where plain char is unsigned: a signed
		// char never promotes to 0xa0.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a minimally HTML-escaped copy of a wide string to dst.
///
/// Only '&', '<', '>' and the no-break space are replaced by character
/// references; quotes and every other character are appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a minimally HTML-escaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source character array
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void escape_min(
	_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
	_In_ const _Elem (&src)[_Size])
{
	escape_min(dst, src, _Size);
}
/// Appends a minimally HTML-escaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source string
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void escape_min(
	_Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
	escape_min(dst, src.data(), src.size());
}
/// Appends a URL-unescaped copy of a string to dst.
///
/// A '%' followed by two hexadecimal digits is replaced by the octet they
/// encode. A '%' that is not followed by two hexadecimal digits inside the
/// first num_chars characters is appended literally, together with whatever
/// part of the sequence was already consumed.
///
/// \param[in,out] dst        String the unescaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);

	// Value of a hexadecimal digit, or -1 when chr is not one.
	const auto hex_value = [](char chr) -> int {
		if ('0' <= chr && chr <= '9') return chr - '0';
		if ('A' <= chr && chr <= 'F') return chr - 'A' + 10;
		if ('a' <= chr && chr <= 'f') return chr - 'a' + 10;
		return -1;
	};

	for (size_t i = 0; i < num_chars && src[i];) {
		switch (src[i]) {
		case '+':
			// NOTE(review): '+' -> ' ' mirrors url_escape(), which writes "+" for a
			// space; this branch was not visible in the damaged source - confirm.
			dst += ' ';
			++i;
			break;

		case '%': {
			++i;

			// Each digit is bounds-checked before it is read, so a sequence
			// truncated by num_chars never reads past the caller's limit.
			const int hi = i < num_chars ? hex_value(src[i]) : -1;
			if (hi < 0) { dst += '%'; continue; }
			++i;

			const int lo = i < num_chars ? hex_value(src[i]) : -1;
			if (lo < 0) { dst += '%'; dst += src[i - 1]; continue; }
			++i;

			dst += static_cast<char>((hi << 4) | lo);
			break;
		}

		default:
			dst += src[i++];
		}
	}
}
/// Appends a URL-unescaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the unescaped text is appended to
/// \param[in]     src  Source character array
template<size_t _Size, class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_ const char (&src)[_Size])
{
	url_unescape(dst, src, _Size);
}
/// Appends a URL-unescaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the unescaped text is appended to
/// \param[in]     src  Source string
template<class _Traits_dst = std::char_traits<char>, class _Alloc_dst = std::allocator<char>, class _Traits_src = std::char_traits<char>, class _Alloc_src = std::allocator<char>>
inline void url_unescape(
	_Inout_ std::basic_string<char, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<char, _Traits_src, _Alloc_src>& src)
{
	url_unescape(dst, src.data(), src.size());
}
/// Appends a URL-escaped copy of a string to dst.
///
/// A space becomes '+'. The characters < > # % { } | \ ^ ~ [ ] ` ; / ? : @ = & $
/// and every octet outside the range 0x21..0x7e become "%XX" with upper-case
/// hexadecimal digits. All remaining characters are appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);

	// Appends chr as '%' followed by its two upper-case hexadecimal digits.
	const auto percent_encode = [&dst](char chr) {
		static const char hex_digits[] = "0123456789ABCDEF";
		const auto octet = static_cast<uint8_t>(chr);
		dst += '%';
		dst += hex_digits[octet >> 4];
		dst += hex_digits[octet & 0x0f];
	};

	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		const char chr = src[i];
		switch (chr) {
		case ' ':
			dst += '+';
			break;

		// Printable ASCII that must still be escaped.
		case '<': case '>': case '#': case '%': case '{': case '}': case '|':
		case '\\': case '^': case '~': case '[': case ']': case '`': case ';':
		case '/': case '?': case ':': case '@': case '=': case '&': case '$':
			percent_encode(chr);
			break;

		default:
			// Control characters, DEL and octets with the high bit set are escaped;
			// the rest of printable ASCII passes through.
			if (0x20 < static_cast<uint8_t>(chr) && static_cast<uint8_t>(chr) < 0x7f)
				dst += chr;
			else
				percent_encode(chr);
		}
	}
}
/// Appends a URL-escaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source character array
template<size_t _Size, class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_ const char (&src)[_Size])
{
	url_escape(dst, src, _Size);
}
/// Appends a URL-escaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source string
template<class _Traits_dst = std::char_traits<char>, class _Alloc_dst = std::allocator<char>, class _Traits_src = std::char_traits<char>, class _Alloc_src = std::allocator<char>>
inline void url_escape(
	_Inout_ std::basic_string<char, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<char, _Traits_src, _Alloc_src>& src)
{
	url_escape(dst, src.data(), src.size());
}
383 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
384 inline void css_unescape(
385 _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
386 _In_reads_or_z_opt_(num_chars)
const _Elem* src, _In_
size_t num_chars)
388 _Assume_(src || !num_chars);
389 for (
size_t i = 0; i < num_chars && src[i];) {
392 else if (i + 1 < num_chars) {
397 case 'n': dst +=
'\n'; i++;
break;
398 case 'r': dst +=
'\r'; i++;
break;
399 case 't': dst +=
'\t'; i++;
break;
402 case '\n': i++;
break;
420 case 'F':
case 'f': {
422 size_t end = std::min(num_chars, i + 6);
424 for (; i < end; ++i) {
425 if (
'0' <= src[i] && src[i] <=
'9') chr = chr * 0x10 + src[i] -
'0';
426 else if (
'A' <= src[i] && src[i] <=
'F') chr = chr * 0x10 + src[i] -
'A' + 10;
427 else if (
'a' <= src[i] && src[i] <=
'f') chr = chr * 0x10 + src[i] -
'a' + 10;
431 dst +=
static_cast<_Elem
>(chr);
433 if (i < end && src[i] ==
' ') {
440 default: dst += src[i++];
/// Appends a CSS-unescaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the unescaped text is appended to
/// \param[in]     src  Source character array
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_unescape(
	_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
	_In_ const _Elem (&src)[_Size])
{
	css_unescape(dst, src, _Size);
}
/// Appends a CSS-unescaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the unescaped text is appended to
/// \param[in]     src  Source string
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void css_unescape(
	_Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
	css_unescape(dst, src.data(), src.size());
}
/// Appends a CSS-escaped copy of a narrow string to dst.
///
/// Backslash, line feed, carriage return, tab and both quote characters are
/// written as backslash escapes; every other character is appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void css_escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '\\': dst += "\\\\"; break;
		case '\n': dst += "\\n"; break;
		case '\r': dst += "\\r"; break;
		case '\t': dst += "\\t"; break;
		case '\"': dst += "\\\""; break;
		case '\'': dst += "\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a CSS-escaped copy of a wide string to dst.
///
/// Backslash, line feed, carriage return, tab and both quote characters are
/// written as backslash escapes; every other character is appended unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be nullptr only when num_chars is 0
/// \param[in]     num_chars  Maximum number of characters to read from src; reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void css_escape(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'\\': dst += L"\\\\"; break;
		case L'\n': dst += L"\\n"; break;
		case L'\r': dst += L"\\r"; break;
		case L'\t': dst += L"\\t"; break;
		case L'\"': dst += L"\\\""; break;
		case L'\'': dst += L"\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a CSS-escaped copy of a character array to dst.
///
/// Forwards to the pointer overload with the array extent as the length limit.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source character array
template<class _Elem, size_t _Size, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_escape(
	_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
	_In_ const _Elem (&src)[_Size])
{
	css_escape(dst, src, _Size);
}
/// Appends a CSS-escaped copy of a string object to dst.
///
/// Forwards src.data() and src.size() to the pointer overload.
///
/// \param[in,out] dst  String the escaped text is appended to
/// \param[in]     src  Source string
template<class _Elem, class _Traits_dst = std::char_traits<_Elem>, class _Alloc_dst = std::allocator<_Elem>, class _Traits_src = std::char_traits<_Elem>, class _Alloc_src = std::allocator<_Elem>>
inline void css_escape(
	_Inout_ std::basic_string<_Elem, _Traits_dst, _Alloc_dst>& dst,
	_In_ const std::basic_string<_Elem, _Traits_src, _Alloc_src>& src)
{
	css_escape(dst, src.data(), src.size());
}
557 enum class element_t {
673 enum class element_span_t {
689 static inline element_span_t
span(_In_ element_t code)
691 static element_span_t lookup[] = {
692 element_span_t::needs_end,
693 element_span_t::needs_end,
694 element_span_t::needs_end,
695 element_span_t::needs_end,
696 element_span_t::needs_end,
697 element_span_t::immediate,
698 element_span_t::needs_end,
699 element_span_t::immediate,
700 element_span_t::immediate,
701 element_span_t::needs_end,
702 element_span_t::immediate,
703 element_span_t::needs_end,
704 element_span_t::needs_end,
705 element_span_t::needs_end,
706 element_span_t::end_optional,
707 element_span_t::immediate,
708 element_span_t::needs_end,
709 element_span_t::needs_end,
710 element_span_t::needs_end,
711 element_span_t::needs_end,
712 element_span_t::needs_end,
713 element_span_t::immediate,
714 element_span_t::end_optional,
715 element_span_t::needs_end,
716 element_span_t::end_optional,
717 element_span_t::needs_end,
718 element_span_t::needs_end,
719 element_span_t::needs_end,
720 element_span_t::needs_end,
721 element_span_t::needs_end,
722 element_span_t::end_optional,
723 element_span_t::needs_end,
724 element_span_t::immediate,
725 element_span_t::needs_end,
726 element_span_t::needs_end,
727 element_span_t::needs_end,
728 element_span_t::immediate,
729 element_span_t::needs_end,
730 element_span_t::needs_end,
731 element_span_t::needs_end,
732 element_span_t::needs_end,
733 element_span_t::needs_end,
734 element_span_t::needs_end,
735 element_span_t::needs_end,
736 element_span_t::end_optional,
737 element_span_t::immediate,
738 element_span_t::end_optional,
739 element_span_t::needs_end,
740 element_span_t::needs_end,
741 element_span_t::immediate,
742 element_span_t::immediate,
743 element_span_t::needs_end,
744 element_span_t::immediate,
745 element_span_t::needs_end,
746 element_span_t::needs_end,
747 element_span_t::needs_end,
748 element_span_t::end_optional,
749 element_span_t::immediate,
750 element_span_t::needs_end,
751 element_span_t::needs_end,
752 element_span_t::needs_end,
753 element_span_t::needs_end,
754 element_span_t::immediate,
755 element_span_t::immediate,
756 element_span_t::needs_end,
757 element_span_t::needs_end,
758 element_span_t::needs_end,
759 element_span_t::needs_end,
760 element_span_t::needs_end,
761 element_span_t::needs_end,
762 element_span_t::needs_end,
763 element_span_t::end_optional,
764 element_span_t::end_optional,
765 element_span_t::immediate,
766 element_span_t::end_optional,
767 element_span_t::needs_end,
768 element_span_t::needs_end,
769 element_span_t::immediate,
770 element_span_t::needs_end,
771 element_span_t::needs_end,
772 element_span_t::needs_end,
773 element_span_t::needs_end,
774 element_span_t::needs_end,
775 element_span_t::needs_end,
776 element_span_t::needs_end,
777 element_span_t::needs_end,
778 element_span_t::needs_end,
779 element_span_t::needs_end,
780 element_span_t::needs_end,
781 element_span_t::needs_end,
782 element_span_t::needs_end,
783 element_span_t::end_optional,
784 element_span_t::end_optional,
785 element_span_t::needs_end,
786 element_span_t::end_optional,
787 element_span_t::end_optional,
788 element_span_t::end_optional,
789 element_span_t::needs_end,
790 element_span_t::end_optional,
791 element_span_t::needs_end,
792 element_span_t::needs_end,
793 element_span_t::needs_end,
794 element_span_t::needs_end,
795 element_span_t::immediate,
796 element_span_t::needs_end,
798 return element_t::a <= code && code <= element_t::xmp ?
799 lookup[
static_cast<size_t>(code) -
static_cast<size_t>(element_t::a)] :
800 element_span_t::needs_end;
816 case element_t::strike:
817 case element_t::blink:
819 case element_t::small:
834 case element_t::strong:
836 case element_t::code:
837 case element_t::samp:
840 case element_t::cite:
841 case element_t::abbr:
842 case element_t::acronym:
859 case element_t::applet:
860 case element_t::object:
861 case element_t::embed:
862 case element_t::font:
863 case element_t::basefont:
867 case element_t::script:
872 case element_t::ruby:
873 case element_t::span:
875 case element_t::iframe:
876 case element_t::nobr:
890 case element_t::input:
891 case element_t::select:
892 case element_t::textarea:
893 case element_t::label:
894 case element_t::button:
908 code == element_t::PCDATA ||
939 static inline bool is_list(_In_ element_t code)
945 case element_t::menu:
960 case element_t::listing:
980 case element_t::center:
981 case element_t::marquee:
982 case element_t::noscript:
983 case element_t::noframes:
984 case element_t::noembed:
985 case element_t::blockquote:
986 case element_t::form:
987 case element_t::isindex:
989 case element_t::table:
990 case element_t::fieldset:
991 case element_t::address:
1015 case element_t::title:
1016 case element_t::isindex:
1017 case element_t::base:
1018 case element_t::nextid:
1032 case element_t::script:
1033 case element_t::style:
1034 case element_t::meta:
1035 case element_t::link:
1036 case element_t::object:
1050 case element_t::img:
1051 case element_t::object:
1052 case element_t::applet:
1053 case element_t::embed:
1054 case element_t::big:
1055 case element_t::small:
1056 case element_t::sub:
1057 case element_t::sup:
1058 case element_t::ruby:
1059 case element_t::font:
1060 case element_t::basefont:
1061 case element_t::nobr:
1075 case element_t::head:
1076 case element_t::body:
1077 case element_t::frameset:
1094 case element_t::col:
1095 case element_t::colgroup:
1097 case element_t::dir:
1099 case element_t::frame:
1100 case element_t::iframe:
1101 case element_t::legend:
1118 static inline bool may_contain(_In_ element_t parent, _In_ element_t child)
1120 if (child == element_t::unknown || child == element_t::comment)
1128 case element_t::a:
return is_inline(child) && child != element_t::a;
1129 case element_t::address:
return is_inline(child) || child == element_t::p;
1130 case element_t::applet:
return is_flow(child) || child == element_t::param;
1131 case element_t::area:
return false;
1132 case element_t::base:
return false;
1133 case element_t::basefont:
return false;
1134 case element_t::bdo:
return is_inline(child);
1135 case element_t::blockquote:
return is_flow(child);
1136 case element_t::body:
return is_flow(child) || child == element_t::ins || child == element_t::del;
1137 case element_t::br:
return false;
1138 case element_t::button:
return is_flow(child) && !
is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1139 case element_t::caption:
return is_inline(child);
1140 case element_t::center:
return is_flow(child);
1141 case element_t::col:
return false;
1142 case element_t::colgroup:
return child == element_t::col;
1143 case element_t::comment:
return child == element_t::CDATA;
1144 case element_t::dd:
return is_flow(child);
1145 case element_t::del:
return is_flow(child);
1146 case element_t::dir:
return child == element_t::li;
1147 case element_t::div:
return is_flow(child);
1148 case element_t::dl:
return child == element_t::dt || child == element_t::dd;
1149 case element_t::dt:
return is_inline(child);
1150 case element_t::embed:
return is_flow(child) || child == element_t::param;
1151 case element_t::fieldset:
return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1152 case element_t::font:
return is_inline(child);
1153 case element_t::form:
return is_flow(child) && child != element_t::form;
1154 case element_t::frame:
return false;
1155 case element_t::frameset:
return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1157 case element_t::hr:
return false;
1159 case element_t::iframe:
return is_flow(child);
1160 case element_t::img:
return false;
1161 case element_t::input:
return false;
1162 case element_t::ins:
return is_flow(child);
1163 case element_t::isindex:
return false;
1164 case element_t::label:
return is_inline(child) && child != element_t::label;
1165 case element_t::legend:
return is_inline(child);
1166 case element_t::li:
return is_flow(child);
1167 case element_t::link:
return false;
1168 case element_t::listing:
return child == element_t::CDATA;
1169 case element_t::map:
return is_block(child) || child == element_t::area;
1170 case element_t::marquee:
return is_flow(child);
1171 case element_t::menu:
return child == element_t::li;
1172 case element_t::meta:
return false;
1173 case element_t::nobr:
return is_inline(child) || child == element_t::wbr;
1174 case element_t::noframes:
return (
is_flow(child) || child == element_t::body) && child != element_t::noframes;
1175 case element_t::noscript:
return is_flow(child);
1176 case element_t::noembed:
return is_flow(child);
1177 case element_t::object:
return is_flow(child) || child == element_t::param;
1178 case element_t::ol:
return child == element_t::li;
1179 case element_t::optgroup:
return child == element_t::option;
1180 case element_t::option:
return child == element_t::PCDATA;
1181 case element_t::p:
return is_inline(child);
1182 case element_t::param:
return false;
1183 case element_t::plaintext:
return is_flow(child);
1185 case element_t::q:
return is_inline(child);
1186 case element_t::rt:
return false;
1187 case element_t::ruby:
return is_inline(child);
1188 case element_t::script:
return child == element_t::CDATA;
1189 case element_t::select:
return child == element_t::optgroup || child == element_t::option;
1190 case element_t::span:
return is_inline(child);
1191 case element_t::style:
return child == element_t::CDATA;
1192 case element_t::sub:
return is_inline(child);
1193 case element_t::sup:
return is_inline(child);
1194 case element_t::table:
return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1195 case element_t::tbody:
return child == element_t::tr;
1196 case element_t::td:
return is_flow(child);
1197 case element_t::textarea:
return child == element_t::PCDATA;
1198 case element_t::tfoot:
return child == element_t::tr;
1199 case element_t::th:
return is_flow(child);
1200 case element_t::thead:
return child == element_t::tr;
1201 case element_t::title:
return child == element_t::PCDATA;
1202 case element_t::tr:
return child == element_t::td || child == element_t::th;
1203 case element_t::ul:
return child == element_t::li;
1204 case element_t::wbr:
return false;
1205 case element_t::unknown:
return true;
1218 static inline bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars)
const T* attr_name, _In_
size_t num_chars)
1220 _Assume_(attr_name || !num_chars);
1222 case element_t::a:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1223 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1224 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1225 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1226 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1227 case element_t::base:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1228 case element_t::bgsound:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1229 case element_t::blockquote:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1230 case element_t::body:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1231 case element_t::comment:
return !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX);
1232 case element_t::del:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1233 case element_t::embed:
return !stdex::strnicmp(attr_name, num_chars,
"pluginspage", SIZE_MAX) ||
1234 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1235 case element_t::form:
return !stdex::strnicmp(attr_name, num_chars,
"action", SIZE_MAX);
1236 case element_t::frame:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1237 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1238 case element_t::head:
return !stdex::strnicmp(attr_name, num_chars,
"profile", SIZE_MAX);
1239 case element_t::iframe:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1240 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1241 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1242 !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1243 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1244 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1245 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1246 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1247 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1248 case element_t::ins:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1249 case element_t::link:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1250 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"basehref", SIZE_MAX) ||
1251 !stdex::strnicmp(attr_name, num_chars,
"classid", SIZE_MAX) ||
1252 !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1253 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1254 !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1256 case element_t::q:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1257 case element_t::script:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1258 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1259 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1260 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1273 static inline bool is_localizable(element_t code,
const T* attr_name,
size_t num_chars)
1275 _Assume_(attr_name || !num_chars);
1276 if (!stdex::strnicmp(attr_name, num_chars,
"title", SIZE_MAX))
1279 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1280 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1281 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1282 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1283 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1284 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"summary", SIZE_MAX);
1285 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1286 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1293 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1301 stdex::parser::html_sequence_t
type;
1305 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_
size_t start = 0,
size_t end = 0, _In_opt_
sequence* _parent =
nullptr) :
1322 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1323 code(element_code(src + tag.name.start, tag.name.size())),
1324 name(std::move(tag.name)),
1329 static element_t element_code(_In_reads_z_(num_chars)
const T*
name,
size_t num_chars)
1331 static const struct {
1335 {
"a", element_t::a, },
1336 {
"abbr", element_t::abbr, },
1337 {
"acronym", element_t::acronym, },
1338 {
"address", element_t::address, },
1339 {
"applet", element_t::applet, },
1340 {
"area", element_t::area, },
1341 {
"b", element_t::b, },
1342 {
"base", element_t::base, },
1343 {
"basefont", element_t::basefont, },
1344 {
"bdo", element_t::bdo, },
1345 {
"bgsound", element_t::bgsound, },
1346 {
"big", element_t::big, },
1347 {
"blink", element_t::blink, },
1348 {
"blockquote", element_t::blockquote, },
1349 {
"body", element_t::body, },
1350 {
"br", element_t::br, },
1351 {
"button", element_t::button, },
1352 {
"caption", element_t::caption, },
1353 {
"center", element_t::center, },
1354 {
"cite", element_t::cite, },
1355 {
"code", element_t::code, },
1356 {
"col", element_t::col, },
1357 {
"colgroup", element_t::colgroup, },
1358 {
"comment", element_t::comment, },
1359 {
"dd", element_t::dd, },
1360 {
"del", element_t::del, },
1361 {
"dfn", element_t::dfn, },
1362 {
"dir", element_t::dir, },
1363 {
"div", element_t::div, },
1364 {
"dl", element_t::dl, },
1365 {
"dt", element_t::dt, },
1366 {
"em", element_t::em, },
1367 {
"embed", element_t::embed, },
1368 {
"fieldset", element_t::fieldset, },
1369 {
"font", element_t::font, },
1370 {
"form", element_t::form, },
1371 {
"frame", element_t::frame, },
1372 {
"frameset", element_t::frameset, },
1373 {
"h1", element_t::h1, },
1374 {
"h2", element_t::h2, },
1375 {
"h3", element_t::h3, },
1376 {
"h4", element_t::h4, },
1377 {
"h5", element_t::h5, },
1378 {
"h6", element_t::h6, },
1379 {
"head", element_t::head, },
1380 {
"hr", element_t::hr, },
1381 {
"html", element_t::html, },
1382 {
"i", element_t::i, },
1383 {
"iframe", element_t::iframe, },
1384 {
"img", element_t::img, },
1385 {
"input", element_t::input, },
1386 {
"ins", element_t::ins, },
1387 {
"isindex", element_t::isindex, },
1388 {
"kbd", element_t::kbd, },
1389 {
"label", element_t::label, },
1390 {
"legend", element_t::legend, },
1391 {
"li", element_t::li, },
1392 {
"link", element_t::link, },
1393 {
"listing", element_t::listing, },
1394 {
"map", element_t::map, },
1395 {
"marquee", element_t::marquee, },
1396 {
"menu", element_t::menu, },
1397 {
"meta", element_t::meta, },
1398 {
"nextid", element_t::nextid, },
1399 {
"nobr", element_t::nobr, },
1400 {
"noembed", element_t::noembed, },
1401 {
"noframes", element_t::noframes, },
1402 {
"noscript", element_t::noscript, },
1403 {
"object", element_t::object, },
1404 {
"ol", element_t::ol, },
1405 {
"optgroup", element_t::optgroup, },
1406 {
"option", element_t::option, },
1407 {
"p", element_t::p, },
1408 {
"param", element_t::param, },
1409 {
"plaintext", element_t::plaintext, },
1410 {
"pre", element_t::pre, },
1411 {
"q", element_t::q, },
1412 {
"rt", element_t::rt, },
1413 {
"ruby", element_t::ruby, },
1414 {
"s", element_t::s, },
1415 {
"samp", element_t::samp, },
1416 {
"script", element_t::script, },
1417 {
"select", element_t::select, },
1418 {
"small", element_t::small, },
1419 {
"span", element_t::span, },
1420 {
"strike", element_t::strike, },
1421 {
"strong", element_t::strong, },
1422 {
"style", element_t::style, },
1423 {
"sub", element_t::sub, },
1424 {
"sup", element_t::sup, },
1425 {
"table", element_t::table, },
1426 {
"tbody", element_t::tbody, },
1427 {
"td", element_t::td, },
1428 {
"textarea", element_t::textarea, },
1429 {
"tfoot", element_t::tfoot, },
1430 {
"th", element_t::th, },
1431 {
"thead", element_t::thead, },
1432 {
"title", element_t::title, },
1433 {
"tr", element_t::tr, },
1434 {
"tt", element_t::tt, },
1435 {
"u", element_t::u, },
1436 {
"ul", element_t::ul, },
1437 {
"var", element_t::var, },
1438 {
"wbr", element_t::wbr, },
1439 {
"xmp", element_t::xmp, },
1443 for (
size_t i = 1; i < _countof(
mapping); i++)
1445 for (
size_t i = 0; i < _countof(
mapping); i++) {
1446 for (
size_t j = 0;
mapping[i].name[j]; j++)
1450 for (
size_t i = 0, j = _countof(
mapping); i < j; ) {
1451 size_t m = (i + j) / 2;
1453 for (
size_t i1 = 0, i2 = 0;;) {
1455 r = i2 >= num_chars || !
name[i2] ? 0 : -1;
1458 if (i2 >= num_chars || !
name[i2]) {
1463 auto chr =
static_cast<char>(stdex::tolower(
name[i2++]));
1482 return element_t::unknown;
1517 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1518 code(element::element_code(src + tag.name.start, tag.name.size())),
1519 name(std::move(tag.name)),
1537 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1538 name(std::move(tag.name)),
1555 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1571 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1582 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1586 std::basic_string<_Elem, _Traits, _Alloc>
value;
1592 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1598 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1640 void append(_In_reads_or_z_opt_(num_chars)
const _Elem*
source, _In_
size_t num_chars)
1642 _Assume_(
source || !num_chars);
1649 if (m_condition_end.match(
source, i, num_chars)) {
1651 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1662 if (m_condition_end.match(
source, i, num_chars)) {
1679 if (m_condition_start.match(
source, i, num_chars)) {
1681 if (!stdex::strcmp(condition_src.c_str(),
"CDATA"))
1683 else if (!stdex::strcmp(condition_src.c_str(),
"RCDATA"))
1687 else if (!stdex::strcmp(condition_src.c_str(),
"IGNORE"))
1699 if (m_tag.match(
source, i, num_chars) &&
1700 m_tag.
type == stdex::parser::html_sequence_t::element_end &&
1706 std::unique_ptr<element_end> e(
new element_end(std::move(m_tag),
source, parent->parent, parent));
1707 parent->end = e.get();
1716 if (m_tag.match(
source, i, num_chars)) {
1721 switch (m_tag.
type) {
1722 case stdex::parser::html_sequence_t::element:
1723 case stdex::parser::html_sequence_t::element_start: {
1724 std::unique_ptr<element> e(
1725 m_tag.
type == stdex::parser::html_sequence_t::element ?
new element(std::move(m_tag),
source) :
1732 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1734 e->parent = starting_tag;
1737 e->parent = starting_tag->parent;
1738 starting_tag->end = e.get();
1742 if (e->type == stdex::parser::html_sequence_t::element_start) {
1745 e_start->
end = e.get();
1749 case element_t::code:
1750 case element_t::comment:
1751 case element_t::script:
1752 case element_t::style:
1759 if (e->code == element_t::meta &&
m_charset == stdex::charset_id::system) {
1760 bool is_content_type =
false;
1762 for (
auto& attr : e->attributes) {
1763 if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"http-equiv", SIZE_MAX) &&
1764 !stdex::strnicmp(
source + attr.value.start, attr.value.size(),
"content-type", SIZE_MAX))
1765 is_content_type =
true;
1766 else if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"content", SIZE_MAX))
1767 content_attr = &attr;
1769 if (is_content_type && content_attr) {
1776 str.reserve(content.charset.size());
1777 for (
size_t j = content.charset.start; j < content.charset.end; ++j)
1778 str.push_back(
static_cast<char>(
source[j]));
1779 m_charset = stdex::charset_from_name(str.c_str());
1787 case stdex::parser::html_sequence_t::element_end: {
1792 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1793 if (starting_tag->code == e->code ||
1794 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(
source + starting_tag->name.start, starting_tag->name.size(),
source + e->name.start, e->name.size()))
1796 e->start = starting_tag;
1797 e->parent = starting_tag->parent;
1798 starting_tag->end = e.get();
1807 case stdex::parser::html_sequence_t::declaration:
1825 case stdex::parser::html_sequence_t::comment:
1828 case stdex::parser::html_sequence_t::instruction:
1832 throw std::invalid_argument(
"unknown tag type");
1839 if (m_any_char.match(
source, i, num_chars)) {
1863 inline void assign(_In_reads_or_z_opt_(num_chars)
const _Elem*
source, _In_
size_t num_chars)
1873 inline const std::basic_string<_Elem, _Traits, _Alloc>&
source()
const {
return m_source; }
1875 friend class parser<_Elem, _Traits, _Alloc>;
1889 std::basic_string<_Elem, _Traits, _Alloc>
replace_entities(_In_reads_or_z_opt_(num_chars)
const _Elem* input, _In_
size_t num_chars)
const
1891 _Assume_(input || !num_chars);
1892 const size_t num_entities =
m_entities.size();
1894 std::basic_string<_Elem, _Traits, _Alloc> output;
1895 for (
size_t i = 0; i < num_chars && input[i];) {
1896 if (input[i] ==
'%') {
1897 for (
size_t j = 0; j < num_entities; j++) {
1899 size_t entity_size = e->name.size();
1900 if (i + entity_size + 1 < num_chars &&
1901 !stdex::strncmp(input + i + 1,
source + e->name.start, entity_size) &&
1902 input[i + entity_size + 1] ==
';')
1905 i += entity_size + 2;
1909 throw std::runtime_error(
"undefined entity");
1911 output += input[i++];
1930 std::vector<std::unique_ptr<entity<_Elem, _Traits, _Alloc>>>
m_entities;
1942 enum class token_t {
1953 constexpr size_t token_tag_max =
1962 constexpr char token_tag_start =
'\x12';
1968 constexpr char token_tag_end =
'\x13';
1976 inline token(_In_ token_t _type = token_t::root, _In_opt_
sequence* _sequence =
nullptr, _In_ uintptr_t _data = 0) :
1982 template<
class _Elem,
class _Traits,
class _Alloc>
1995 template<
class _Traits = std::
char_traits<
char>,
class _Alloc = std::allocator<
char>>
1996 inline size_t append_tag(_Inout_ std::basic_string<char, _Traits, _Alloc>& str)
const
1998 size_t n = str.size();
2000 stdex::appendf(str,
"%c%zX%c", stdex::locale_C.get(), token_tag_start,
reinterpret_cast<uintptr_t
>(
this), token_tag_end);
2001 return str.size() - n;
2011 template<
class _Traits = std::
char_traits<
wchar_t>,
class _Alloc = std::allocator<
wchar_t>>
2012 inline size_t append_tag(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& str)
const
2015 return stdex::appendf(str, L
"%c%zX%c", stdex::locale_C.get(),
static_cast<wchar_t>(token_tag_start),
reinterpret_cast<uintptr_t
>(
this),
static_cast<wchar_t>(token_tag_end));
2019 static inline token* parse_tag(
const T* str,
size_t& offset)
2021 if (str[offset] !=
static_cast<T
>(token_tag_start))
2026 for (end = offset + 1; ; end++) {
2029 if (str[end] == token_tag_end)
2034 token* t =
reinterpret_cast<token*
>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1,
nullptr, 16));
2036 throw std::invalid_argument(
"null token");
2047 using token_vector = std::vector<std::unique_ptr<token>>;
2048 using token_list = std::list<token*>;
2053 enum text_type_flag_t : uint32_t {
2054 has_tokens = 1 << 0,
2063 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
2068 _In_ token_t
type = token_t::complete,
2069 _In_reads_or_z_opt_(num_chars)
const _Elem* _text =
nullptr, _In_
size_t num_chars = 0,
2070 _In_ uint32_t _text_type = 0,
2073 text(_text, num_chars),
2077 friend class parser<_Elem, _Traits, _Alloc>;
2080 std::basic_string<_Elem, _Traits, _Alloc>
text;
2088 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
2093 _In_reads_or_z_opt_(num_chars_text)
const _Elem* _text =
nullptr, _In_
size_t num_chars_text = 0,
2094 _In_reads_or_z_opt_(num_chars_name)
const _Elem* _name =
nullptr, _In_
size_t num_chars_name = 0,
2098 _In_ uintptr_t
data = 0) :
2100 name(_name, num_chars_name),
2104 friend class parser<_Elem, _Traits, _Alloc>;
2107 std::basic_string<_Elem, _Traits, _Alloc>
name;
2114 enum class token_url_t {
2123 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
2128 _In_reads_or_z_opt_(num_chars)
const _Elem* _url =
nullptr, _In_
size_t num_chars = 0,
2129 token_url_t _encoding = token_url_t::plain,
2132 url(_url, num_chars),
2136 friend class parser<_Elem, _Traits, _Alloc>;
2139 std::basic_string<_Elem, _Traits, _Alloc>
url;
2153 using inserted_token_list = std::list<inserted_token>;
2155 template<
class _Elem,
class _Traits,
class _Alloc>
2161 _In_reads_or_z_opt_(num_chars)
const stdex::schar_t* url =
nullptr, _In_
size_t num_chars = 0,
2164 m_url(url, stdex::strnlen(url, num_chars)),
2197 t->type == token_t::complete ||
2198 t->type == token_t::starting ||
2199 t->type == token_t::ending ||
2200 t->type == token_t::root);
2202 if (t->text_type & has_tokens) {
2203 const _Elem* root = t->text.data();
2204 for (
size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2205 _Assume_(root[i] != token_tag_end);
2206 const token* t2 = token::parse_tag(root, i);
2209 case token_t::complete:
2210 case token_t::starting:
2211 case token_t::ending:
2215 case token_t::url: {
2217 switch (t2_url->encoding) {
2218 case token_url_t::plain:
2219 source += t2_url->
url;
2221 case token_url_t::sgml:
2222 escape(source, t2_url->url.data(), t2_url->url.size());
2224 case token_url_t::css:
2225 css_escape(source, t2_url->url.data(), t2_url->url.size());
2228 throw std::invalid_argument(
"unsupported URL encoding");
2233 throw std::invalid_argument(
"unsupported token type");
2236 else if (t->text_type & has_text) {
2237 escape_min(source, root[i]);
2241 source += root[i++];
2244 else if (t->text_type & has_text) {
2246 escape_min(source, t->text.data(), t->text.size());
2260 static void start_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens, _In_ token_list::const_iterator from)
2262 for (; from != new_tokens.cend(); ++from) {
2264 t->append_tag(source);
2265 active_tokens.push_back(t);
2278 token_list::const_iterator
end_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens)
2281 token_list::const_iterator i1, i2;
2282 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2283 if (i2 == new_tokens.cend() || *i1 != *i2) {
2286 for (
auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2288 _Assume_(t1 && t1->type == token_t::starting);
2291 t2->text.reserve(t1->name.size() + 3);
2294 t2->text += t1->name;
2300 active_tokens.erase(i);
2303 active_tokens.erase(i);
2304 i = active_tokens.cend();
2321 void append_inserted_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ inserted_token_list& inserted_tokens,
2322 _In_
size_t word_index, _In_
bool after_word,
2323 _Inout_ token_list& active_tokens)
2325 for (
auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2328 if (t.word_index == word_index && t.after_word == after_word) {
2329 if (t.token->type != token_t::ending)
2330 start_tokens(source, active_tokens, t.active_tokens,
end_tokens(source, active_tokens, t.active_tokens));
2331 t.token->append_tag(source);
2332 inserted_tokens.erase(i++);
2345 static void merge(_Inout_ token_list& a, _In_
const token_list& b)
2347 for (
auto i2 = b.begin(); i2 != b.end(); ++i2) {
2349 for (
auto i1 = a.begin(); i1 != a.end(); ++i1) {
2350 if (i1 == a.end()) {
2366 _Unreferenced_(rel);
2392 auto t =
token.get();
2406 inline size_t append_token(_Inout_ std::unique_ptr<T>&&
token, _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source)
2443 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<_Elem
>(token_tag_start)) == stdex::npos &&
2444 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<_Elem
>(token_tag_end)) == stdex::npos);
2446 if (s->type == stdex::parser::html_sequence_t::text) {
2447 rel.from = s->interval.start;
2448 token->mapping.push_back(rel);
2449 stdex::sgml2strcat(
token->text,
m_source + s->interval.start, s->interval.size(), 0, rel, &
token->mapping);
2450 rel.to =
token->text.size();
2451 if (!(
token->text_type & has_text) &&
2452 !stdex::isblank(
m_source + s->interval.start, s->interval.size()))
2453 token->text_type |= has_text;
2456 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2459 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ?
static_cast<const element_start*
>(s.get()) :
nullptr;
2461 throw std::invalid_argument(
"<frameset> detected");
2464 size_t offset = s->interval.start;
2465 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t(s->type == stdex::parser::html_sequence_t::element ||
element_traits::span(s_el_start->
code) == element_span_t::immediate ?
2471 if (a.value.empty() ||
2472 stdex::isblank(
m_source + a.value.start, a.value.size()))
2476 t->text.append(
m_source + offset, a.value.start - offset);
2481 stdex::sgml2strcat(t_url->url,
m_source + a.value.start, a.value.size());
2483 t->text_type |= has_tokens;
2484 offset = a.value.end;
2487 t->text.append(
m_source + offset, a.value.start - offset);
2491 has_text | is_title,
2494 t_value->mapping.push_back(rel_value);
2495 stdex::sgml2strcat(t_value->text,
m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->
mapping);
2497 t->text_type |= has_tokens;
2498 offset = a.value.end;
2502 t->text.append(
m_source + offset, s->interval.end - offset);
2503 rel.from = s->interval.start;
2504 token->mapping.push_back(rel);
2506 token->text_type |= has_tokens;
2511 if (s_el_start->
code == element_t::address ||
2512 s_el_start->
code == element_t::code ||
2513 s_el_start->
code == element_t::comment ||
2514 s_el_start->
code == element_t::cite ||
2515 s_el_start->
code == element_t::kbd ||
2516 s_el_start->
code == element_t::samp ||
2517 s_el_start->
code == element_t::script ||
2518 s_el_start->
code == element_t::style)
2521 auto s_end = s_el_start->
end;
2524 if (s->interval.end < s_end->interval.start) {
2525 if (s_el_start->
code != element_t::style) {
2526 rel.from = s->interval.start;
2527 token->mapping.push_back(rel);
2531 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2538 auto t =
parse_css(s->interval.end, s_end->interval.start);
2540 rel.from = s->interval.start;
2541 token->mapping.push_back(rel);
2542 rel.to += t->append_tag(
token->text);
2544 token->text_type |= has_tokens;
2551 while (limit != end && limit->get() != s_el_start->
end)
2553 auto t =
parse(limit,
2556 rel.from = s->interval.start;
2557 token->mapping.push_back(rel);
2558 rel.to += t->append_tag(
token->text);
2559 token->text_type |= has_tokens;
2563 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2564 rel.from = s->interval.start;
2565 token->mapping.push_back(rel);
2569 m_source + s->interval.start, s->interval.size(),
2573 token->text_type |= has_tokens;
2578 rel.from = s->interval.start;
2579 token->mapping.push_back(rel);
2583 m_source + s->interval.start, s->interval.size(),
2587 token->text_type |= has_tokens;
2601 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>
token(
2609 if (m_css_comment.match(
m_source, start, end)) {
2613 else if (m_css_cdo.match(
m_source, start, end)) {
2617 else if (m_css_cdc.match(
m_source, start, end)) {
2622 m_css_import.match(
m_source, start, end) && (section = m_css_import.
interval, content = m_css_import.
content,
true) ||
2625 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(
2634 token->text_type |= has_tokens;
2635 start = section.
end;
2637 else if (m_any_char.match(
m_source, start, end)) {
HTML declaration.
Definition html.hpp:1533
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1543
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1544
HTML document.
Definition html.hpp:1600
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1935
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1851
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1920
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1926
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1936
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1934
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1924
std::vector< std::unique_ptr< entity< _Elem, _Traits, _Alloc > > > m_entities
Array of entities.
Definition html.hpp:1930
std::basic_string< _Elem, _Traits, _Alloc > m_source
Document HTML source code.
Definition html.hpp:1918
void assign(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1863
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1881
std::basic_string< _Elem, _Traits, _Alloc > replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1889
void clear()
Empties document.
Definition html.hpp:1619
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1923
const std::basic_string< _Elem, _Traits, _Alloc > & source() const
Returns document HTML source code.
Definition html.hpp:1873
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1919
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1925
void append(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1640
Ending tag of an HTML element </...>
Definition html.hpp:1513
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1525
element_start * start
Corresponding starting tag.
Definition html.hpp:1526
element_t code
Element code.
Definition html.hpp:1524
Starting tag of an HTML element <...>
Definition html.hpp:1497
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1506
HTML element <.../>
Definition html.hpp:1318
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1487
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1488
element_t code
Element code.
Definition html.hpp:1486
HTML instruction.
Definition html.hpp:1567
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1576
HTML parser.
Definition html.hpp:2157
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2652
text_token< _Elem, _Traits, _Alloc > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2598
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2345
token_list::const_iterator end_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2278
static void link(std::basic_string< _Elem, _Traits, _Alloc > &source, const text_token< _Elem, _Traits, _Alloc > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2193
text_token< _Elem, _Traits, _Alloc > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2423
const _Elem * m_source
HTML source code.
Definition html.hpp:2653
token_vector m_tokens
HTML token storage.
Definition html.hpp:2654
text_token< _Elem, _Traits, _Alloc > * parse()
Parses HTML document.
Definition html.hpp:2173
const document< _Elem, _Traits, _Alloc > & m_document
Document being analyzed.
Definition html.hpp:2649
void make_absolute_url(std::basic_string< _Elem, _Traits, _Alloc > &rel)
Converts URL to absolute.
Definition html.hpp:2364
size_t append_token(std::unique_ptr< T > &&token, std::basic_string< _Elem, _Traits, _Alloc > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2406
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2377
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2650
const bool m_parse_frames
Parse frames.
Definition html.hpp:2651
static void start_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2260
T * append_token(std::unique_ptr< T > &&token)
Adds token to the collection.
Definition html.hpp:2388
void append_inserted_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2321
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2655
Base class for HTML sequences.
Definition html.hpp:1299
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1302
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1301
sequence * parent
Parent sequence.
Definition html.hpp:1303
Token representing start HTML tag.
Definition html.hpp:2090
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2108
std::basic_string< _Elem, _Traits, _Alloc > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2107
Token representing part of HTML text.
Definition html.hpp:2065
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2082
std::basic_string< _Elem, _Traits, _Alloc > text
Token text.
Definition html.hpp:2080
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2081
HTML token base class.
Definition html.hpp:1974
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2043
size_t append_tag(std::basic_string< char, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1996
uintptr_t data
Any user-supplied data.
Definition html.hpp:2044
token_t type
Token type.
Definition html.hpp:2042
size_t append_tag(std::basic_string< wchar_t, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:2012
HTTP token representing an URL.
Definition html.hpp:2125
token_url_t encoding
URL encoding.
Definition html.hpp:2140
std::basic_string< _Elem, _Traits, _Alloc > url
URL.
Definition html.hpp:2139
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7833
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7748
std::vector< html_attribute > attributes
tag attributes
Definition parser.hpp:8355
html_sequence_t type
tag type
Definition parser.hpp:8353
stdex::interval< size_t > name
tag name position in source
Definition parser.hpp:8354
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:172
Test for given string.
Definition parser.hpp:818
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:683
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1088
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1002
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:920
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1012
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:808
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:971
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1029
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:939
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is a URI.
Definition html.hpp:1218
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:956
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1273
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:854
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1047
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:905
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1072
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:887
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:830
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1118
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:689
HTML entity.
Definition html.hpp:1584
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1585
std::basic_string< _Elem, _Traits, _Alloc > value
Entity value.
Definition html.hpp:1586
Inserted HTML token.
Definition html.hpp:2146
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2150
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2148
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:2149
token * token
Points to the token.
Definition html.hpp:2147
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
mapping()
Constructs a zero to zero mapping.
Definition mapping.hpp:24
Tag attribute.
Definition parser.hpp:8127
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8129