9#include "exception.hpp"
10#include "interval.hpp"
13#include "progress.hpp"
/// Appends an HTML-escaped copy of a narrow string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<` and `>` are replaced with HTML character references;
/// every other character is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case ';': dst += "&semi;"; break;
		case '\"': dst += "&quot;"; break;
		case '\'': dst += "&#x27;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// NOTE(review): the switch operand is a promoted `char`. Where `char` is
		// signed this label can never match; where it is unsigned it matches the
		// raw byte 0xA0. Confirm which encoding narrow strings are expected in.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends an HTML-escaped copy of a wide string to `dst`.
///
/// `&`, `;`, `"`, `'`, `<`, `>` and U+00A0 (no-break space) are replaced with
/// HTML character references; every other character is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L';': dst += L"&semi;"; break;
		case L'\"': dst += L"&quot;"; break;
		case L'\'': dst += L"&#x27;"; break;
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends one narrow character to `dst`, escaping only the minimum set.
///
/// `&`, `<` and `>` are replaced with `&amp;`, `&lt;` and `&gt;`; any other
/// character (quotes included) is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(_Inout_ std::basic_string<char, _Traits, _Alloc>& dst, _In_ char chr)
{
	switch (chr) {
	case '&': dst += "&amp;"; break;
	case '<': dst += "&lt;"; break;
	case '>': dst += "&gt;"; break;
	// NOTE(review): `chr` is promoted from `char`; with a signed `char` this
	// label is unreachable, with an unsigned `char` it matches byte 0xA0.
	case 0x00a0: dst += "&nbsp;"; break;
	default: dst += chr; break;
	}
}
/// Appends one wide character to `dst`, escaping only the minimum set.
///
/// `&`, `<`, `>` and U+00A0 are replaced with `&amp;`, `&lt;`, `&gt;` and
/// `&nbsp;`; any other character (quotes included) is appended unchanged.
///
/// \param[in,out] dst  String to append to
/// \param[in]     chr  Character to append
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst, _In_ wchar_t chr)
{
	switch (chr) {
	case L'&': dst += L"&amp;"; break;
	case L'<': dst += L"&lt;"; break;
	case L'>': dst += L"&gt;"; break;
	case L'\u00a0': dst += L"&nbsp;"; break;
	default: dst += chr; break;
	}
}
/// Appends a narrow string to `dst`, escaping only the minimum set.
///
/// `&`, `<` and `>` are replaced with `&amp;`, `&lt;` and `&gt;`; any other
/// character (quotes included) is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void escape_min(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '&': dst += "&amp;"; break;
		case '<': dst += "&lt;"; break;
		case '>': dst += "&gt;"; break;
		// NOTE(review): operand is a promoted `char`; unreachable where `char`
		// is signed, matches raw byte 0xA0 where it is unsigned.
		case 0x00a0: dst += "&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a wide string to `dst`, escaping only the minimum set.
///
/// `&`, `<`, `>` and U+00A0 are replaced with `&amp;`, `&lt;`, `&gt;` and
/// `&nbsp;`; any other character (quotes included) is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void escape_min(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'&': dst += L"&amp;"; break;
		case L'<': dst += L"&lt;"; break;
		case L'>': dst += L"&gt;"; break;
		case L'\u00a0': dst += L"&nbsp;"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a URL-decoded copy of `src` to `dst`.
///
/// `+` becomes a space and `%XX` (two hex digits, either case) becomes the
/// byte with that value. A `%` that is not followed by two hex digits is
/// copied literally together with whatever was already consumed, and decoding
/// resumes at the offending character.
///
/// The hex digits are only read while still inside `num_chars`, so a counted
/// buffer that ends in `%` or `%X` is not read past its end.
///
/// \param[in,out] dst        String the decoded text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_unescape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);

	// Value of a hex digit, or -1 when `c` is not one (NUL included).
	const auto hex_value = [](char c) -> int {
		if ('0' <= c && c <= '9') return c - '0';
		if ('A' <= c && c <= 'F') return c - 'A' + 10;
		if ('a' <= c && c <= 'f') return c - 'a' + 10;
		return -1;
	};

	for (size_t i = 0; i < num_chars && src[i];) {
		if (src[i] == '+') {
			dst += ' ';
			++i;
			continue;
		}
		if (src[i] != '%') {
			dst += src[i++];
			continue;
		}

		++i; // step over '%'
		const int hi = i < num_chars ? hex_value(src[i]) : -1;
		if (hi < 0) {
			dst += '%';
			continue;
		}

		++i; // step over the first hex digit
		const int lo = i < num_chars ? hex_value(src[i]) : -1;
		if (lo < 0) {
			// Keep the '%' and the digit already consumed.
			dst += '%';
			dst += src[i - 1];
			continue;
		}

		++i; // step over the second hex digit
		dst += static_cast<char>((hi << 4) | lo);
	}
}
/// Appends a URL-encoded copy of `src` to `dst`.
///
/// A space becomes `+`. The punctuation listed in the switch below is written
/// as a fixed `%XX` sequence. Any other byte in the range 0x21..0x7E is copied
/// as is; everything else (control characters, DEL, bytes >= 0x80) is written
/// as `%` followed by two upper-case hex digits.
///
/// \param[in,out] dst        String the encoded text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void url_escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	static constexpr char hex_digits[] = "0123456789ABCDEF";
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case ' ': dst += "+"; break;
		case '<': dst += "%3C"; break;
		case '>': dst += "%3E"; break;
		case '#': dst += "%23"; break;
		case '%': dst += "%25"; break;
		case '{': dst += "%7B"; break;
		case '}': dst += "%7D"; break;
		case '|': dst += "%7C"; break;
		case '\\': dst += "%5C"; break;
		case '^': dst += "%5E"; break;
		case '~': dst += "%7E"; break;
		case '[': dst += "%5B"; break;
		case ']': dst += "%5D"; break;
		case '`': dst += "%60"; break;
		case ';': dst += "%3B"; break;
		case '/': dst += "%2F"; break;
		case '?': dst += "%3F"; break;
		case ':': dst += "%3A"; break;
		case '@': dst += "%40"; break;
		case '=': dst += "%3D"; break;
		case '&': dst += "%26"; break;
		case '$': dst += "%24"; break;
		default: {
			// Work on the unsigned byte so bytes >= 0x80 compare correctly
			// regardless of the signedness of `char`.
			const uint8_t octet = static_cast<uint8_t>(src[i]);
			if (0x20 < octet && octet < 0x7f)
				dst += src[i];
			else {
				dst += '%';
				dst += hex_digits[octet >> 4];
				dst += hex_digits[octet & 0x0f];
			}
			break;
		}
		}
	}
}
/// Appends a copy of `src` to `dst` with CSS backslash escapes resolved.
///
/// Recognized forms after a backslash:
/// - `n`, `r`, `t`: line feed, carriage return, tab
/// - a line feed: line continuation, produces nothing
/// - one to six hex digits: the character with that code, followed by one
///   optional space that is consumed as the escape terminator
/// - anything else: that character itself
///
/// A backslash that is the last character of the input (by count or before
/// the terminating NUL) is copied literally. This guarantees the scan always
/// advances and never steps over the terminator.
///
/// \param[in,out] dst        String the unescaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Elem, class _Traits = std::char_traits<_Elem>, class _Alloc = std::allocator<_Elem>>
inline void css_unescape(
	_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const _Elem* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i];) {
		if (src[i] != '\\') {
			dst += src[i++];
			continue;
		}
		if (i + 1 >= num_chars || !src[i + 1]) {
			// Lone trailing backslash: nothing follows that could be escaped.
			dst += src[i++];
			continue;
		}

		i++; // step over the backslash
		switch (src[i]) {
		// Classic escapes
		case 'n': dst += '\n'; i++; break;
		case 'r': dst += '\r'; i++; break;
		case 't': dst += '\t'; i++; break;

		// Backslash at the end of a line
		case '\n': i++; break;

		// Hexadecimal character code
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		case 'A': case 'a':
		case 'B': case 'b':
		case 'C': case 'c':
		case 'D': case 'd':
		case 'E': case 'e':
		case 'F': case 'f': {
			uint32_t chr = 0;
			// At most six hex digits, without running past the counted input.
			const size_t end = num_chars - i > 6 ? i + 6 : num_chars;
			for (; i < end; ++i) {
				if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0';
				else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10;
				else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10;
				else break;
			}
			dst += static_cast<_Elem>(chr);

			// One space after the digits terminates the escape; it is consumed
			// even when all six digits were used.
			if (i < num_chars && src[i] == ' ')
				i++;
			break;
		}

		default: dst += src[i++];
		}
	}
}
/// Appends a CSS-escaped copy of a narrow string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and single quote
/// are written as `\\`, `\n`, `\r`, `\t`, `\"` and `\'`; every other character
/// is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<char>, class _Alloc = std::allocator<char>>
inline void css_escape(
	_Inout_ std::basic_string<char, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case '\\': dst += "\\\\"; break;
		case '\n': dst += "\\n"; break;
		case '\r': dst += "\\r"; break;
		case '\t': dst += "\\t"; break;
		case '\"': dst += "\\\""; break;
		case '\'': dst += "\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
/// Appends a CSS-escaped copy of a wide string to `dst`.
///
/// Backslash, line feed, carriage return, tab, double quote and single quote
/// are written as `\\`, `\n`, `\r`, `\t`, `\"` and `\'`; every other character
/// is copied unchanged.
///
/// \param[in,out] dst        String the escaped text is appended to
/// \param[in]     src        Source text; may be null only when `num_chars` is 0
/// \param[in]     num_chars  Maximum number of characters to read from `src`;
///                           reading also stops at the first NUL
template<class _Traits = std::char_traits<wchar_t>, class _Alloc = std::allocator<wchar_t>>
inline void css_escape(
	_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& dst,
	_In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars = SIZE_MAX)
{
	_Assume_(src || !num_chars);
	for (size_t i = 0; i < num_chars && src[i]; ++i) {
		switch (src[i]) {
		case L'\\': dst += L"\\\\"; break;
		case L'\n': dst += L"\\n"; break;
		case L'\r': dst += L"\\r"; break;
		case L'\t': dst += L"\\t"; break;
		case L'\"': dst += L"\\\""; break;
		case L'\'': dst += L"\\'"; break;
		default: dst += src[i]; break;
		}
	}
}
389 enum class element_t {
505 enum class element_span_t {
// Returns the span kind (needs_end / end_optional / immediate) for an element code.
//
// The answer comes from a table indexed by the distance of `code` from
// element_t::a. Codes outside [element_t::a, element_t::xmp] yield needs_end.
// NOTE(review): the table order presumably mirrors the declaration order of
// element_t from `a` through `xmp`; verify whenever the enum changes.
521 static inline element_span_t
span(_In_ element_t code)
523 static element_span_t lookup[] = {
524 element_span_t::needs_end,
525 element_span_t::needs_end,
526 element_span_t::needs_end,
527 element_span_t::needs_end,
528 element_span_t::needs_end,
529 element_span_t::immediate,
530 element_span_t::needs_end,
531 element_span_t::immediate,
532 element_span_t::immediate,
533 element_span_t::needs_end,
534 element_span_t::immediate,
535 element_span_t::needs_end,
536 element_span_t::needs_end,
537 element_span_t::needs_end,
538 element_span_t::end_optional,
539 element_span_t::immediate,
540 element_span_t::needs_end,
541 element_span_t::needs_end,
542 element_span_t::needs_end,
543 element_span_t::needs_end,
544 element_span_t::needs_end,
545 element_span_t::immediate,
546 element_span_t::end_optional,
547 element_span_t::needs_end,
548 element_span_t::end_optional,
549 element_span_t::needs_end,
550 element_span_t::needs_end,
551 element_span_t::needs_end,
552 element_span_t::needs_end,
553 element_span_t::needs_end,
554 element_span_t::end_optional,
555 element_span_t::needs_end,
556 element_span_t::immediate,
557 element_span_t::needs_end,
558 element_span_t::needs_end,
559 element_span_t::needs_end,
560 element_span_t::immediate,
561 element_span_t::needs_end,
562 element_span_t::needs_end,
563 element_span_t::needs_end,
564 element_span_t::needs_end,
565 element_span_t::needs_end,
566 element_span_t::needs_end,
567 element_span_t::needs_end,
568 element_span_t::end_optional,
569 element_span_t::immediate,
570 element_span_t::end_optional,
571 element_span_t::needs_end,
572 element_span_t::needs_end,
573 element_span_t::immediate,
574 element_span_t::immediate,
575 element_span_t::needs_end,
576 element_span_t::immediate,
577 element_span_t::needs_end,
578 element_span_t::needs_end,
579 element_span_t::needs_end,
580 element_span_t::end_optional,
581 element_span_t::immediate,
582 element_span_t::needs_end,
583 element_span_t::needs_end,
584 element_span_t::needs_end,
585 element_span_t::needs_end,
586 element_span_t::immediate,
587 element_span_t::immediate,
588 element_span_t::needs_end,
589 element_span_t::needs_end,
590 element_span_t::needs_end,
591 element_span_t::needs_end,
592 element_span_t::needs_end,
593 element_span_t::needs_end,
594 element_span_t::needs_end,
595 element_span_t::end_optional,
596 element_span_t::end_optional,
597 element_span_t::immediate,
598 element_span_t::end_optional,
599 element_span_t::needs_end,
600 element_span_t::needs_end,
601 element_span_t::immediate,
602 element_span_t::needs_end,
603 element_span_t::needs_end,
604 element_span_t::needs_end,
605 element_span_t::needs_end,
606 element_span_t::needs_end,
607 element_span_t::needs_end,
608 element_span_t::needs_end,
609 element_span_t::needs_end,
610 element_span_t::needs_end,
611 element_span_t::needs_end,
612 element_span_t::needs_end,
613 element_span_t::needs_end,
614 element_span_t::needs_end,
615 element_span_t::end_optional,
616 element_span_t::end_optional,
617 element_span_t::needs_end,
618 element_span_t::end_optional,
619 element_span_t::end_optional,
620 element_span_t::end_optional,
621 element_span_t::needs_end,
622 element_span_t::end_optional,
623 element_span_t::needs_end,
624 element_span_t::needs_end,
625 element_span_t::needs_end,
626 element_span_t::needs_end,
627 element_span_t::immediate,
628 element_span_t::needs_end,
// Range-check first so that an out-of-range code never indexes the table.
630 return element_t::a <= code && code <= element_t::xmp ?
631 lookup[
static_cast<size_t>(code) -
static_cast<size_t>(element_t::a)] :
632 element_span_t::needs_end;
648 case element_t::strike:
649 case element_t::blink:
651 case element_t::small:
666 case element_t::strong:
668 case element_t::code:
669 case element_t::samp:
672 case element_t::cite:
673 case element_t::abbr:
674 case element_t::acronym:
691 case element_t::applet:
692 case element_t::object:
693 case element_t::embed:
694 case element_t::font:
695 case element_t::basefont:
699 case element_t::script:
704 case element_t::ruby:
705 case element_t::span:
707 case element_t::iframe:
708 case element_t::nobr:
722 case element_t::input:
723 case element_t::select:
724 case element_t::textarea:
725 case element_t::label:
726 case element_t::button:
740 code == element_t::PCDATA ||
771 static inline bool is_list(_In_ element_t code)
777 case element_t::menu:
792 case element_t::listing:
812 case element_t::center:
813 case element_t::marquee:
814 case element_t::noscript:
815 case element_t::noframes:
816 case element_t::noembed:
817 case element_t::blockquote:
818 case element_t::form:
819 case element_t::isindex:
821 case element_t::table:
822 case element_t::fieldset:
823 case element_t::address:
834 static inline bool is_flow(_In_ element_t code)
847 case element_t::title:
848 case element_t::isindex:
849 case element_t::base:
850 case element_t::nextid:
864 case element_t::script:
865 case element_t::style:
866 case element_t::meta:
867 case element_t::link:
868 case element_t::object:
883 case element_t::object:
884 case element_t::applet:
885 case element_t::embed:
887 case element_t::small:
890 case element_t::ruby:
891 case element_t::font:
892 case element_t::basefont:
893 case element_t::nobr:
907 case element_t::head:
908 case element_t::body:
909 case element_t::frameset:
927 case element_t::colgroup:
931 case element_t::frame:
932 case element_t::iframe:
933 case element_t::legend:
// Tells whether element `child` is allowed directly inside element `parent`.
//
// The decision is made per parent: each case below returns the content rule
// for that parent, expressed with the is_inline / is_flow / is_block /
// is_formctrl classifiers or with explicit child codes. Parents whose case
// returns false accept no children at all; an unknown parent accepts anything.
950 static inline bool may_contain(_In_ element_t parent, _In_ element_t child)
// Unknown and comment children are handled up front.
// NOTE(review): the statement guarded by this test is not visible here;
// presumably it returns true - confirm.
952 if (child == element_t::unknown || child == element_t::comment)
960 case element_t::a:
return is_inline(child) && child != element_t::a;
961 case element_t::address:
return is_inline(child) || child == element_t::p;
962 case element_t::applet:
return is_flow(child) || child == element_t::param;
963 case element_t::area:
return false;
964 case element_t::base:
return false;
965 case element_t::basefont:
return false;
966 case element_t::bdo:
return is_inline(child);
967 case element_t::blockquote:
return is_flow(child);
968 case element_t::body:
return is_flow(child) || child == element_t::ins || child == element_t::del;
969 case element_t::br:
return false;
// A button takes flow content except form controls, anchors, forms,
// isindex, fieldsets and iframes.
970 case element_t::button:
return is_flow(child) && !
is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
971 case element_t::caption:
return is_inline(child);
972 case element_t::center:
return is_flow(child);
973 case element_t::col:
return false;
974 case element_t::colgroup:
return child == element_t::col;
975 case element_t::comment:
return child == element_t::CDATA;
976 case element_t::dd:
return is_flow(child);
977 case element_t::del:
return is_flow(child);
978 case element_t::dir:
return child == element_t::li;
979 case element_t::div:
return is_flow(child);
980 case element_t::dl:
return child == element_t::dt || child == element_t::dd;
981 case element_t::dt:
return is_inline(child);
982 case element_t::embed:
return is_flow(child) || child == element_t::param;
983 case element_t::fieldset:
return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
984 case element_t::font:
return is_inline(child);
// Forms do not nest.
985 case element_t::form:
return is_flow(child) && child != element_t::form;
986 case element_t::frame:
return false;
987 case element_t::frameset:
return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
989 case element_t::hr:
return false;
991 case element_t::iframe:
return is_flow(child);
992 case element_t::img:
return false;
993 case element_t::input:
return false;
994 case element_t::ins:
return is_flow(child);
995 case element_t::isindex:
return false;
// Labels do not nest.
996 case element_t::label:
return is_inline(child) && child != element_t::label;
997 case element_t::legend:
return is_inline(child);
998 case element_t::li:
return is_flow(child);
999 case element_t::link:
return false;
1000 case element_t::listing:
return child == element_t::CDATA;
1001 case element_t::map:
return is_block(child) || child == element_t::area;
1002 case element_t::marquee:
return is_flow(child);
1003 case element_t::menu:
return child == element_t::li;
1004 case element_t::meta:
return false;
1005 case element_t::nobr:
return is_inline(child) || child == element_t::wbr;
// noframes takes flow content or a body, but not another noframes.
1006 case element_t::noframes:
return (
is_flow(child) || child == element_t::body) && child != element_t::noframes;
1007 case element_t::noscript:
return is_flow(child);
1008 case element_t::noembed:
return is_flow(child);
1009 case element_t::object:
return is_flow(child) || child == element_t::param;
1010 case element_t::ol:
return child == element_t::li;
1011 case element_t::optgroup:
return child == element_t::option;
1012 case element_t::option:
return child == element_t::PCDATA;
1013 case element_t::p:
return is_inline(child);
1014 case element_t::param:
return false;
1015 case element_t::plaintext:
return is_flow(child);
1017 case element_t::q:
return is_inline(child);
1018 case element_t::rt:
return false;
1019 case element_t::ruby:
return is_inline(child);
1020 case element_t::script:
return child == element_t::CDATA;
1021 case element_t::select:
return child == element_t::optgroup || child == element_t::option;
1022 case element_t::span:
return is_inline(child);
1023 case element_t::style:
return child == element_t::CDATA;
1024 case element_t::sub:
return is_inline(child);
1025 case element_t::sup:
return is_inline(child);
1026 case element_t::table:
return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1027 case element_t::tbody:
return child == element_t::tr;
1028 case element_t::td:
return is_flow(child);
1029 case element_t::textarea:
return child == element_t::PCDATA;
1030 case element_t::tfoot:
return child == element_t::tr;
1031 case element_t::th:
return is_flow(child);
1032 case element_t::thead:
return child == element_t::tr;
1033 case element_t::title:
return child == element_t::PCDATA;
1034 case element_t::tr:
return child == element_t::td || child == element_t::th;
1035 case element_t::ul:
return child == element_t::li;
1036 case element_t::wbr:
return false;
// Nothing is known about an unknown parent, so every child is accepted.
1037 case element_t::unknown:
return true;
// Tells whether attribute `attr_name` of element `code` carries a URI.
//
// Each case compares `attr_name` (at most `num_chars` characters) against the
// URI-valued attribute names of that element using stdex::strnicmp with the
// "C" locale; strnicmp returns 0 on a match, hence the negations.
// NOTE(review): strnicmp is presumably case-insensitive - confirm in stdex.
//
// attr_name: attribute name; may be null only when num_chars is 0.
// num_chars: number of characters available in attr_name.
1050 static inline bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars)
const T* attr_name, _In_
size_t num_chars)
1052 _Assume_(attr_name || !num_chars);
1054 case element_t::a:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX, stdex::std_locale_C);
1055 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX, stdex::std_locale_C) ||
1056 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX, stdex::std_locale_C) ||
1057 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1058 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX, stdex::std_locale_C);
1059 case element_t::base:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX, stdex::std_locale_C);
1060 case element_t::bgsound:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1061 case element_t::blockquote:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX, stdex::std_locale_C);
1062 case element_t::body:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX, stdex::std_locale_C);
1063 case element_t::comment:
return !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX, stdex::std_locale_C);
1064 case element_t::del:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX, stdex::std_locale_C);
1065 case element_t::embed:
return !stdex::strnicmp(attr_name, num_chars,
"pluginspage", SIZE_MAX, stdex::std_locale_C) ||
1066 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1067 case element_t::form:
return !stdex::strnicmp(attr_name, num_chars,
"action", SIZE_MAX, stdex::std_locale_C);
1068 case element_t::frame:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX, stdex::std_locale_C) ||
1069 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1070 case element_t::head:
return !stdex::strnicmp(attr_name, num_chars,
"profile", SIZE_MAX, stdex::std_locale_C);
1071 case element_t::iframe:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX, stdex::std_locale_C) ||
1072 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1073 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX, stdex::std_locale_C) ||
1074 !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX, stdex::std_locale_C) ||
1075 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C) ||
1076 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX, stdex::std_locale_C);
1077 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX, stdex::std_locale_C) ||
1078 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C) ||
1079 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX, stdex::std_locale_C);
1080 case element_t::ins:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX, stdex::std_locale_C);
1081 case element_t::link:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX, stdex::std_locale_C);
1082 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"basehref", SIZE_MAX, stdex::std_locale_C) ||
1083 !stdex::strnicmp(attr_name, num_chars,
"classid", SIZE_MAX, stdex::std_locale_C) ||
1084 !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX, stdex::std_locale_C) ||
1085 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX, stdex::std_locale_C) ||
1086 !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX, stdex::std_locale_C) ||
1087 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX, stdex::std_locale_C);
1088 case element_t::q:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX, stdex::std_locale_C);
1089 case element_t::script:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX, stdex::std_locale_C);
1090 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX, stdex::std_locale_C);
1091 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX, stdex::std_locale_C);
1092 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX, stdex::std_locale_C);
// Tells whether attribute `attr_name` of element `code` holds text that is
// subject to localization.
//
// `title` is tested first, independently of the element; the remaining cases
// match the element-specific attributes `alt`, `summary` and `abbr`.
// Comparisons use stdex::strnicmp with the "C" locale (0 means match).
// NOTE(review): the statement guarded by the `title` test is not visible here;
// presumably it returns true - confirm.
//
// attr_name: attribute name; may be null only when num_chars is 0.
// num_chars: number of characters available in attr_name.
1105 static inline bool is_localizable(element_t code,
const T* attr_name,
size_t num_chars)
1107 _Assume_(attr_name || !num_chars);
1108 if (!stdex::strnicmp(attr_name, num_chars,
"title", SIZE_MAX, stdex::std_locale_C))
1111 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX, stdex::std_locale_C);
1112 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX, stdex::std_locale_C);
1113 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX, stdex::std_locale_C);
1114 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX, stdex::std_locale_C);
1115 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX, stdex::std_locale_C);
1116 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"summary", SIZE_MAX, stdex::std_locale_C);
1117 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX, stdex::std_locale_C);
1118 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX, stdex::std_locale_C);
1125 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1133 stdex::parser::html_sequence_t
type;
1137 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_
size_t start = 0,
size_t end = 0, _In_opt_
sequence* _parent =
nullptr) :
1154 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1155 code(element_code(src + tag.name.start, tag.name.size())),
1156 name(std::move(tag.name)),
// Translates an element name into its element_t code.
//
// The name is looked up in `mapping`, a static table of lower-case names.
// The lookup halves the [i, j) index range around m = (i + j) / 2 on each
// step, so the table has to stay sorted by name. Characters of `name` are
// lower-cased with the "C" locale ctype facet before comparison. When no
// entry matches, element_t::unknown is returned.
//
// name:      element name; reading stops at num_chars or at a NUL.
// num_chars: maximum number of characters to read from name.
1161 static element_t element_code(_In_reads_z_(num_chars)
const T*
name,
size_t num_chars)
1163 static const struct {
1167 {
"a", element_t::a, },
1168 {
"abbr", element_t::abbr, },
1169 {
"acronym", element_t::acronym, },
1170 {
"address", element_t::address, },
1171 {
"applet", element_t::applet, },
1172 {
"area", element_t::area, },
1173 {
"b", element_t::b, },
1174 {
"base", element_t::base, },
1175 {
"basefont", element_t::basefont, },
1176 {
"bdo", element_t::bdo, },
1177 {
"bgsound", element_t::bgsound, },
1178 {
"big", element_t::big, },
1179 {
"blink", element_t::blink, },
1180 {
"blockquote", element_t::blockquote, },
1181 {
"body", element_t::body, },
1182 {
"br", element_t::br, },
1183 {
"button", element_t::button, },
1184 {
"caption", element_t::caption, },
1185 {
"center", element_t::center, },
1186 {
"cite", element_t::cite, },
1187 {
"code", element_t::code, },
1188 {
"col", element_t::col, },
1189 {
"colgroup", element_t::colgroup, },
1190 {
"comment", element_t::comment, },
1191 {
"dd", element_t::dd, },
1192 {
"del", element_t::del, },
1193 {
"dfn", element_t::dfn, },
1194 {
"dir", element_t::dir, },
1195 {
"div", element_t::div, },
1196 {
"dl", element_t::dl, },
1197 {
"dt", element_t::dt, },
1198 {
"em", element_t::em, },
1199 {
"embed", element_t::embed, },
1200 {
"fieldset", element_t::fieldset, },
1201 {
"font", element_t::font, },
1202 {
"form", element_t::form, },
1203 {
"frame", element_t::frame, },
1204 {
"frameset", element_t::frameset, },
1205 {
"h1", element_t::h1, },
1206 {
"h2", element_t::h2, },
1207 {
"h3", element_t::h3, },
1208 {
"h4", element_t::h4, },
1209 {
"h5", element_t::h5, },
1210 {
"h6", element_t::h6, },
1211 {
"head", element_t::head, },
1212 {
"hr", element_t::hr, },
1213 {
"html", element_t::html, },
1214 {
"i", element_t::i, },
1215 {
"iframe", element_t::iframe, },
1216 {
"img", element_t::img, },
1217 {
"input", element_t::input, },
1218 {
"ins", element_t::ins, },
1219 {
"isindex", element_t::isindex, },
1220 {
"kbd", element_t::kbd, },
1221 {
"label", element_t::label, },
1222 {
"legend", element_t::legend, },
1223 {
"li", element_t::li, },
1224 {
"link", element_t::link, },
1225 {
"listing", element_t::listing, },
1226 {
"map", element_t::map, },
1227 {
"marquee", element_t::marquee, },
1228 {
"menu", element_t::menu, },
1229 {
"meta", element_t::meta, },
1230 {
"nextid", element_t::nextid, },
1231 {
"nobr", element_t::nobr, },
1232 {
"noembed", element_t::noembed, },
1233 {
"noframes", element_t::noframes, },
1234 {
"noscript", element_t::noscript, },
1235 {
"object", element_t::object, },
1236 {
"ol", element_t::ol, },
1237 {
"optgroup", element_t::optgroup, },
1238 {
"option", element_t::option, },
1239 {
"p", element_t::p, },
1240 {
"param", element_t::param, },
1241 {
"plaintext", element_t::plaintext, },
1242 {
"pre", element_t::pre, },
1243 {
"q", element_t::q, },
1244 {
"rt", element_t::rt, },
1245 {
"ruby", element_t::ruby, },
1246 {
"s", element_t::s, },
1247 {
"samp", element_t::samp, },
1248 {
"script", element_t::script, },
1249 {
"select", element_t::select, },
1250 {
"small", element_t::small, },
1251 {
"span", element_t::span, },
1252 {
"strike", element_t::strike, },
1253 {
"strong", element_t::strong, },
1254 {
"style", element_t::style, },
1255 {
"sub", element_t::sub, },
1256 {
"sup", element_t::sup, },
1257 {
"table", element_t::table, },
1258 {
"tbody", element_t::tbody, },
1259 {
"td", element_t::td, },
1260 {
"textarea", element_t::textarea, },
1261 {
"tfoot", element_t::tfoot, },
1262 {
"th", element_t::th, },
1263 {
"thead", element_t::thead, },
1264 {
"title", element_t::title, },
1265 {
"tr", element_t::tr, },
1266 {
"tt", element_t::tt, },
1267 {
"u", element_t::u, },
1268 {
"ul", element_t::ul, },
1269 {
"var", element_t::var, },
1270 {
"wbr", element_t::wbr, },
1271 {
"xmp", element_t::xmp, },
// Table self-checks.
// NOTE(review): the body of this first loop is not visible; starting at
// index 1 suggests it compares each entry with its predecessor to verify
// the sort order - confirm.
1275 for (
size_t i = 1; i < _countof(
mapping); i++)
// Every table name must consist of lower-case letters and digits only,
// because the lookup lower-cases the input before comparing.
1277 const auto& ctype = std::use_facet<std::ctype<char>>(stdex::std_locale_C);
1278 for (
size_t i = 0; i < _countof(
mapping); i++) {
1279 for (
size_t j = 0;
mapping[i].name[j]; j++)
1280 _Assume_(ctype.is(ctype.lower | ctype.digit,
mapping[i].name[j]));
// Search over the half-open index range [i, j).
1283 const auto& ctypeT = std::use_facet<std::ctype<T>>(stdex::std_locale_C);
1284 for (
size_t i = 0, j = _countof(
mapping); i < j; ) {
1285 size_t m = (i + j) / 2;
// Compare mapping[m].name (index i1) with the input (index i2) character by
// character; the input ends at num_chars or at a NUL.
1287 for (
size_t i1 = 0, i2 = 0;;) {
1289 r = i2 >= num_chars || !
name[i2] ? 0 : -1;
1292 if (i2 >= num_chars || !
name[i2]) {
// Lower-case the input character and narrow it to char for comparison
// against the table entry.
1297 auto chr =
static_cast<char>(ctypeT.tolower(
name[i2++]));
// No table entry matched.
1316 return element_t::unknown;
1351 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1352 code(element::element_code(src + tag.name.start, tag.name.size())),
1353 name(std::move(tag.name)),
1371 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1372 name(std::move(tag.name)),
1389 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1405 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1416 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1420 std::basic_string<_Elem, _Traits, _Alloc>
value;
1426 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1432 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
// Parses `source` and appends the sequences found to the document.
//
// NOTE(review): only part of the body is visible here; the comments below
// describe the visible statements and mark everything else as an assumption.
//
// source:    text to parse; may be null only when num_chars is 0.
// num_chars: maximum number of characters to read from source.
1474 void append(_In_reads_or_z_opt_(num_chars)
const _Elem*
source, _In_
size_t num_chars = SIZE_MAX)
1476 _Assume_(
source || !num_chars);
// Look for the end of a conditional section; the text collected so far is
// classified as CDATA or PCDATA depending on m_is_cdata.
1483 if (m_condition_end.match(
source, i, num_chars)) {
1485 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1496 if (m_condition_end.match(
source, i, num_chars)) {
// Start of a conditional section: its keyword is compared against
// CDATA, RCDATA and IGNORE.
1513 if (m_condition_start.match(
source, i, num_chars)) {
1515 if (!stdex::strcmp(condition_src.c_str(),
"CDATA"))
1517 else if (!stdex::strcmp(condition_src.c_str(),
"RCDATA"))
1521 else if (!stdex::strcmp(condition_src.c_str(),
"IGNORE"))
// An end tag is matched here and linked to `parent` as its end.
// NOTE(review): presumably this path applies while inside a CDATA-like
// element, so that only its own end tag is recognized - confirm.
1533 if (m_tag.match(
source, i, num_chars) &&
1534 m_tag.
type == stdex::parser::html_sequence_t::element_end &&
1540 std::unique_ptr<element_end> e(
new element_end(std::move(m_tag),
source, parent->parent, parent));
1541 parent->end = e.get();
// General tag handling, dispatched on the type of the matched tag.
1550 if (m_tag.match(
source, i, num_chars)) {
1555 switch (m_tag.
type) {
1556 case stdex::parser::html_sequence_t::element:
1557 case stdex::parser::html_sequence_t::element_start: {
1558 std::unique_ptr<element> e(
1559 m_tag.
type == stdex::parser::html_sequence_t::element ?
new element(std::move(m_tag),
source) :
1566 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1568 e->parent = starting_tag;
1571 e->parent = starting_tag->parent;
1572 starting_tag->end = e.get();
1576 if (e->type == stdex::parser::html_sequence_t::element_start) {
1579 e_start->
end = e.get();
1583 case element_t::code:
1584 case element_t::comment:
1585 case element_t::script:
1586 case element_t::style:
// Charset detection: while m_charset is still charset_id::system, a <meta>
// element is scanned for http-equiv="content-type" and for a `content`
// attribute.
1593 if (e->code == element_t::meta &&
m_charset == stdex::charset_id::system) {
1594 bool is_content_type =
false;
1596 for (
auto& attr : e->attributes) {
1597 if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"http-equiv", SIZE_MAX, stdex::std_locale_C) &&
1598 !stdex::strnicmp(
source + attr.value.start, attr.value.size(),
"content-type", SIZE_MAX, stdex::std_locale_C))
1599 is_content_type =
true;
1600 else if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"content", SIZE_MAX, stdex::std_locale_C))
1601 content_attr = &attr;
1603 if (is_content_type && content_attr) {
// The charset name is copied into a narrow string (each character cast
// to char) and resolved with stdex::charset_from_name.
1610 str.reserve(content.charset.size());
1611 for (
size_t j = content.charset.start; j < content.charset.end; ++j)
1612 str.push_back(
static_cast<char>(
source[j]));
1613 m_charset = stdex::charset_from_name(str.c_str());
1621 case stdex::parser::html_sequence_t::element_end: {
1626 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
// An end tag closes a start tag when the element codes are equal, or, when
// both are unknown elements, when their names compare equal with strnicmp.
1627 if (starting_tag->code == e->code ||
1628 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(
source + starting_tag->name.start, starting_tag->name.size(),
source + e->name.start, e->name.size(), stdex::std_locale_C))
1630 e->start = starting_tag;
1631 e->parent = starting_tag->parent;
1632 starting_tag->end = e.get();
// Declarations: the first attribute name is compared with "entity".
1641 case stdex::parser::html_sequence_t::declaration:
1643 !stdex::strnicmp(
source + m_tag.
attributes[0].name.start, m_tag.
attributes[0].name.size(),
"entity", SIZE_MAX, stdex::std_locale_C))
1659 case stdex::parser::html_sequence_t::comment:
1662 case stdex::parser::html_sequence_t::instruction:
// Any other tag type is rejected.
1666 throw std::invalid_argument(
"unknown tag type");
// NOTE(review): presumably the fallback that consumes one character as
// plain text when nothing else matched - confirm.
1673 if (m_any_char.match(
source, i, num_chars)) {
1697 inline void assign(_In_reads_or_z_opt_(num_chars)
const _Elem*
source, _In_
size_t num_chars = SIZE_MAX)
1707 inline const std::basic_string<_Elem, _Traits, _Alloc>&
source()
const {
return m_source; }
1709 friend class parser<_Elem, _Traits, _Alloc>;
1723 std::basic_string<_Elem, _Traits, _Alloc>
replace_entities(_In_reads_or_z_opt_(num_chars)
const _Elem* input, _In_
size_t num_chars)
const
1725 _Assume_(input || !num_chars);
1726 const size_t num_entities =
m_entities.size();
1728 std::basic_string<_Elem, _Traits, _Alloc> output;
1729 for (
size_t i = 0; i < num_chars && input[i];) {
1730 if (input[i] ==
'%') {
1731 for (
size_t j = 0; j < num_entities; j++) {
1733 size_t entity_size = e->name.size();
1734 if (i + entity_size + 1 < num_chars &&
1735 !stdex::strncmp(input + i + 1,
source + e->name.start, entity_size) &&
1736 input[i + entity_size + 1] ==
';')
1739 i += entity_size + 2;
1743 throw std::runtime_error(
"undefined entity");
1745 output += input[i++];
1764 std::vector<std::unique_ptr<entity<_Elem, _Traits, _Alloc>>>
m_entities;
1776 enum class token_t {
1787 constexpr size_t token_tag_max =
1796 constexpr char token_tag_start =
'\x12';
1802 constexpr char token_tag_end =
'\x13';
1810 inline token(_In_ token_t _type = token_t::root, _In_opt_
sequence* _sequence =
nullptr, _In_ uintptr_t _data = 0) :
1816 template<
class _Elem,
class _Traits,
class _Alloc>
1829 template<
class _Traits = std::
char_traits<
char>,
class _Alloc = std::allocator<
char>>
1830 inline size_t append_tag(_Inout_ std::basic_string<char, _Traits, _Alloc>& str)
const
1832 size_t n = str.size();
1834 stdex::appendf(str,
"%c%zX%c", stdex::locale_C.get(), token_tag_start,
reinterpret_cast<uintptr_t
>(
this), token_tag_end);
1835 return str.size() - n;
1845 template<
class _Traits = std::
char_traits<
wchar_t>,
class _Alloc = std::allocator<
wchar_t>>
1846 inline size_t append_tag(_Inout_ std::basic_string<wchar_t, _Traits, _Alloc>& str)
const
1849 return stdex::appendf(str, L
"%c%zX%c", stdex::locale_C.get(),
static_cast<wchar_t>(token_tag_start),
reinterpret_cast<uintptr_t
>(
this),
static_cast<wchar_t>(token_tag_end));
1853 static inline token* parse_tag(
const T* str,
size_t& offset)
1855 if (str[offset] !=
static_cast<T
>(token_tag_start))
1860 for (end = offset + 1; ; end++) {
1863 if (str[end] == token_tag_end)
1868 token* t =
reinterpret_cast<token*
>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1,
nullptr, 16));
1870 throw std::invalid_argument(
"null token");
1881 using token_vector = std::vector<std::unique_ptr<token>>;
1882 using token_list = std::list<token*>;
1887 enum text_type_flag_t : uint32_t {
1888 has_tokens = 1 << 0,
1897 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1902 _In_ token_t
type = token_t::complete,
1903 _In_reads_or_z_opt_(num_chars)
const _Elem* _text =
nullptr, _In_
size_t num_chars = 0,
1904 _In_ uint32_t _text_type = 0,
1907 text(_text, num_chars),
1911 friend class parser<_Elem, _Traits, _Alloc>;
1914 std::basic_string<_Elem, _Traits, _Alloc>
text;
1922 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1927 _In_reads_or_z_opt_(num_chars_text)
const _Elem* _text =
nullptr, _In_
size_t num_chars_text = 0,
1928 _In_reads_or_z_opt_(num_chars_name)
const _Elem* _name =
nullptr, _In_
size_t num_chars_name = 0,
1932 _In_ uintptr_t
data = 0) :
1934 name(_name, num_chars_name),
1938 friend class parser<_Elem, _Traits, _Alloc>;
1941 std::basic_string<_Elem, _Traits, _Alloc>
name;
1948 enum class token_url_t {
1957 template<
class _Elem,
class _Traits = std::
char_traits<_Elem>,
class _Alloc = std::allocator<_Elem>>
1962 _In_reads_or_z_opt_(num_chars)
const _Elem* _url =
nullptr, _In_
size_t num_chars = 0,
1963 token_url_t _encoding = token_url_t::plain,
1966 url(_url, num_chars),
1970 friend class parser<_Elem, _Traits, _Alloc>;
1973 std::basic_string<_Elem, _Traits, _Alloc>
url;
1987 using inserted_token_list = std::list<inserted_token>;
1989 template<
class _Elem,
class _Traits,
class _Alloc>
1995 _In_reads_or_z_opt_(num_chars)
const stdex::schar_t* url =
nullptr, _In_
size_t num_chars = 0,
1998 m_url(url, stdex::strnlen(url, num_chars)),
2031 t->type == token_t::complete ||
2032 t->type == token_t::starting ||
2033 t->type == token_t::ending ||
2034 t->type == token_t::root);
2036 if (t->text_type & has_tokens) {
2037 const _Elem* root = t->text.data();
2038 for (
size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2039 _Assume_(root[i] != token_tag_end);
2040 const token* t2 = token::parse_tag(root, i);
2043 case token_t::complete:
2044 case token_t::starting:
2045 case token_t::ending:
2049 case token_t::url: {
2051 switch (t2_url->encoding) {
2052 case token_url_t::plain:
2053 source += t2_url->
url;
2055 case token_url_t::sgml:
2056 escape(source, t2_url->url.data(), t2_url->url.size());
2058 case token_url_t::css:
2059 css_escape(source, t2_url->url.data(), t2_url->url.size());
2062 throw std::invalid_argument(
"unsupported URL encoding");
2067 throw std::invalid_argument(
"unsupported token type");
2070 else if (t->text_type & has_text) {
2071 escape_min(source, root[i]);
2075 source += root[i++];
2078 else if (t->text_type & has_text) {
2080 escape_min(source, t->text.data(), t->text.size());
2094 static void start_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens, _In_ token_list::const_iterator from)
2096 for (; from != new_tokens.cend(); ++from) {
2098 t->append_tag(source);
2099 active_tokens.push_back(t);
2112 token_list::const_iterator
end_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens)
2115 token_list::const_iterator i1, i2;
2116 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2117 if (i2 == new_tokens.cend() || *i1 != *i2) {
2120 for (
auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2122 _Assume_(t1 && t1->type == token_t::starting);
2125 t2->text.reserve(t1->name.size() + 3);
2128 t2->text += t1->name;
2134 active_tokens.erase(i);
2137 active_tokens.erase(i);
2138 i = active_tokens.cend();
2155 void append_inserted_tokens(_Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source, _Inout_ inserted_token_list& inserted_tokens,
2156 _In_
size_t word_index, _In_
bool after_word,
2157 _Inout_ token_list& active_tokens)
2159 for (
auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2162 if (t.word_index == word_index && t.after_word == after_word) {
2163 if (t.token->type != token_t::ending)
2164 start_tokens(source, active_tokens, t.active_tokens,
end_tokens(source, active_tokens, t.active_tokens));
2165 t.token->append_tag(source);
2166 inserted_tokens.erase(i++);
2179 static void merge(_Inout_ token_list& a, _In_
const token_list& b)
2181 for (
auto i2 = b.begin(); i2 != b.end(); ++i2) {
2183 for (
auto i1 = a.begin(); i1 != a.end(); ++i1) {
2184 if (i1 == a.end()) {
2200 _Unreferenced_(rel);
2226 auto t =
token.get();
2240 inline size_t append_token(_Inout_ std::unique_ptr<T>&&
token, _Inout_ std::basic_string<_Elem, _Traits, _Alloc>& source)
2277 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<_Elem
>(token_tag_start)) == stdex::npos &&
2278 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<_Elem
>(token_tag_end)) == stdex::npos);
2280 if (s->type == stdex::parser::html_sequence_t::text) {
2281 rel.from = s->interval.start;
2282 token->mapping.push_back(rel);
2283 stdex::sgml2strcat(
token->text,
m_source + s->interval.start, s->interval.size(), 0, rel, &
token->mapping);
2284 rel.to =
token->text.size();
2285 if (!(
token->text_type & has_text) &&
2286 !stdex::isblank(
m_source + s->interval.start, s->interval.size(), stdex::std_locale_C))
2287 token->text_type |= has_text;
2290 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2293 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ?
static_cast<const element_start*
>(s.get()) :
nullptr;
2295 throw std::invalid_argument(
"<frameset> detected");
2298 size_t offset = s->interval.start;
2299 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>> t(s->type == stdex::parser::html_sequence_t::element ||
element_traits::span(s_el_start->
code) == element_span_t::immediate ?
2305 if (a.value.empty() ||
2306 stdex::isblank(
m_source + a.value.start, a.value.size(), stdex::std_locale_C))
2310 t->text.append(
m_source + offset, a.value.start - offset);
2315 stdex::sgml2strcat(t_url->url,
m_source + a.value.start, a.value.size());
2317 t->text_type |= has_tokens;
2318 offset = a.value.end;
2321 t->text.append(
m_source + offset, a.value.start - offset);
2325 has_text | is_title,
2328 t_value->mapping.push_back(rel_value);
2329 stdex::sgml2strcpy(t_value->text,
m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->
mapping);
2331 t->text_type |= has_tokens;
2332 offset = a.value.end;
2336 t->text.append(
m_source + offset, s->interval.end - offset);
2337 rel.from = s->interval.start;
2338 token->mapping.push_back(rel);
2340 token->text_type |= has_tokens;
2345 if (s_el_start->
code == element_t::address ||
2346 s_el_start->
code == element_t::code ||
2347 s_el_start->
code == element_t::comment ||
2348 s_el_start->
code == element_t::cite ||
2349 s_el_start->
code == element_t::kbd ||
2350 s_el_start->
code == element_t::samp ||
2351 s_el_start->
code == element_t::script ||
2352 s_el_start->
code == element_t::style)
2355 auto s_end = s_el_start->
end;
2358 if (s->interval.end < s_end->interval.start) {
2359 if (s_el_start->
code != element_t::style) {
2360 rel.from = s->interval.start;
2361 token->mapping.push_back(rel);
2365 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2372 auto t =
parse_css(s->interval.end, s_end->interval.start);
2374 rel.from = s->interval.start;
2375 token->mapping.push_back(rel);
2376 rel.to += t->append_tag(
token->text);
2378 token->text_type |= has_tokens;
2385 while (limit != end && limit->get() != s_el_start->
end)
2387 auto t =
parse(limit,
2390 rel.from = s->interval.start;
2391 token->mapping.push_back(rel);
2392 rel.to += t->append_tag(
token->text);
2393 token->text_type |= has_tokens;
2397 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2398 rel.from = s->interval.start;
2399 token->mapping.push_back(rel);
2403 m_source + s->interval.start, s->interval.size(),
2407 token->text_type |= has_tokens;
2412 rel.from = s->interval.start;
2413 token->mapping.push_back(rel);
2417 m_source + s->interval.start, s->interval.size(),
2421 token->text_type |= has_tokens;
2435 std::unique_ptr<text_token<_Elem, _Traits, _Alloc>>
token(
2443 if (m_css_comment.match(
m_source, start, end)) {
2447 else if (m_css_cdo.match(
m_source, start, end)) {
2451 else if (m_css_cdc.match(
m_source, start, end)) {
2456 m_css_import.match(
m_source, start, end) && (section = m_css_import.
interval, content = m_css_import.
content,
true) ||
2459 std::unique_ptr<url_token<_Elem, _Traits, _Alloc>> t_url(
2468 token->text_type |= has_tokens;
2469 start = section.
end;
2471 else if (m_any_char.match(
m_source, start, end)) {
HTML declaration.
Definition html.hpp:1367
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1377
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1378
HTML document.
Definition html.hpp:1434
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1769
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1685
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1754
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1760
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1770
void append(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML source code by chunks.
Definition html.hpp:1474
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1768
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1758
void assign(_In_reads_or_z_opt_(num_chars) const _Elem *source, size_t num_chars=SIZE_MAX)
Parses HTML document source code.
Definition html.hpp:1697
std::vector< std::unique_ptr< entity< _Elem, _Traits, _Alloc > > > m_entities
Array of entities.
Definition html.hpp:1764
std::basic_string< _Elem, _Traits, _Alloc > m_source
Document HTML source code.
Definition html.hpp:1752
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1715
std::basic_string< _Elem, _Traits, _Alloc > replace_entities(_In_reads_or_z_opt_(num_chars) const _Elem *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1723
void clear()
Empties document.
Definition html.hpp:1453
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1757
const std::basic_string< _Elem, _Traits, _Alloc > & source() const
Returns document HTML source code.
Definition html.hpp:1707
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1753
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1759
Ending tag of an HTML element </...>
Definition html.hpp:1347
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1359
element_start * start
Corresponding starting tag.
Definition html.hpp:1360
element_t code
Element code.
Definition html.hpp:1358
Starting tag of an HTML element <...>
Definition html.hpp:1331
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1340
HTML element <.../>
Definition html.hpp:1150
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1321
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1322
element_t code
Element code.
Definition html.hpp:1320
HTML instruction.
Definition html.hpp:1401
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1410
HTML parser.
Definition html.hpp:1991
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2486
text_token< _Elem, _Traits, _Alloc > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2432
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2179
token_list::const_iterator end_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2112
static void link(std::basic_string< _Elem, _Traits, _Alloc > &source, const text_token< _Elem, _Traits, _Alloc > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2027
text_token< _Elem, _Traits, _Alloc > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2257
const _Elem * m_source
HTML source code.
Definition html.hpp:2487
token_vector m_tokens
HTML token storage.
Definition html.hpp:2488
text_token< _Elem, _Traits, _Alloc > * parse()
Parses HTML document.
Definition html.hpp:2007
const document< _Elem, _Traits, _Alloc > & m_document
Document being analyzed.
Definition html.hpp:2483
void make_absolute_url(std::basic_string< _Elem, _Traits, _Alloc > &rel)
Converts URL to absolute.
Definition html.hpp:2198
size_t append_token(std::unique_ptr< T > &&token, std::basic_string< _Elem, _Traits, _Alloc > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2240
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2211
const stdex::sys_string m_url
Absolute document URL.
Definition html.hpp:2484
const bool m_parse_frames
Parse frames.
Definition html.hpp:2485
static void start_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2094
T * append_token(std::unique_ptr< T > &&token)
Adds token to the collection.
Definition html.hpp:2222
void append_inserted_tokens(std::basic_string< _Elem, _Traits, _Alloc > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2155
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2489
Base class for HTML sequences.
Definition html.hpp:1131
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1134
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1133
sequence * parent
Parent sequence.
Definition html.hpp:1135
Token representing start HTML tag.
Definition html.hpp:1924
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:1942
std::basic_string< _Elem, _Traits, _Alloc > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:1941
Token representing part of HTML text.
Definition html.hpp:1899
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:1916
std::basic_string< _Elem, _Traits, _Alloc > text
Token text.
Definition html.hpp:1914
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:1915
HTML token base class.
Definition html.hpp:1808
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:1877
size_t append_tag(std::basic_string< char, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1830
uintptr_t data
Any user-supplied data.
Definition html.hpp:1878
token_t type
Token type.
Definition html.hpp:1876
size_t append_tag(std::basic_string< wchar_t, _Traits, _Alloc > &str) const
Appends token tag to the source code.
Definition html.hpp:1846
HTML token representing a URL.
Definition html.hpp:1959
token_url_t encoding
URL encoding.
Definition html.hpp:1974
std::basic_string< _Elem, _Traits, _Alloc > url
URL.
Definition html.hpp:1973
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7835
stdex::interval< size_t > content
content position in source
Definition parser.hpp:7750
std::vector< html_attribute > attributes
tag attributes
Definition parser.hpp:8357
html_sequence_t type
tag type
Definition parser.hpp:8355
stdex::interval< size_t > name
tag name position in source
Definition parser.hpp:8356
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:172
Test for given string.
Definition parser.hpp:818
Progress indicator base class.
Definition progress.hpp:19
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:65
virtual void set(T value)
Set current progress.
Definition progress.hpp:47
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:37
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with an HTML element.
Definition html.hpp:515
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:920
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:834
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:752
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:844
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:640
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:803
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:861
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:771
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is URI.
Definition html.hpp:1050
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:788
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1105
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:686
static bool is_pre_exclusion(element_t code)
May element be a part of a <PRE> element?
Definition html.hpp:879
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:737
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:904
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:719
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:662
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:950
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:521
HTML entity.
Definition html.hpp:1418
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1419
std::basic_string< _Elem, _Traits, _Alloc > value
Entity value.
Definition html.hpp:1420
Inserted HTML token.
Definition html.hpp:1980
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:1984
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:1982
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:1983
token * token
Points to the token.
Definition html.hpp:1981
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
mapping()
Constructs a zero to zero mapping.
Definition mapping.hpp:24
Tag attribute.
Definition parser.hpp:8129
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8131