9#include "exception.hpp"
10#include "interval.hpp"
13#include "progress.hpp"
42 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
44 _Inout_ std::basic_string<char, TR, AX>& dst,
45 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
47 _Assume_(src || !num_chars);
48 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
50 case '&': dst +=
"&";
break;
51 case ';': dst +=
";";
break;
52 case '\"': dst +=
""";
break;
53 case '\'': dst +=
"'";
break;
54 case '<': dst +=
"<";
break;
55 case '>': dst +=
">";
break;
56 case 0x00a0: dst +=
" ";
break;
57 default: dst += src[i];
break;
69 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
71 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
72 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
74 _Assume_(src || !num_chars);
75 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
77 case L
'&': dst += L
"&";
break;
78 case L
';': dst += L
";";
break;
79 case L
'\"': dst += L
""";
break;
80 case L
'\'': dst += L
"'";
break;
81 case L
'<': dst += L
"<";
break;
82 case L
'>': dst += L
">";
break;
83 case L
'\u00a0': dst += L
" ";
break;
84 default: dst += src[i];
break;
95 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
97 _Inout_ std::basic_string<T, TR, AX>& dst,
98 _In_
const T (&src)[N])
109 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
111 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
112 _In_
const std::basic_string<T, TR_src, AX_src>& src)
114 escape(dst, src.data(), src.size());
123 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
124 void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_
char chr)
127 case '&': dst +=
"&";
break;
128 case '<': dst +=
"<";
break;
129 case '>': dst +=
">";
break;
130 case 0x00a0: dst +=
" ";
break;
131 default: dst += chr;
break;
141 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
142 void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_
wchar_t chr)
145 case L
'&': dst += L
"&";
break;
146 case L
'<': dst += L
"<";
break;
147 case L
'>': dst += L
">";
break;
148 case L
'\u00a0': dst += L
" ";
break;
149 default: dst += chr;
break;
160 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
162 _Inout_ std::basic_string<char, TR, AX>& dst,
163 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
165 _Assume_(src || !num_chars);
166 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
168 case '&': dst +=
"&";
break;
169 case '<': dst +=
"<";
break;
170 case '>': dst +=
">";
break;
171 case 0x00a0: dst +=
" ";
break;
172 default: dst += src[i];
break;
184 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
186 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
187 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
189 _Assume_(src || !num_chars);
190 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
192 case L
'&': dst += L
"&";
break;
193 case L
'<': dst += L
"<";
break;
194 case L
'>': dst += L
">";
break;
195 case L
'\u00a0': dst += L
" ";
break;
196 default: dst += src[i];
break;
207 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
209 _Inout_ std::basic_string<T, TR, AX>& dst,
210 _In_
const T (&src)[N])
212 escape_min(dst, src, N);
221 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
223 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
224 _In_
const std::basic_string<T, TR_src, AX_src>& src)
226 escape_min(dst, src.data(), src.size());
// Pointer-and-length overload that decodes two-digit hexadecimal escapes from
// `src` and appends the result to `dst`. Presumably named `url_unescape`: the
// array and string_view wrappers further down forward to a function of that
// name with exactly these argument types.
// NOTE(review): this listing skips several original lines (the function
// declarator, the statements that dispatch on src[i], and the declaration of
// `chr`), so only the hexadecimal decoding is documented here.
236 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
238 _Inout_ std::basic_string<char, TR, AX>& dst,
239 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
241 _Assume_(src || !num_chars);
// No increment in the loop header: each branch below advances `i` itself.
242 for (
size_t i = 0; i < num_chars && src[i];) {
// First hex digit: its value becomes the upper four bits of `chr` and the
// digit is consumed.
// NOTE(review): no comparison of `i` against `num_chars` is visible before
// src[i] is read here; confirm the skipped lines guard against reading past
// the limit.
252 if (
'0' <= src[i] && src[i] <=
'9') chr =
static_cast<char>((src[i++] -
'0') << 4);
253 else if (
'A' <= src[i] && src[i] <=
'F') chr =
static_cast<char>((src[i++] -
'A' + 10) << 4);
254 else if (
'a' <= src[i] && src[i] <=
'f') chr =
static_cast<char>((src[i++] -
'a' + 10) << 4);
// Not a hex digit: emit a literal '%' and let the loop reprocess src[i].
255 else { dst +=
'%';
continue; }
// Second hex digit: its value is OR-ed into the lower four bits of `chr`.
256 if (
'0' <= src[i] && src[i] <=
'9') chr |=
static_cast<char>((src[i++] -
'0'));
257 else if (
'A' <= src[i] && src[i] <=
'F') chr |=
static_cast<char>((src[i++] -
'A' + 10));
258 else if (
'a' <= src[i] && src[i] <=
'f') chr |=
static_cast<char>((src[i++] -
'a' + 10));
// Only one hex digit followed: emit '%' and that already-consumed digit
// unchanged, then let the loop reprocess src[i].
259 else { dst +=
'%'; dst += src[i - 1];
continue; }
277 template<
size_t N,
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
279 _Inout_ std::basic_string<char, TR, AX>& dst,
280 _In_
const char (&src)[N])
282 url_unescape(dst, src, N);
291 template<
class TR_dst = std::
char_traits<
char>,
class AX_dst = std::allocator<
char>>
293 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
294 _In_
const std::basic_string_view<
char, std::char_traits<char>> src)
296 url_unescape(dst, src.data(), src.size());
306 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
308 _Inout_ std::basic_string<char, TR, AX>& dst,
309 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
311 _Assume_(src || !num_chars);
312 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
314 case ' ': dst +=
"+";
break;
315 case '<': dst +=
"%3C";
break;
316 case '>': dst +=
"%3E";
break;
317 case '#': dst +=
"%23";
break;
318 case '%': dst +=
"%25";
break;
319 case '{': dst +=
"%7B";
break;
320 case '}': dst +=
"%7D";
break;
321 case '|': dst +=
"%7C";
break;
322 case '\\': dst +=
"%5C";
break;
323 case '^': dst +=
"%5E";
break;
324 case '~': dst +=
"%7E";
break;
325 case '[': dst +=
"%5B";
break;
326 case ']': dst +=
"%5D";
break;
327 case '`': dst +=
"%60";
break;
328 case ';': dst +=
"%3B";
break;
329 case '/': dst +=
"%2F";
break;
330 case '?': dst +=
"%3F";
break;
331 case ':': dst +=
"%3A";
break;
332 case '@': dst +=
"%40";
break;
333 case '=': dst +=
"%3D";
break;
334 case '&': dst +=
"%26";
break;
335 case '$': dst +=
"%24";
break;
337 if (0x20 <
static_cast<uint8_t
>(src[i]) &&
static_cast<uint8_t
>(src[i]) < 0x7f)
341 uint8_t n = (
static_cast<uint8_t
>(src[i]) & 0xf0) >> 4;
342 dst += n < 10 ? static_cast<char>(
'0' + n) : static_cast<char>(
'A' + n - 10);
343 n = ((uint8_t)src[i] & 0x0f);
344 dst += n < 10 ? static_cast<char>(
'0' + n) : static_cast<char>(
'A' + n - 10);
356 template<
size_t N,
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
358 _Inout_ std::basic_string<char, TR, AX>& dst,
359 _In_
const char (&src)[N])
361 url_escape(dst, src, N);
370 template<
class TR_dst = std::
char_traits<
char>,
class AX_dst = std::allocator<
char>>
372 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
373 _In_
const std::basic_string_view<
char, std::char_traits<char>> src)
375 url_escape(dst, src.data(), src.size());
// Pointer-and-length overload that appends a decoded copy of `src` to `dst`.
// Presumably named `css_unescape`: the array and string wrappers further down
// forward to a function of that name with exactly these argument types.
// NOTE(review): this listing skips many original lines (the function
// declarator, the test that detects an escape, the switch header, most of the
// hexadecimal case labels and the declaration of `chr`), so the comments
// below cover only what is visible.
385 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
387 _Inout_ std::basic_string<T, TR, AX>& dst,
388 _In_reads_or_z_opt_(num_chars)
const T* src, _In_
size_t num_chars)
390 _Assume_(src || !num_chars);
// No increment in the loop header: each branch advances `i` itself.
391 for (
size_t i = 0; i < num_chars && src[i];) {
// Taken only when at least one more character follows the current one.
394 else if (i + 1 < num_chars) {
// Letter escapes: append the control character the letter stands for.
399 case 'n': dst +=
'\n'; i++;
break;
400 case 'r': dst +=
'\r'; i++;
break;
401 case 't': dst +=
'\t'; i++;
break;
// A line feed in this position is consumed without producing output.
404 case '\n': i++;
break;
// Hexadecimal escape (the remaining case labels are outside this listing).
422 case 'F':
case 'f': {
// At most six characters from position `i` are examined, clipped to the input limit.
424 size_t end = std::min(num_chars, i + 6);
// Accumulate the value digit by digit, base 16.
426 for (; i < end; ++i) {
427 if (
'0' <= src[i] && src[i] <=
'9') chr = chr * 0x10 + src[i] -
'0';
428 else if (
'A' <= src[i] && src[i] <=
'F') chr = chr * 0x10 + src[i] -
'A' + 10;
429 else if (
'a' <= src[i] && src[i] <=
'f') chr = chr * 0x10 + src[i] -
'a' + 10;
// The accumulated value is narrowed to the destination character type.
433 dst +=
static_cast<T
>(chr);
// NOTE(review): the body of this branch is not visible; presumably it skips
// the single space that may terminate a hexadecimal escape. Confirm.
435 if (i < end && src[i] ==
' ') {
// Any other character is copied as is.
442 default: dst += src[i++];
454 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
456 _Inout_ std::basic_string<T, TR, AX>& dst,
457 _In_
const T (&src)[N])
459 css_unescape(dst, src, N);
468 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
470 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
471 _In_
const std::basic_string<T, TR_src, AX_src>& src)
473 css_unescape(dst, src.data(), src.size());
483 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
485 _Inout_ std::basic_string<char, TR, AX>& dst,
486 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
488 _Assume_(src || !num_chars);
489 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
491 case '\\': dst +=
"\\\\";
break;
492 case '\n': dst +=
"\\n";
break;
493 case '\r': dst +=
"\\r";
break;
494 case '\t': dst +=
"\\t";
break;
495 case '\"': dst +=
"\\\"";
break;
496 case '\'': dst +=
"\\'";
break;
497 default: dst += src[i];
break;
509 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
511 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
512 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
514 _Assume_(src || !num_chars);
515 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
517 case L
'\\': dst += L
"\\\\";
break;
518 case L
'\n': dst += L
"\\n";
break;
519 case L
'\r': dst += L
"\\r";
break;
520 case L
'\t': dst += L
"\\t";
break;
521 case L
'\"': dst += L
"\\\"";
break;
522 case L
'\'': dst += L
"\\'";
break;
523 default: dst += src[i];
break;
534 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
536 _Inout_ std::basic_string<T, TR, AX>& dst,
537 _In_
const T (&src)[N])
539 css_escape(dst, src, N);
548 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
550 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
551 _In_
const std::basic_string<T, TR_src, AX_src>& src)
553 css_escape(dst, src.data(), src.size());
559 enum class element_t {
675 enum class element_span_t {
691 static element_span_t
span(_In_ element_t code)
693 static element_span_t lookup[] = {
694 element_span_t::needs_end,
695 element_span_t::needs_end,
696 element_span_t::needs_end,
697 element_span_t::needs_end,
698 element_span_t::needs_end,
699 element_span_t::immediate,
700 element_span_t::needs_end,
701 element_span_t::immediate,
702 element_span_t::immediate,
703 element_span_t::needs_end,
704 element_span_t::immediate,
705 element_span_t::needs_end,
706 element_span_t::needs_end,
707 element_span_t::needs_end,
708 element_span_t::end_optional,
709 element_span_t::immediate,
710 element_span_t::needs_end,
711 element_span_t::needs_end,
712 element_span_t::needs_end,
713 element_span_t::needs_end,
714 element_span_t::needs_end,
715 element_span_t::immediate,
716 element_span_t::end_optional,
717 element_span_t::needs_end,
718 element_span_t::end_optional,
719 element_span_t::needs_end,
720 element_span_t::needs_end,
721 element_span_t::needs_end,
722 element_span_t::needs_end,
723 element_span_t::needs_end,
724 element_span_t::end_optional,
725 element_span_t::needs_end,
726 element_span_t::immediate,
727 element_span_t::needs_end,
728 element_span_t::needs_end,
729 element_span_t::needs_end,
730 element_span_t::immediate,
731 element_span_t::needs_end,
732 element_span_t::needs_end,
733 element_span_t::needs_end,
734 element_span_t::needs_end,
735 element_span_t::needs_end,
736 element_span_t::needs_end,
737 element_span_t::needs_end,
738 element_span_t::end_optional,
739 element_span_t::immediate,
740 element_span_t::end_optional,
741 element_span_t::needs_end,
742 element_span_t::needs_end,
743 element_span_t::immediate,
744 element_span_t::immediate,
745 element_span_t::needs_end,
746 element_span_t::immediate,
747 element_span_t::needs_end,
748 element_span_t::needs_end,
749 element_span_t::needs_end,
750 element_span_t::end_optional,
751 element_span_t::immediate,
752 element_span_t::needs_end,
753 element_span_t::needs_end,
754 element_span_t::needs_end,
755 element_span_t::needs_end,
756 element_span_t::immediate,
757 element_span_t::immediate,
758 element_span_t::needs_end,
759 element_span_t::needs_end,
760 element_span_t::needs_end,
761 element_span_t::needs_end,
762 element_span_t::needs_end,
763 element_span_t::needs_end,
764 element_span_t::needs_end,
765 element_span_t::end_optional,
766 element_span_t::end_optional,
767 element_span_t::immediate,
768 element_span_t::end_optional,
769 element_span_t::needs_end,
770 element_span_t::needs_end,
771 element_span_t::immediate,
772 element_span_t::needs_end,
773 element_span_t::needs_end,
774 element_span_t::needs_end,
775 element_span_t::needs_end,
776 element_span_t::needs_end,
777 element_span_t::needs_end,
778 element_span_t::needs_end,
779 element_span_t::needs_end,
780 element_span_t::needs_end,
781 element_span_t::needs_end,
782 element_span_t::needs_end,
783 element_span_t::needs_end,
784 element_span_t::needs_end,
785 element_span_t::end_optional,
786 element_span_t::end_optional,
787 element_span_t::needs_end,
788 element_span_t::end_optional,
789 element_span_t::end_optional,
790 element_span_t::end_optional,
791 element_span_t::needs_end,
792 element_span_t::end_optional,
793 element_span_t::needs_end,
794 element_span_t::needs_end,
795 element_span_t::needs_end,
796 element_span_t::needs_end,
797 element_span_t::immediate,
798 element_span_t::needs_end,
800 return element_t::a <= code && code <= element_t::xmp ?
801 lookup[
static_cast<size_t>(code) -
static_cast<size_t>(element_t::a)] :
802 element_span_t::needs_end;
818 case element_t::strike:
819 case element_t::blink:
821 case element_t::small:
837 case element_t::strong:
839 case element_t::code:
840 case element_t::samp:
843 case element_t::cite:
844 case element_t::abbr:
845 case element_t::acronym:
863 case element_t::applet:
864 case element_t::object:
865 case element_t::embed:
866 case element_t::font:
867 case element_t::basefont:
871 case element_t::script:
876 case element_t::ruby:
877 case element_t::span:
879 case element_t::iframe:
880 case element_t::nobr:
895 case element_t::input:
896 case element_t::select:
897 case element_t::textarea:
898 case element_t::label:
899 case element_t::button:
914 code == element_t::PCDATA ||
952 case element_t::menu:
968 case element_t::listing:
989 case element_t::center:
990 case element_t::marquee:
991 case element_t::noscript:
992 case element_t::noframes:
993 case element_t::noembed:
994 case element_t::blockquote:
995 case element_t::form:
996 case element_t::isindex:
998 case element_t::table:
999 case element_t::fieldset:
1000 case element_t::address:
1025 case element_t::title:
1026 case element_t::isindex:
1027 case element_t::base:
1028 case element_t::nextid:
1043 case element_t::script:
1044 case element_t::style:
1045 case element_t::meta:
1046 case element_t::link:
1047 case element_t::object:
1062 case element_t::img:
1063 case element_t::object:
1064 case element_t::applet:
1065 case element_t::embed:
1066 case element_t::big:
1067 case element_t::small:
1068 case element_t::sub:
1069 case element_t::sup:
1070 case element_t::ruby:
1071 case element_t::font:
1072 case element_t::basefont:
1073 case element_t::nobr:
1088 case element_t::head:
1089 case element_t::body:
1090 case element_t::frameset:
1108 case element_t::col:
1109 case element_t::colgroup:
1111 case element_t::dir:
1113 case element_t::frame:
1114 case element_t::iframe:
1115 case element_t::legend:
1135 if (child == element_t::unknown || child == element_t::comment)
1143 case element_t::a:
return is_inline(child) && child != element_t::a;
1144 case element_t::address:
return is_inline(child) || child == element_t::p;
1145 case element_t::applet:
return is_flow(child) || child == element_t::param;
1146 case element_t::area:
return false;
1147 case element_t::base:
return false;
1148 case element_t::basefont:
return false;
1149 case element_t::bdo:
return is_inline(child);
1150 case element_t::blockquote:
return is_flow(child);
1151 case element_t::body:
return is_flow(child) || child == element_t::ins || child == element_t::del;
1152 case element_t::br:
return false;
1153 case element_t::button:
return is_flow(child) && !
is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1154 case element_t::caption:
return is_inline(child);
1155 case element_t::center:
return is_flow(child);
1156 case element_t::col:
return false;
1157 case element_t::colgroup:
return child == element_t::col;
1158 case element_t::comment:
return child == element_t::CDATA;
1159 case element_t::dd:
return is_flow(child);
1160 case element_t::del:
return is_flow(child);
1161 case element_t::dir:
return child == element_t::li;
1162 case element_t::div:
return is_flow(child);
1163 case element_t::dl:
return child == element_t::dt || child == element_t::dd;
1164 case element_t::dt:
return is_inline(child);
1165 case element_t::embed:
return is_flow(child) || child == element_t::param;
1166 case element_t::fieldset:
return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1167 case element_t::font:
return is_inline(child);
1168 case element_t::form:
return is_flow(child) && child != element_t::form;
1169 case element_t::frame:
return false;
1170 case element_t::frameset:
return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1172 case element_t::hr:
return false;
1174 case element_t::iframe:
return is_flow(child);
1175 case element_t::img:
return false;
1176 case element_t::input:
return false;
1177 case element_t::ins:
return is_flow(child);
1178 case element_t::isindex:
return false;
1179 case element_t::label:
return is_inline(child) && child != element_t::label;
1180 case element_t::legend:
return is_inline(child);
1181 case element_t::li:
return is_flow(child);
1182 case element_t::link:
return false;
1183 case element_t::listing:
return child == element_t::CDATA;
1184 case element_t::map:
return is_block(child) || child == element_t::area;
1185 case element_t::marquee:
return is_flow(child);
1186 case element_t::menu:
return child == element_t::li;
1187 case element_t::meta:
return false;
1188 case element_t::nobr:
return is_inline(child) || child == element_t::wbr;
1189 case element_t::noframes:
return (
is_flow(child) || child == element_t::body) && child != element_t::noframes;
1190 case element_t::noscript:
return is_flow(child);
1191 case element_t::noembed:
return is_flow(child);
1192 case element_t::object:
return is_flow(child) || child == element_t::param;
1193 case element_t::ol:
return child == element_t::li;
1194 case element_t::optgroup:
return child == element_t::option;
1195 case element_t::option:
return child == element_t::PCDATA;
1196 case element_t::p:
return is_inline(child);
1197 case element_t::param:
return false;
1198 case element_t::plaintext:
return is_flow(child);
1200 case element_t::q:
return is_inline(child);
1201 case element_t::rt:
return false;
1202 case element_t::ruby:
return is_inline(child);
1203 case element_t::script:
return child == element_t::CDATA;
1204 case element_t::select:
return child == element_t::optgroup || child == element_t::option;
1205 case element_t::span:
return is_inline(child);
1206 case element_t::style:
return child == element_t::CDATA;
1207 case element_t::sub:
return is_inline(child);
1208 case element_t::sup:
return is_inline(child);
1209 case element_t::table:
return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1210 case element_t::tbody:
return child == element_t::tr;
1211 case element_t::td:
return is_flow(child);
1212 case element_t::textarea:
return child == element_t::PCDATA;
1213 case element_t::tfoot:
return child == element_t::tr;
1214 case element_t::th:
return is_flow(child);
1215 case element_t::thead:
return child == element_t::tr;
1216 case element_t::title:
return child == element_t::PCDATA;
1217 case element_t::tr:
return child == element_t::td || child == element_t::th;
1218 case element_t::ul:
return child == element_t::li;
1219 case element_t::wbr:
return false;
1220 case element_t::unknown:
return true;
1221 default:
return false;
1233 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars)
const T* attr_name, _In_
size_t num_chars)
1235 _Assume_(attr_name || !num_chars);
1237 case element_t::a:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1238 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1239 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1240 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1241 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1242 case element_t::base:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1243 case element_t::bgsound:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1244 case element_t::blockquote:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1245 case element_t::body:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1246 case element_t::comment:
return !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX);
1247 case element_t::del:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1248 case element_t::embed:
return !stdex::strnicmp(attr_name, num_chars,
"pluginspage", SIZE_MAX) ||
1249 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1250 case element_t::form:
return !stdex::strnicmp(attr_name, num_chars,
"action", SIZE_MAX);
1251 case element_t::frame:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1252 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1253 case element_t::head:
return !stdex::strnicmp(attr_name, num_chars,
"profile", SIZE_MAX);
1254 case element_t::iframe:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1256 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1257 !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1258 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1259 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1260 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1261 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1262 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1263 case element_t::ins:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1264 case element_t::link:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1265 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"basehref", SIZE_MAX) ||
1266 !stdex::strnicmp(attr_name, num_chars,
"classid", SIZE_MAX) ||
1267 !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1268 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1269 !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX) ||
1270 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1271 case element_t::q:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1272 case element_t::script:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1273 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1274 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1275 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1276 default:
return false;
1290 _Assume_(attr_name || !num_chars);
1291 if (!stdex::strnicmp(attr_name, num_chars,
"title", SIZE_MAX))
1294 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1295 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1296 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1297 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1298 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1299 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"summary", SIZE_MAX);
1300 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1301 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1302 default:
return false;
1308 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1316 stdex::parser::html_sequence_t
type;
1320 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_
size_t start = 0,
size_t end = 0, _In_opt_
sequence* _parent =
nullptr) :
1337 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1338 code(element_code(src + tag.name.start, tag.name.size())),
1339 name(std::move(tag.name)),
1344 static element_t element_code(_In_reads_z_(num_chars)
const T*
name,
size_t num_chars)
1346 static const struct {
1350 {
"a", element_t::a, },
1351 {
"abbr", element_t::abbr, },
1352 {
"acronym", element_t::acronym, },
1353 {
"address", element_t::address, },
1354 {
"applet", element_t::applet, },
1355 {
"area", element_t::area, },
1356 {
"b", element_t::b, },
1357 {
"base", element_t::base, },
1358 {
"basefont", element_t::basefont, },
1359 {
"bdo", element_t::bdo, },
1360 {
"bgsound", element_t::bgsound, },
1361 {
"big", element_t::big, },
1362 {
"blink", element_t::blink, },
1363 {
"blockquote", element_t::blockquote, },
1364 {
"body", element_t::body, },
1365 {
"br", element_t::br, },
1366 {
"button", element_t::button, },
1367 {
"caption", element_t::caption, },
1368 {
"center", element_t::center, },
1369 {
"cite", element_t::cite, },
1370 {
"code", element_t::code, },
1371 {
"col", element_t::col, },
1372 {
"colgroup", element_t::colgroup, },
1373 {
"comment", element_t::comment, },
1374 {
"dd", element_t::dd, },
1375 {
"del", element_t::del, },
1376 {
"dfn", element_t::dfn, },
1377 {
"dir", element_t::dir, },
1378 {
"div", element_t::div, },
1379 {
"dl", element_t::dl, },
1380 {
"dt", element_t::dt, },
1381 {
"em", element_t::em, },
1382 {
"embed", element_t::embed, },
1383 {
"fieldset", element_t::fieldset, },
1384 {
"font", element_t::font, },
1385 {
"form", element_t::form, },
1386 {
"frame", element_t::frame, },
1387 {
"frameset", element_t::frameset, },
1388 {
"h1", element_t::h1, },
1389 {
"h2", element_t::h2, },
1390 {
"h3", element_t::h3, },
1391 {
"h4", element_t::h4, },
1392 {
"h5", element_t::h5, },
1393 {
"h6", element_t::h6, },
1394 {
"head", element_t::head, },
1395 {
"hr", element_t::hr, },
1396 {
"html", element_t::html, },
1397 {
"i", element_t::i, },
1398 {
"iframe", element_t::iframe, },
1399 {
"img", element_t::img, },
1400 {
"input", element_t::input, },
1401 {
"ins", element_t::ins, },
1402 {
"isindex", element_t::isindex, },
1403 {
"kbd", element_t::kbd, },
1404 {
"label", element_t::label, },
1405 {
"legend", element_t::legend, },
1406 {
"li", element_t::li, },
1407 {
"link", element_t::link, },
1408 {
"listing", element_t::listing, },
1409 {
"map", element_t::map, },
1410 {
"marquee", element_t::marquee, },
1411 {
"menu", element_t::menu, },
1412 {
"meta", element_t::meta, },
1413 {
"nextid", element_t::nextid, },
1414 {
"nobr", element_t::nobr, },
1415 {
"noembed", element_t::noembed, },
1416 {
"noframes", element_t::noframes, },
1417 {
"noscript", element_t::noscript, },
1418 {
"object", element_t::object, },
1419 {
"ol", element_t::ol, },
1420 {
"optgroup", element_t::optgroup, },
1421 {
"option", element_t::option, },
1422 {
"p", element_t::p, },
1423 {
"param", element_t::param, },
1424 {
"plaintext", element_t::plaintext, },
1425 {
"pre", element_t::pre, },
1426 {
"q", element_t::q, },
1427 {
"rt", element_t::rt, },
1428 {
"ruby", element_t::ruby, },
1429 {
"s", element_t::s, },
1430 {
"samp", element_t::samp, },
1431 {
"script", element_t::script, },
1432 {
"select", element_t::select, },
1433 {
"small", element_t::small, },
1434 {
"span", element_t::span, },
1435 {
"strike", element_t::strike, },
1436 {
"strong", element_t::strong, },
1437 {
"style", element_t::style, },
1438 {
"sub", element_t::sub, },
1439 {
"sup", element_t::sup, },
1440 {
"table", element_t::table, },
1441 {
"tbody", element_t::tbody, },
1442 {
"td", element_t::td, },
1443 {
"textarea", element_t::textarea, },
1444 {
"tfoot", element_t::tfoot, },
1445 {
"th", element_t::th, },
1446 {
"thead", element_t::thead, },
1447 {
"title", element_t::title, },
1448 {
"tr", element_t::tr, },
1449 {
"tt", element_t::tt, },
1450 {
"u", element_t::u, },
1451 {
"ul", element_t::ul, },
1452 {
"var", element_t::var, },
1453 {
"wbr", element_t::wbr, },
1454 {
"xmp", element_t::xmp, },
1458 for (
size_t i = 1; i < _countof(
mapping); i++)
1460 for (
size_t i = 0; i < _countof(
mapping); i++) {
1461 for (
size_t j = 0;
mapping[i].name[j]; j++)
1465 for (
size_t i = 0, j = _countof(
mapping); i < j; ) {
1466 size_t m = (i + j) / 2;
1468 for (
size_t i1 = 0, i2 = 0;;) {
1470 r = i2 >= num_chars || !
name[i2] ? 0 : -1;
1473 if (i2 >= num_chars || !
name[i2]) {
1478 auto chr =
static_cast<char>(stdex::tolower(
name[i2++]));
1497 return element_t::unknown;
1532 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1533 code(element::element_code(src + tag.name.start, tag.name.size())),
1534 name(std::move(tag.name)),
1552 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1553 name(std::move(tag.name)),
1570 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1586 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1597 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1607 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1613 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1655 void append(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1657 _Assume_(
source || !num_chars);
1664 if (m_condition_end.match(
source, i, num_chars)) {
1666 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1677 if (m_condition_end.match(
source, i, num_chars)) {
1694 if (m_condition_start.match(
source, i, num_chars)) {
1695 auto condition_src(
replace_entities(
source + m_condition_start.condition.start, m_condition_start.condition.size()));
1696 if (stdex::strncmp(condition_src.data(), condition_src.size(),
"CDATA", SIZE_MAX) == 0)
1698 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"RCDATA", SIZE_MAX) == 0)
1702 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"IGNORE", SIZE_MAX) == 0)
1714 if (m_tag.match(
source, i, num_chars) &&
1715 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1716 element::element_code(
source + m_tag.name.start, m_tag.name.size()) == parent->code)
1721 std::unique_ptr<element_end> e(
new element_end(std::move(m_tag),
source, parent->parent, parent));
1722 parent->end = e.get();
1731 if (m_tag.match(
source, i, num_chars)) {
1736 switch (m_tag.type) {
1737 case stdex::parser::html_sequence_t::element:
1738 case stdex::parser::html_sequence_t::element_start: {
1739 std::unique_ptr<element> e(
1740 m_tag.type == stdex::parser::html_sequence_t::element ?
new element(std::move(m_tag),
source) :
1741 m_tag.type == stdex::parser::html_sequence_t::element_start ?
new element_start(std::move(m_tag),
source) :
1747 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1749 e->parent = starting_tag;
1752 e->parent = starting_tag->parent;
1753 starting_tag->end = e.get();
1757 if (e->type == stdex::parser::html_sequence_t::element_start) {
1760 e_start->
end = e.get();
1764 case element_t::code:
1765 case element_t::comment:
1766 case element_t::script:
1767 case element_t::style:
1775 if (e->code == element_t::meta &&
m_charset == stdex::charset_id::system) {
1776 bool is_content_type =
false;
1778 for (
auto& attr : e->attributes) {
1779 if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"http-equiv", SIZE_MAX) &&
1780 !stdex::strnicmp(
source + attr.value.start, attr.value.size(),
"content-type", SIZE_MAX))
1781 is_content_type =
true;
1782 else if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"content", SIZE_MAX))
1783 content_attr = &attr;
1785 if (is_content_type && content_attr) {
1792 str.reserve(content.charset.size());
1793 for (
size_t j = content.charset.start; j < content.charset.end; ++j)
1794 str.push_back(
static_cast<char>(
source[j]));
1795 m_charset = stdex::charset_from_name(str);
1803 case stdex::parser::html_sequence_t::element_end: {
1808 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1809 if (starting_tag->code == e->code ||
1810 (starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(
source + starting_tag->name.start, starting_tag->name.size(),
source + e->name.start, e->name.size())))
1812 e->start = starting_tag;
1813 e->parent = starting_tag->parent;
1814 starting_tag->end = e.get();
1823 case stdex::parser::html_sequence_t::declaration:
1824 if (m_tag.attributes.size() > 3 &&
1825 !stdex::strnicmp(
source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(),
"entity", SIZE_MAX))
1827 if (!stdex::strncmp(
source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(),
"%", SIZE_MAX) &&
1828 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"SYSTEM", SIZE_MAX) &&
1829 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"PUBLIC", SIZE_MAX))
1832 e->name = m_tag.attributes[2].name;
1833 e->value = std::move(
replace_entities(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1841 case stdex::parser::html_sequence_t::comment:
1844 case stdex::parser::html_sequence_t::instruction:
1848 throw std::invalid_argument(
"unknown tag type");
1855 if (m_any_char.match(
source, i, num_chars)) {
1879 void assign(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1891 friend class parser<T, TR, AX>;
1905 std::basic_string<T, TR, AX>
replace_entities(_In_reads_or_z_opt_(num_chars)
const T* input, _In_
size_t num_chars)
const
1907 _Assume_(input || !num_chars);
1908 const size_t num_entities =
m_entities.size();
1910 std::basic_string<T, TR, AX> output;
1911 for (
size_t i = 0; i < num_chars && input[i];) {
1912 if (input[i] ==
'%') {
1913 for (
size_t j = 0; j < num_entities; j++) {
1915 size_t entity_size = e->name.size();
1916 if (i + entity_size + 1 < num_chars &&
1917 !stdex::strncmp(input + i + 1,
source + e->name.start, entity_size) &&
1918 input[i + entity_size + 1] ==
';')
1921 i += entity_size + 2;
1925 throw std::runtime_error(
"undefined entity");
1927 output += input[i++];
1958 enum class token_t {
1969 constexpr size_t token_tag_max =
1978 constexpr char token_tag_start =
'\x12';
1984 constexpr char token_tag_end =
'\x13';
1992 token(_In_ token_t _type = token_t::root, _In_opt_
sequence* _sequence =
nullptr, _In_ uintptr_t _data = 0) :
1998 template<
class T,
class TR,
class AX>
2011 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
2012 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str)
const
2014 size_t n = str.size();
2016 stdex::appendf(str,
"%c%zX%c", stdex::locale_C, token_tag_start,
reinterpret_cast<uintptr_t
>(
this), token_tag_end);
2017 return str.size() - n;
2027 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
2028 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str)
const
2031 return stdex::appendf(str, L
"%c%zX%c", stdex::locale_C,
static_cast<wchar_t>(token_tag_start),
reinterpret_cast<uintptr_t
>(
this),
static_cast<wchar_t>(token_tag_end));
2035 static token* parse_tag(
const T* str,
size_t& offset)
2037 if (str[offset] !=
static_cast<T
>(token_tag_start))
2042 for (end = offset + 1; ; end++) {
2045 if (str[end] == token_tag_end)
2050 token* t =
reinterpret_cast<token*
>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1,
nullptr, 16));
2052 throw std::invalid_argument(
"null token");
2063 using token_vector = std::vector<std::unique_ptr<token>>;
2064 using token_list = std::list<token*>;
2069 enum text_type_flag_t : uint32_t {
2070 has_tokens = 1 << 0,
2079 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2084 _In_ token_t
type = token_t::complete,
2085 _In_reads_or_z_opt_(num_chars)
const T* _text =
nullptr, _In_
size_t num_chars = 0,
2086 _In_ uint32_t _text_type = 0,
2089 text(_text, num_chars),
2093 friend class parser<T, TR, AX>;
2104 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2109 _In_reads_or_z_opt_(num_chars_text)
const T* _text =
nullptr, _In_
size_t num_chars_text = 0,
2110 _In_reads_or_z_opt_(num_chars_name)
const T* _name =
nullptr, _In_
size_t num_chars_name = 0,
2114 _In_ uintptr_t
data = 0) :
2116 name(_name, num_chars_name),
2120 friend class parser<T, TR, AX>;
2130 enum class token_url_t {
2139 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2144 _In_reads_or_z_opt_(num_chars)
const T* _url =
nullptr, _In_
size_t num_chars = 0,
2145 token_url_t _encoding = token_url_t::plain,
2148 url(_url, num_chars),
2152 friend class parser<T, TR, AX>;
2155 std::basic_string<T, TR, AX>
url;
2169 using inserted_token_list = std::list<inserted_token>;
2171 template<
class T,
class TR,
class AX>
2177 _In_reads_or_z_opt_(num_chars)
const stdex::schar_t* url =
nullptr, _In_
size_t num_chars = 0,
2180 m_url(url, stdex::strnlen(url, num_chars)),
2213 t->type == token_t::complete ||
2214 t->type == token_t::starting ||
2215 t->type == token_t::ending ||
2216 t->type == token_t::root);
2218 if (t->text_type & has_tokens) {
2219 const T* root = t->text.data();
2220 for (
size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2221 _Assume_(root[i] != token_tag_end);
2222 const token* t2 = token::parse_tag(root, i);
2225 case token_t::complete:
2226 case token_t::starting:
2227 case token_t::ending:
2231 case token_t::url: {
2233 switch (t2_url->encoding) {
2234 case token_url_t::plain:
2235 source += t2_url->
url;
2237 case token_url_t::sgml:
2238 escape(source, t2_url->url.data(), t2_url->url.size());
2240 case token_url_t::css:
2241 css_escape(source, t2_url->url.data(), t2_url->url.size());
2244 throw std::invalid_argument(
"unsupported URL encoding");
2249 throw std::invalid_argument(
"unsupported token type");
2252 else if (t->text_type & has_text) {
2253 escape_min(source, root[i]);
2257 source += root[i++];
2260 else if (t->text_type & has_text) {
2262 escape_min(source, t->text.data(), t->text.size());
2276 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens, _In_ token_list::const_iterator from)
2278 for (; from != new_tokens.cend(); ++from) {
2280 t->append_tag(source);
2281 active_tokens.push_back(t);
2294 token_list::const_iterator
end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens)
2297 token_list::const_iterator i1, i2;
2298 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2299 if (i2 == new_tokens.cend() || *i1 != *i2) {
2302 for (
auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2304 _Assume_(t1 && t1->type == token_t::starting);
2307 t2->text.reserve(t1->name.size() + 3);
2310 t2->text += t1->name;
2316 active_tokens.erase(i);
2319 active_tokens.erase(i);
2320 i = active_tokens.cend();
2338 _In_
size_t word_index, _In_
bool after_word,
2339 _Inout_ token_list& active_tokens)
2341 for (
auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2344 if (t.word_index == word_index && t.after_word == after_word) {
2345 if (t.token->type != token_t::ending)
2346 start_tokens(source, active_tokens, t.active_tokens,
end_tokens(source, active_tokens, t.active_tokens));
2347 t.token->append_tag(source);
2348 inserted_tokens.erase(i++);
2361 static void merge(_Inout_ token_list& a, _In_
const token_list& b)
2363 for (
auto i2 = b.begin(); i2 != b.end(); ++i2) {
2365 for (
auto i1 = a.begin(); i1 != a.end(); ++i1) {
2366 if (i1 == a.end()) {
2382 _Unreferenced_(rel);
2403 template <
class T_token>
2408 auto t =
token.get();
2421 template <
class T_token>
2422 size_t append_token(_Inout_ std::unique_ptr<T_token>&&
token, _Inout_ std::basic_string<T, TR, AX>& source)
2459 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_start)) == stdex::npos &&
2460 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_end)) == stdex::npos);
2462 if (s->type == stdex::parser::html_sequence_t::text) {
2463 rel.from = s->interval.start;
2464 token->mapping.push_back(rel);
2465 stdex::sgml2strcat(
token->text,
m_source + s->interval.start, s->interval.size(), 0, rel, &
token->mapping);
2466 rel.to =
token->text.size();
2467 if (!(
token->text_type & has_text) &&
2468 !stdex::isblank(
m_source + s->interval.start, s->interval.size()))
2469 token->text_type |= has_text;
2472 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2475 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ?
static_cast<const element_start*
>(s.get()) :
nullptr;
2477 throw std::invalid_argument(
"<frameset> detected");
2480 size_t offset = s->interval.start;
2481 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element ||
element_traits::span(s_el_start->
code) == element_span_t::immediate ?
2487 if (a.value.empty() ||
2488 stdex::isblank(
m_source + a.value.start, a.value.size()))
2492 t->text.append(
m_source + offset, a.value.start - offset);
2497 stdex::sgml2strcat(t_url->url,
m_source + a.value.start, a.value.size());
2499 t->text_type |= has_tokens;
2500 offset = a.value.end;
2503 t->text.append(
m_source + offset, a.value.start - offset);
2507 has_text | is_title,
2510 t_value->mapping.push_back(rel_value);
2511 stdex::sgml2strcat(t_value->text,
m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2513 t->text_type |= has_tokens;
2514 offset = a.value.end;
2518 t->text.append(
m_source + offset, s->interval.end - offset);
2519 rel.from = s->interval.start;
2520 token->mapping.push_back(rel);
2522 token->text_type |= has_tokens;
2527 if (s_el_start->
code == element_t::address ||
2528 s_el_start->
code == element_t::code ||
2529 s_el_start->
code == element_t::comment ||
2530 s_el_start->
code == element_t::cite ||
2531 s_el_start->
code == element_t::kbd ||
2532 s_el_start->
code == element_t::samp ||
2533 s_el_start->
code == element_t::script ||
2534 s_el_start->
code == element_t::style)
2537 auto s_end = s_el_start->
end;
2540 if (s->interval.end < s_end->interval.start) {
2541 if (s_el_start->
code != element_t::style) {
2542 rel.from = s->interval.start;
2543 token->mapping.push_back(rel);
2547 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2554 auto t =
parse_css(s->interval.end, s_end->interval.start);
2556 rel.from = s->interval.start;
2557 token->mapping.push_back(rel);
2558 rel.to += t->append_tag(
token->text);
2560 token->text_type |= has_tokens;
2567 while (limit != end && limit->get() != s_el_start->
end)
2569 auto t =
parse(limit,
2572 rel.from = s->interval.start;
2573 token->mapping.push_back(rel);
2574 rel.to += t->append_tag(
token->text);
2575 token->text_type |= has_tokens;
2579 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2580 rel.from = s->interval.start;
2581 token->mapping.push_back(rel);
2585 m_source + s->interval.start, s->interval.size(),
2589 token->text_type |= has_tokens;
2594 rel.from = s->interval.start;
2595 token->mapping.push_back(rel);
2599 m_source + s->interval.start, s->interval.size(),
2603 token->text_type |= has_tokens;
2617 std::unique_ptr<text_token<T, TR, AX>>
token(
2625 if (m_css_comment.match(
m_source, start, end)) {
2629 else if (m_css_cdo.match(
m_source, start, end)) {
2633 else if (m_css_cdc.match(
m_source, start, end)) {
2638 (m_css_import.match(
m_source, start, end) && ((
void)(section = m_css_import.
interval), (
void)(content = m_css_import.content),
true)) ||
2639 (m_css_uri.match(
m_source, start, end) && ((
void)(section = m_css_uri.
interval), (
void)(content = m_css_uri.content),
true)))
2641 std::unique_ptr<url_token<T, TR, AX>> t_url(
2650 token->text_type |= has_tokens;
2651 start = section.
end;
2653 else if (m_any_char.match(
m_source, start, end)) {
HTML declaration.
Definition html.hpp:1548
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1558
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1559
HTML document.
Definition html.hpp:1615
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1942
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1889
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1655
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1939
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1940
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1941
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1936
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1950
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1897
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1935
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1951
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1867
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1905
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1879
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1952
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1946
void clear()
Empties document.
Definition html.hpp:1634
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1934
Ending tag of an HTML element </...>
Definition html.hpp:1528
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1540
element_start * start
Corresponding starting tag.
Definition html.hpp:1541
element_t code
Element code.
Definition html.hpp:1539
Starting tag of an HTML element <...>
Definition html.hpp:1512
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1521
HTML element <.../>
Definition html.hpp:1333
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1502
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1503
element_t code
Element code.
Definition html.hpp:1501
HTML instruction.
Definition html.hpp:1582
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1591
HTML parser.
Definition html.hpp:2173
token_vector m_tokens
HTML token storage.
Definition html.hpp:2670
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2337
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2439
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2666
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2189
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2665
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2294
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2361
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2614
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2276
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2209
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2404
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2671
const T * m_source
HTML source code.
Definition html.hpp:2669
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2668
const bool m_parse_frames
Parse frames.
Definition html.hpp:2667
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2380
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2422
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2393
Base class for HTML sequences.
Definition html.hpp:1314
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1317
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1316
sequence * parent
Parent sequence.
Definition html.hpp:1318
Token representing start HTML tag.
Definition html.hpp:2106
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2124
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2123
Token representing part of HTML text.
Definition html.hpp:2081
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2098
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2097
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2096
HTML token base class.
Definition html.hpp:1990
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2059
uintptr_t data
Any user-supplied data.
Definition html.hpp:2060
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2028
token_t type
Token type.
Definition html.hpp:2058
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2012
HTML token representing a URL.
Definition html.hpp:2141
token_url_t encoding
URL encoding.
Definition html.hpp:2156
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2155
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:120
Test for given string.
Definition parser.hpp:814
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:70
virtual void set(T value)
Set current progress.
Definition progress.hpp:52
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:42
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:685
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1102
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1012
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:926
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1022
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:810
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:980
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1040
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:946
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if expected element attribute value is a URI.
Definition html.hpp:1233
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:964
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1288
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:858
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1059
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:911
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1085
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:892
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:833
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1133
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:691
HTML entity.
Definition html.hpp:1599
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1601
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1600
Inserted HTML token.
Definition html.hpp:2162
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2166
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2164
size_t word_index
Index of the word the token is anchored to.
Definition html.hpp:2165
token * token
Points to the token.
Definition html.hpp:2163
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
Tag attribute.
Definition parser.hpp:8023
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8025