9#include "exception.hpp"
10#include "interval.hpp"
13#include "progress.hpp"
///
/// Appends the characters of `src` to `dst`. The characters & ; " ' < > and the value 0x00a0 each
/// have a dedicated `case` label; every other character is appended unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the line carrying the function name is not visible in this view. The basic_string
// overload further down forwards to escape(dst, src.data(), src.size()), so this is presumably the
// char overload of escape() - confirm.
// NOTE(review): every replacement literal below reads as the bare character itself, which would
// make the substitution a no-op. The literals look like entity text that was decoded before
// reaching this view - verify against the real file.
// NOTE(review): the switch operand is not visible. If it is a plain char, the 0x00a0 label (160)
// lies outside the range of a signed 8-bit char, so that case may be unreachable - confirm.
42 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
44 _Inout_ std::basic_string<char, TR, AX>& dst,
45 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
47 _Assume_(src || !num_chars);
48 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
50 case '&': dst +=
"&";
break;
51 case ';': dst +=
";";
break;
52 case '\"': dst +=
""";
break;
53 case '\'': dst +=
"'";
break;
54 case '<': dst +=
"<";
break;
55 case '>': dst +=
">";
break;
56 case 0x00a0: dst +=
" ";
break;
57 default: dst += src[i];
break;
///
/// Wide-character counterpart: appends the characters of `src` to `dst`. The characters
/// & ; " ' < > and U+00A0 each have a dedicated `case` label; every other character is appended
/// unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        Wide string the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the line carrying the function name is not visible in this view; presumably this
// is the wchar_t overload of escape() - confirm.
// NOTE(review): every replacement literal below reads as the bare character itself, which would
// make the substitution a no-op. The literals look like entity text that was decoded before
// reaching this view - verify against the real file.
69 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
71 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
72 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
74 _Assume_(src || !num_chars);
75 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
77 case L
'&': dst += L
"&";
break;
78 case L
';': dst += L
";";
break;
79 case L
'\"': dst += L
""";
break;
80 case L
'\'': dst += L
"'";
break;
81 case L
'<': dst += L
"<";
break;
82 case L
'>': dst += L
">";
break;
83 case L
'\u00a0': dst += L
" ";
break;
84 default: dst += src[i];
break;
///
/// Overload that takes the source as a reference to a fixed-size array of N elements of T.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array
///
// NOTE(review): neither the function name nor the body is visible in this view. Judging by the
// neighbouring overloads it presumably forwards to the pointer-and-length form - confirm.
95 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
97 _Inout_ std::basic_string<T, TR, AX>& dst,
98 _In_
const T (&src)[N])
///
/// Overload that takes the source as a std::basic_string and forwards its data() and size() to
/// escape().
///
/// The source string may use traits and allocator types (TR_src, AX_src) that differ from those of
/// the destination (TR_dst, AX_dst).
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source string
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
109 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
111 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
112 _In_
const std::basic_string<T, TR_src, AX_src>& src)
114 escape(dst, src.data(), src.size());
///
/// Appends a single character to `dst`. The characters & < > and the value 0x00a0 each have a
/// dedicated `case` label; any other character is appended unchanged.
///
/// \param[in,out] dst  String the output is appended to
/// \param[in]     chr  Character to append
///
// NOTE(review): the replacement literals below read as the bare characters themselves, which would
// make the substitution a no-op. They look like entity text that was decoded before reaching this
// view - verify against the real file.
// NOTE(review): the switch operand is not visible. If it is `chr`, the 0x00a0 label (160) lies
// outside the range of a signed 8-bit char, so that case may be unreachable - confirm.
123 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
124 void escape_min(_Inout_ std::basic_string<char, TR, AX>& dst, _In_
char chr)
127 case '&': dst +=
"&";
break;
128 case '<': dst +=
"<";
break;
129 case '>': dst +=
">";
break;
130 case 0x00a0: dst +=
" ";
break;
131 default: dst += chr;
break;
///
/// Wide-character counterpart: appends a single character to `dst`. The characters & < > and
/// U+00A0 each have a dedicated `case` label; any other character is appended unchanged.
///
/// \param[in,out] dst  Wide string the output is appended to
/// \param[in]     chr  Character to append
///
// NOTE(review): the replacement literals below read as the bare characters themselves, which would
// make the substitution a no-op. They look like entity text that was decoded before reaching this
// view - verify against the real file.
141 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
142 void escape_min(_Inout_ std::basic_string<wchar_t, TR, AX>& dst, _In_
wchar_t chr)
145 case L
'&': dst += L
"&";
break;
146 case L
'<': dst += L
"<";
break;
147 case L
'>': dst += L
">";
break;
148 case L
'\u00a0': dst += L
" ";
break;
149 default: dst += chr;
break;
///
/// Appends the characters of `src` to `dst`. Only the characters & < > and the value 0x00a0 have
/// dedicated `case` labels; every other character is appended unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the line carrying the function name is not visible in this view. The array overload
// further down calls escape_min(dst, src, N), so this is presumably the char pointer-and-length
// overload of escape_min() - confirm.
// NOTE(review): the replacement literals below read as the bare characters themselves; they look
// like entity text that was decoded before reaching this view - verify against the real file.
// NOTE(review): if the (not visible) switch operand is a plain char, the 0x00a0 label may be
// unreachable where char is signed - confirm.
160 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
162 _Inout_ std::basic_string<char, TR, AX>& dst,
163 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
165 _Assume_(src || !num_chars);
166 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
168 case '&': dst +=
"&";
break;
169 case '<': dst +=
"<";
break;
170 case '>': dst +=
">";
break;
171 case 0x00a0: dst +=
" ";
break;
172 default: dst += src[i];
break;
///
/// Wide-character counterpart: appends the characters of `src` to `dst`. Only the characters
/// & < > and U+00A0 have dedicated `case` labels; every other character is appended unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        Wide string the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the line carrying the function name is not visible in this view; presumably this
// is the wchar_t pointer-and-length overload of escape_min() - confirm.
// NOTE(review): the replacement literals below read as the bare characters themselves; they look
// like entity text that was decoded before reaching this view - verify against the real file.
184 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
186 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
187 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
189 _Assume_(src || !num_chars);
190 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
192 case L
'&': dst += L
"&";
break;
193 case L
'<': dst += L
"<";
break;
194 case L
'>': dst += L
">";
break;
195 case L
'\u00a0': dst += L
" ";
break;
196 default: dst += src[i];
break;
///
/// Overload that takes the source as a reference to a fixed-size array and forwards it to
/// escape_min() together with the array extent N.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array of N elements
///
// N is the full array extent. The pointer-and-length loops visible in this file also stop at the
// first zero character, so a terminator inside the array ends the scan early.
// NOTE(review): the line carrying this function's own name is not visible in this view.
207 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
209 _Inout_ std::basic_string<T, TR, AX>& dst,
210 _In_
const T (&src)[N])
212 escape_min(dst, src, N);
///
/// Overload that takes the source as a std::basic_string and forwards its data() and size() to
/// escape_min().
///
/// The source string may use traits and allocator types (TR_src, AX_src) that differ from those of
/// the destination (TR_dst, AX_dst).
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source string
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
221 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
223 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
224 _In_
const std::basic_string<T, TR_src, AX_src>& src)
226 escape_min(dst, src.data(), src.size());
///
/// Reads `src` and appends decoded output to `dst`. The part visible here combines two hexadecimal
/// digits into one output char: the first digit supplies the high nibble, the second the low nibble.
///
/// The outer loop has no increment expression; each path advances `i` itself. It ends after
/// `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the function name, the switch header and the code that leads into the digit
// parsing are not visible in this view. The array overload further down calls
// url_unescape(dst, src, N), and both failure paths re-emit a percent sign, so this is presumably
// the percent-decoding branch of url_unescape() - confirm.
// NOTE(review): the digit tests read src[i] without re-checking i < num_chars. If the lines not
// shown advance i past a percent sign first, a percent sign in the last one or two positions of a
// buffer that is not zero-terminated would be read past num_chars - confirm and guard if so.
236 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
238 _Inout_ std::basic_string<char, TR, AX>& dst,
239 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
241 _Assume_(src || !num_chars);
242 for (
size_t i = 0; i < num_chars && src[i];) {
// High nibble: accept 0-9, A-F or a-f and consume the digit.
252 if (
'0' <= src[i] && src[i] <=
'9') chr = (src[i++] -
'0') << 4;
253 else if (
'A' <= src[i] && src[i] <=
'F') chr = (src[i++] -
'A' + 10) << 4;
254 else if (
'a' <= src[i] && src[i] <=
'f') chr = (src[i++] -
'a' + 10) << 4;
// Not a hexadecimal digit: emit a percent sign and resume the outer loop without consuming src[i].
255 else { dst +=
'%';
continue; }
// Low nibble: same digit classes, merged into chr with a bitwise OR.
256 if (
'0' <= src[i] && src[i] <=
'9') chr |= (src[i++] -
'0');
257 else if (
'A' <= src[i] && src[i] <=
'F') chr |= (src[i++] -
'A' + 10);
258 else if (
'a' <= src[i] && src[i] <=
'f') chr |= (src[i++] -
'a' + 10);
// Second digit missing: emit a percent sign followed by the first digit, which was already consumed.
259 else { dst +=
'%'; dst += src[i - 1];
continue; }
// Both digits parsed: append the assembled value as one char.
261 dst +=
static_cast<char>(chr);
///
/// Overload that takes the source as a reference to a fixed-size char array and forwards it to
/// url_unescape() together with the array extent N.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array of N chars
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
277 template<
size_t N,
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
279 _Inout_ std::basic_string<char, TR, AX>& dst,
280 _In_
const char (&src)[N])
282 url_unescape(dst, src, N);
///
/// Overload that takes the source as a std::basic_string_view<char> (passed by value) and forwards
/// its data() and size() to url_unescape().
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source view
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
291 template<
class TR_dst = std::
char_traits<
char>,
class AX_dst = std::allocator<
char>>
293 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
294 _In_
const std::basic_string_view<
char, std::char_traits<char>> src)
296 url_unescape(dst, src.data(), src.size());
///
/// Appends a URL-encoded form of `src` to `dst`.
///
/// A space is written as a plus sign, and each punctuation character listed in the `case` labels
/// below is written as a fixed three-character %XX string. For the remaining characters the
/// visible code tests whether the byte lies strictly between 0x20 and 0x7f, and formats a byte as
/// two upper-case hexadecimal digits.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the function name and the switch header are not visible in this view. The array
// overload further down calls url_escape(dst, src, N), so this is presumably url_escape() - confirm.
306 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
308 _Inout_ std::basic_string<char, TR, AX>& dst,
309 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
311 _Assume_(src || !num_chars);
312 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
314 case ' ': dst +=
"+";
break;
315 case '<': dst +=
"%3C";
break;
316 case '>': dst +=
"%3E";
break;
317 case '#': dst +=
"%23";
break;
318 case '%': dst +=
"%25";
break;
319 case '{': dst +=
"%7B";
break;
320 case '}': dst +=
"%7D";
break;
321 case '|': dst +=
"%7C";
break;
322 case '\\': dst +=
"%5C";
break;
323 case '^': dst +=
"%5E";
break;
324 case '~': dst +=
"%7E";
break;
325 case '[': dst +=
"%5B";
break;
326 case ']': dst +=
"%5D";
break;
327 case '`': dst +=
"%60";
break;
328 case ';': dst +=
"%3B";
break;
329 case '/': dst +=
"%2F";
break;
330 case '?': dst +=
"%3F";
break;
331 case ':': dst +=
"%3A";
break;
332 case '@': dst +=
"%40";
break;
333 case '=': dst +=
"%3D";
break;
334 case '&': dst +=
"%26";
break;
335 case '$': dst +=
"%24";
break;
// Bytes strictly between 0x20 and 0x7f take this branch; its body is not visible in this view
// (presumably the character is copied as is - confirm).
337 if (0x20 <
static_cast<uint8_t
>(src[i]) &&
static_cast<uint8_t
>(src[i]) < 0x7f)
// Format the high nibble, then the low nibble, as one hexadecimal digit each (0-9, A-F).
// NOTE(review): the statement that emits the leading percent sign is not visible here.
341 uint8_t n = (
static_cast<uint8_t
>(src[i]) & 0xf0) >> 4;
342 dst += n < 10 ? static_cast<char>(
'0' + n) : static_cast<char>(
'A' + n - 10);
343 n = ((uint8_t)src[i] & 0x0f);
344 dst += n < 10 ? static_cast<char>(
'0' + n) : static_cast<char>(
'A' + n - 10);
///
/// Overload that takes the source as a reference to a fixed-size char array and forwards it to
/// url_escape() together with the array extent N.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array of N chars
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
356 template<
size_t N,
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
358 _Inout_ std::basic_string<char, TR, AX>& dst,
359 _In_
const char (&src)[N])
361 url_escape(dst, src, N);
///
/// Overload that takes the source as a std::basic_string_view<char> (passed by value) and forwards
/// its data() and size() to url_escape().
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source view
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
370 template<
class TR_dst = std::
char_traits<
char>,
class AX_dst = std::allocator<
char>>
372 _Inout_ std::basic_string<char, TR_dst, AX_dst>& dst,
373 _In_
const std::basic_string_view<
char, std::char_traits<char>> src)
375 url_escape(dst, src.data(), src.size());
///
/// Reads `src` and appends decoded output to `dst`, for any character type T.
///
/// The visible cases translate the letters n, r and t into newline, carriage return and tab, skip
/// a newline character without emitting anything, and accumulate a run of hexadecimal digits into
/// one value that is appended as a single T. Anything else is copied through unchanged.
///
/// The outer loop has no increment expression; each path advances `i` itself. It ends after
/// `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the function name, the leading `if`, the switch header and most hexadecimal-digit
// case labels are not visible in this view. The array overload further down calls
// css_unescape(dst, src, N), so this is presumably css_unescape() handling the character that
// follows an escape introducer - confirm.
385 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
387 _Inout_ std::basic_string<T, TR, AX>& dst,
388 _In_reads_or_z_opt_(num_chars)
const T* src, _In_
size_t num_chars)
390 _Assume_(src || !num_chars);
391 for (
size_t i = 0; i < num_chars && src[i];) {
// Taken only when at least one more character is available after position i.
394 else if (i + 1 < num_chars) {
399 case 'n': dst +=
'\n'; i++;
break;
400 case 'r': dst +=
'\r'; i++;
break;
401 case 't': dst +=
'\t'; i++;
break;
// A newline character is consumed without producing output.
404 case '\n': i++;
break;
422 case 'F':
case 'f': {
// The digit scan below never looks beyond six characters from the current position.
424 size_t end = std::min(num_chars, i + 6);
426 for (; i < end; ++i) {
427 if (
'0' <= src[i] && src[i] <=
'9') chr = chr * 0x10 + src[i] -
'0';
428 else if (
'A' <= src[i] && src[i] <=
'F') chr = chr * 0x10 + src[i] -
'A' + 10;
429 else if (
'a' <= src[i] && src[i] <=
'f') chr = chr * 0x10 + src[i] -
'a' + 10;
// NOTE(review): the declaration of chr is not visible; the cast to T may narrow the value when T
// is a narrow character type - confirm this is intended.
433 dst +=
static_cast<T
>(chr);
// A space directly after the digits gets special handling; the body is not visible here.
435 if (i < end && src[i] ==
' ') {
442 default: dst += src[i++];
///
/// Overload that takes the source as a reference to a fixed-size array and forwards it to
/// css_unescape() together with the array extent N.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array of N elements
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
454 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
456 _Inout_ std::basic_string<T, TR, AX>& dst,
457 _In_
const T (&src)[N])
459 css_unescape(dst, src, N);
///
/// Overload that takes the source as a std::basic_string and forwards its data() and size() to
/// css_unescape().
///
/// The source string may use traits and allocator types (TR_src, AX_src) that differ from those of
/// the destination (TR_dst, AX_dst).
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source string
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
468 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
470 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
471 _In_
const std::basic_string<T, TR_src, AX_src>& src)
473 css_unescape(dst, src.data(), src.size());
///
/// Appends the characters of `src` to `dst`, writing backslash, newline, carriage return, tab,
/// double quote and single quote as two-character backslash sequences. Every other character is
/// appended unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        String the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the function name and the switch header are not visible in this view. The array
// overload further down calls css_escape(dst, src, N), so this is presumably the char overload of
// css_escape() - confirm.
483 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
485 _Inout_ std::basic_string<char, TR, AX>& dst,
486 _In_reads_or_z_opt_(num_chars)
const char* src, _In_
size_t num_chars)
488 _Assume_(src || !num_chars);
489 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
491 case '\\': dst +=
"\\\\";
break;
492 case '\n': dst +=
"\\n";
break;
493 case '\r': dst +=
"\\r";
break;
494 case '\t': dst +=
"\\t";
break;
495 case '\"': dst +=
"\\\"";
break;
496 case '\'': dst +=
"\\'";
break;
497 default: dst += src[i];
break;
///
/// Wide-character counterpart: appends the characters of `src` to `dst`, writing backslash,
/// newline, carriage return, tab, double quote and single quote as two-character backslash
/// sequences. Every other character is appended unchanged.
///
/// Reading stops after `num_chars` characters or at the first zero character, whichever comes first.
///
/// \param[in,out] dst        Wide string the output is appended to
/// \param[in]     src        Source characters; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in]     num_chars  Upper bound on the number of characters read from `src`
///
// NOTE(review): the function name and the switch header are not visible in this view; presumably
// this is the wchar_t overload of css_escape() - confirm.
509 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
511 _Inout_ std::basic_string<wchar_t, TR, AX>& dst,
512 _In_reads_or_z_opt_(num_chars)
const wchar_t* src, _In_
size_t num_chars)
514 _Assume_(src || !num_chars);
515 for (
size_t i = 0; i < num_chars && src[i]; ++i) {
517 case L
'\\': dst += L
"\\\\";
break;
518 case L
'\n': dst += L
"\\n";
break;
519 case L
'\r': dst += L
"\\r";
break;
520 case L
'\t': dst += L
"\\t";
break;
521 case L
'\"': dst += L
"\\\"";
break;
522 case L
'\'': dst += L
"\\'";
break;
523 default: dst += src[i];
break;
///
/// Overload that takes the source as a reference to a fixed-size array and forwards it to
/// css_escape() together with the array extent N.
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source array of N elements
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
534 template<
class T,
size_t N,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
536 _Inout_ std::basic_string<T, TR, AX>& dst,
537 _In_
const T (&src)[N])
539 css_escape(dst, src, N);
///
/// Overload that takes the source as a std::basic_string and forwards its data() and size() to
/// css_escape().
///
/// The source string may use traits and allocator types (TR_src, AX_src) that differ from those of
/// the destination (TR_dst, AX_dst).
///
/// \param[in,out] dst  Destination string
/// \param[in]     src  Source string
///
// NOTE(review): the line carrying this function's own name is not visible in this view.
548 template<
class T,
class TR_dst = std::
char_traits<T>,
class AX_dst = std::allocator<T>,
class TR_src = std::
char_traits<T>,
class AX_src = std::allocator<T>>
550 _Inout_ std::basic_string<T, TR_dst, AX_dst>& dst,
551 _In_
const std::basic_string<T, TR_src, AX_src>& src)
553 css_escape(dst, src.data(), src.size());
559 enum class element_t {
675 enum class element_span_t {
///
/// Returns the element_span_t value recorded for an element code.
///
/// Codes from element_t::a through element_t::xmp are looked up in the table below at index
/// (code - element_t::a); any code outside that range yields element_span_t::needs_end.
///
/// \param[in] code  Element code
///
/// \returns Table entry for `code`, or element_span_t::needs_end when `code` is out of range
///
691 static element_span_t
span(_In_ element_t code)
// One entry per element code, addressed by position. The element_t enumerator list is not visible
// in this view, so the pairing of entries with elements cannot be checked here.
// NOTE(review): the order presumably has to follow the enumerator order from a to xmp - confirm
// whenever an enumerator is added or moved.
// NOTE(review): the array is declared static but not const; nothing visible here writes to it.
693 static element_span_t lookup[] = {
694 element_span_t::needs_end,
695 element_span_t::needs_end,
696 element_span_t::needs_end,
697 element_span_t::needs_end,
698 element_span_t::needs_end,
699 element_span_t::immediate,
700 element_span_t::needs_end,
701 element_span_t::immediate,
702 element_span_t::immediate,
703 element_span_t::needs_end,
704 element_span_t::immediate,
705 element_span_t::needs_end,
706 element_span_t::needs_end,
707 element_span_t::needs_end,
708 element_span_t::end_optional,
709 element_span_t::immediate,
710 element_span_t::needs_end,
711 element_span_t::needs_end,
712 element_span_t::needs_end,
713 element_span_t::needs_end,
714 element_span_t::needs_end,
715 element_span_t::immediate,
716 element_span_t::end_optional,
717 element_span_t::needs_end,
718 element_span_t::end_optional,
719 element_span_t::needs_end,
720 element_span_t::needs_end,
721 element_span_t::needs_end,
722 element_span_t::needs_end,
723 element_span_t::needs_end,
724 element_span_t::end_optional,
725 element_span_t::needs_end,
726 element_span_t::immediate,
727 element_span_t::needs_end,
728 element_span_t::needs_end,
729 element_span_t::needs_end,
730 element_span_t::immediate,
731 element_span_t::needs_end,
732 element_span_t::needs_end,
733 element_span_t::needs_end,
734 element_span_t::needs_end,
735 element_span_t::needs_end,
736 element_span_t::needs_end,
737 element_span_t::needs_end,
738 element_span_t::end_optional,
739 element_span_t::immediate,
740 element_span_t::end_optional,
741 element_span_t::needs_end,
742 element_span_t::needs_end,
743 element_span_t::immediate,
744 element_span_t::immediate,
745 element_span_t::needs_end,
746 element_span_t::immediate,
747 element_span_t::needs_end,
748 element_span_t::needs_end,
749 element_span_t::needs_end,
750 element_span_t::end_optional,
751 element_span_t::immediate,
752 element_span_t::needs_end,
753 element_span_t::needs_end,
754 element_span_t::needs_end,
755 element_span_t::needs_end,
756 element_span_t::immediate,
757 element_span_t::immediate,
758 element_span_t::needs_end,
759 element_span_t::needs_end,
760 element_span_t::needs_end,
761 element_span_t::needs_end,
762 element_span_t::needs_end,
763 element_span_t::needs_end,
764 element_span_t::needs_end,
765 element_span_t::end_optional,
766 element_span_t::end_optional,
767 element_span_t::immediate,
768 element_span_t::end_optional,
769 element_span_t::needs_end,
770 element_span_t::needs_end,
771 element_span_t::immediate,
772 element_span_t::needs_end,
773 element_span_t::needs_end,
774 element_span_t::needs_end,
775 element_span_t::needs_end,
776 element_span_t::needs_end,
777 element_span_t::needs_end,
778 element_span_t::needs_end,
779 element_span_t::needs_end,
780 element_span_t::needs_end,
781 element_span_t::needs_end,
782 element_span_t::needs_end,
783 element_span_t::needs_end,
784 element_span_t::needs_end,
785 element_span_t::end_optional,
786 element_span_t::end_optional,
787 element_span_t::needs_end,
788 element_span_t::end_optional,
789 element_span_t::end_optional,
790 element_span_t::end_optional,
791 element_span_t::needs_end,
792 element_span_t::end_optional,
793 element_span_t::needs_end,
794 element_span_t::needs_end,
795 element_span_t::needs_end,
796 element_span_t::needs_end,
797 element_span_t::immediate,
798 element_span_t::needs_end,
// Range check first, so the table is never indexed with a code outside a..xmp.
800 return element_t::a <= code && code <= element_t::xmp ?
801 lookup[
static_cast<size_t>(code) -
static_cast<size_t>(element_t::a)] :
802 element_span_t::needs_end;
818 case element_t::strike:
819 case element_t::blink:
821 case element_t::small:
836 case element_t::strong:
838 case element_t::code:
839 case element_t::samp:
842 case element_t::cite:
843 case element_t::abbr:
844 case element_t::acronym:
861 case element_t::applet:
862 case element_t::object:
863 case element_t::embed:
864 case element_t::font:
865 case element_t::basefont:
869 case element_t::script:
874 case element_t::ruby:
875 case element_t::span:
877 case element_t::iframe:
878 case element_t::nobr:
892 case element_t::input:
893 case element_t::select:
894 case element_t::textarea:
895 case element_t::label:
896 case element_t::button:
910 code == element_t::PCDATA ||
947 case element_t::menu:
962 case element_t::listing:
982 case element_t::center:
983 case element_t::marquee:
984 case element_t::noscript:
985 case element_t::noframes:
986 case element_t::noembed:
987 case element_t::blockquote:
988 case element_t::form:
989 case element_t::isindex:
991 case element_t::table:
992 case element_t::fieldset:
993 case element_t::address:
1017 case element_t::title:
1018 case element_t::isindex:
1019 case element_t::base:
1020 case element_t::nextid:
1034 case element_t::script:
1035 case element_t::style:
1036 case element_t::meta:
1037 case element_t::link:
1038 case element_t::object:
1052 case element_t::img:
1053 case element_t::object:
1054 case element_t::applet:
1055 case element_t::embed:
1056 case element_t::big:
1057 case element_t::small:
1058 case element_t::sub:
1059 case element_t::sup:
1060 case element_t::ruby:
1061 case element_t::font:
1062 case element_t::basefont:
1063 case element_t::nobr:
1077 case element_t::head:
1078 case element_t::body:
1079 case element_t::frameset:
1096 case element_t::col:
1097 case element_t::colgroup:
1099 case element_t::dir:
1101 case element_t::frame:
1102 case element_t::iframe:
1103 case element_t::legend:
1122 if (child == element_t::unknown || child == element_t::comment)
1130 case element_t::a:
return is_inline(child) && child != element_t::a;
1131 case element_t::address:
return is_inline(child) || child == element_t::p;
1132 case element_t::applet:
return is_flow(child) || child == element_t::param;
1133 case element_t::area:
return false;
1134 case element_t::base:
return false;
1135 case element_t::basefont:
return false;
1136 case element_t::bdo:
return is_inline(child);
1137 case element_t::blockquote:
return is_flow(child);
1138 case element_t::body:
return is_flow(child) || child == element_t::ins || child == element_t::del;
1139 case element_t::br:
return false;
1140 case element_t::button:
return is_flow(child) && !
is_formctrl(child) && child != element_t::a && child != element_t::form && child != element_t::isindex && child != element_t::fieldset && child != element_t::iframe;
1141 case element_t::caption:
return is_inline(child);
1142 case element_t::center:
return is_flow(child);
1143 case element_t::col:
return false;
1144 case element_t::colgroup:
return child == element_t::col;
1145 case element_t::comment:
return child == element_t::CDATA;
1146 case element_t::dd:
return is_flow(child);
1147 case element_t::del:
return is_flow(child);
1148 case element_t::dir:
return child == element_t::li;
1149 case element_t::div:
return is_flow(child);
1150 case element_t::dl:
return child == element_t::dt || child == element_t::dd;
1151 case element_t::dt:
return is_inline(child);
1152 case element_t::embed:
return is_flow(child) || child == element_t::param;
1153 case element_t::fieldset:
return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
1154 case element_t::font:
return is_inline(child);
1155 case element_t::form:
return is_flow(child) && child != element_t::form;
1156 case element_t::frame:
return false;
1157 case element_t::frameset:
return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
1159 case element_t::hr:
return false;
1161 case element_t::iframe:
return is_flow(child);
1162 case element_t::img:
return false;
1163 case element_t::input:
return false;
1164 case element_t::ins:
return is_flow(child);
1165 case element_t::isindex:
return false;
1166 case element_t::label:
return is_inline(child) && child != element_t::label;
1167 case element_t::legend:
return is_inline(child);
1168 case element_t::li:
return is_flow(child);
1169 case element_t::link:
return false;
1170 case element_t::listing:
return child == element_t::CDATA;
1171 case element_t::map:
return is_block(child) || child == element_t::area;
1172 case element_t::marquee:
return is_flow(child);
1173 case element_t::menu:
return child == element_t::li;
1174 case element_t::meta:
return false;
1175 case element_t::nobr:
return is_inline(child) || child == element_t::wbr;
1176 case element_t::noframes:
return (
is_flow(child) || child == element_t::body) && child != element_t::noframes;
1177 case element_t::noscript:
return is_flow(child);
1178 case element_t::noembed:
return is_flow(child);
1179 case element_t::object:
return is_flow(child) || child == element_t::param;
1180 case element_t::ol:
return child == element_t::li;
1181 case element_t::optgroup:
return child == element_t::option;
1182 case element_t::option:
return child == element_t::PCDATA;
1183 case element_t::p:
return is_inline(child);
1184 case element_t::param:
return false;
1185 case element_t::plaintext:
return is_flow(child);
1187 case element_t::q:
return is_inline(child);
1188 case element_t::rt:
return false;
1189 case element_t::ruby:
return is_inline(child);
1190 case element_t::script:
return child == element_t::CDATA;
1191 case element_t::select:
return child == element_t::optgroup || child == element_t::option;
1192 case element_t::span:
return is_inline(child);
1193 case element_t::style:
return child == element_t::CDATA;
1194 case element_t::sub:
return is_inline(child);
1195 case element_t::sup:
return is_inline(child);
1196 case element_t::table:
return child == element_t::caption || child == element_t::col || child == element_t::colgroup || child == element_t::thead || child == element_t::tfoot || child == element_t::tbody;
1197 case element_t::tbody:
return child == element_t::tr;
1198 case element_t::td:
return is_flow(child);
1199 case element_t::textarea:
return child == element_t::PCDATA;
1200 case element_t::tfoot:
return child == element_t::tr;
1201 case element_t::th:
return is_flow(child);
1202 case element_t::thead:
return child == element_t::tr;
1203 case element_t::title:
return child == element_t::PCDATA;
1204 case element_t::tr:
return child == element_t::td || child == element_t::th;
1205 case element_t::ul:
return child == element_t::li;
1206 case element_t::wbr:
return false;
1207 case element_t::unknown:
return true;
///
/// Tests an attribute name against the attribute names listed below for the given element.
///
/// Each visible case compares `attr_name` (limited to `num_chars` characters) with one or more
/// fixed names through stdex::strnicmp and returns the negated comparison result.
///
/// \param[in] code       Element code that selects which attribute names are checked
/// \param[in] attr_name  Attribute name; may be null only when `num_chars` is zero (see _Assume_)
/// \param[in] num_chars  Upper bound on the number of characters read from `attr_name`
///
/// \returns true when one of the comparisons for `code` reports zero
///
// NOTE(review): stdex::strnicmp is defined elsewhere; presumably it compares case-insensitively and
// returns zero on equality, so `!strnicmp(...)` means the names match - confirm.
// NOTE(review): the template header that introduces T, the switch header and the result for
// element codes without a case are not visible in this view.
1220 static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars)
const T* attr_name, _In_
size_t num_chars)
1222 _Assume_(attr_name || !num_chars);
1224 case element_t::a:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1225 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1226 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1227 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1228 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1229 case element_t::base:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1230 case element_t::bgsound:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1231 case element_t::blockquote:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1232 case element_t::body:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1233 case element_t::comment:
return !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX);
1234 case element_t::del:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1235 case element_t::embed:
return !stdex::strnicmp(attr_name, num_chars,
"pluginspage", SIZE_MAX) ||
1236 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1237 case element_t::form:
return !stdex::strnicmp(attr_name, num_chars,
"action", SIZE_MAX);
1238 case element_t::frame:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1239 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1240 case element_t::head:
return !stdex::strnicmp(attr_name, num_chars,
"profile", SIZE_MAX);
1241 case element_t::iframe:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1242 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1243 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"longdesc", SIZE_MAX) ||
1244 !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1245 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1246 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1247 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"lowsrc", SIZE_MAX) ||
1248 !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX) ||
1249 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1250 case element_t::ins:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1251 case element_t::link:
return !stdex::strnicmp(attr_name, num_chars,
"href", SIZE_MAX);
1252 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"basehref", SIZE_MAX) ||
1253 !stdex::strnicmp(attr_name, num_chars,
"classid", SIZE_MAX) ||
1254 !stdex::strnicmp(attr_name, num_chars,
"code", SIZE_MAX) ||
1255 !stdex::strnicmp(attr_name, num_chars,
"codebase", SIZE_MAX) ||
1256 !stdex::strnicmp(attr_name, num_chars,
"data", SIZE_MAX) ||
1257 !stdex::strnicmp(attr_name, num_chars,
"usemap", SIZE_MAX);
1258 case element_t::q:
return !stdex::strnicmp(attr_name, num_chars,
"cite", SIZE_MAX);
1259 case element_t::script:
return !stdex::strnicmp(attr_name, num_chars,
"src", SIZE_MAX);
1260 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1261 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1262 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"background", SIZE_MAX);
1277 _Assume_(attr_name || !num_chars);
1278 if (!stdex::strnicmp(attr_name, num_chars,
"title", SIZE_MAX))
1281 case element_t::applet:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1282 case element_t::area:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1283 case element_t::img:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1284 case element_t::input:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1285 case element_t::object:
return !stdex::strnicmp(attr_name, num_chars,
"alt", SIZE_MAX);
1286 case element_t::table:
return !stdex::strnicmp(attr_name, num_chars,
"summary", SIZE_MAX);
1287 case element_t::td:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1288 case element_t::th:
return !stdex::strnicmp(attr_name, num_chars,
"abbr", SIZE_MAX);
1295 using sequence_store = std::vector<std::unique_ptr<sequence>>;
1303 stdex::parser::html_sequence_t
type;
1307 sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_
size_t start = 0,
size_t end = 0, _In_opt_
sequence* _parent =
nullptr) :
1324 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1325 code(element_code(src + tag.name.start, tag.name.size())),
1326 name(std::move(tag.name)),
///
/// Translates an element name into its element_t code.
///
/// The name is searched by bisection in the static name-to-code table below. Characters of `name`
/// are passed through stdex::tolower before they are compared, and the name is treated as ended at
/// `num_chars` characters or at the first zero character.
///
/// \param[in] name       Element name
/// \param[in] num_chars  Upper bound on the number of characters read from `name`
///
/// \returns Code of the matching table entry, or element_t::unknown when the search finds none
///
// NOTE(review): the template header that introduces T, the field declarations of the table
// struct and parts of the comparison loop are not visible in this view.
1331 static element_t element_code(_In_reads_z_(num_chars)
const T*
name,
size_t num_chars)
// Name-to-code table. The search further down bisects it, so the entries have to stay sorted by
// name; all names listed here are lower-case.
1333 static const struct {
1337 {
"a", element_t::a, },
1338 {
"abbr", element_t::abbr, },
1339 {
"acronym", element_t::acronym, },
1340 {
"address", element_t::address, },
1341 {
"applet", element_t::applet, },
1342 {
"area", element_t::area, },
1343 {
"b", element_t::b, },
1344 {
"base", element_t::base, },
1345 {
"basefont", element_t::basefont, },
1346 {
"bdo", element_t::bdo, },
1347 {
"bgsound", element_t::bgsound, },
1348 {
"big", element_t::big, },
1349 {
"blink", element_t::blink, },
1350 {
"blockquote", element_t::blockquote, },
1351 {
"body", element_t::body, },
1352 {
"br", element_t::br, },
1353 {
"button", element_t::button, },
1354 {
"caption", element_t::caption, },
1355 {
"center", element_t::center, },
1356 {
"cite", element_t::cite, },
1357 {
"code", element_t::code, },
1358 {
"col", element_t::col, },
1359 {
"colgroup", element_t::colgroup, },
1360 {
"comment", element_t::comment, },
1361 {
"dd", element_t::dd, },
1362 {
"del", element_t::del, },
1363 {
"dfn", element_t::dfn, },
1364 {
"dir", element_t::dir, },
1365 {
"div", element_t::div, },
1366 {
"dl", element_t::dl, },
1367 {
"dt", element_t::dt, },
1368 {
"em", element_t::em, },
1369 {
"embed", element_t::embed, },
1370 {
"fieldset", element_t::fieldset, },
1371 {
"font", element_t::font, },
1372 {
"form", element_t::form, },
1373 {
"frame", element_t::frame, },
1374 {
"frameset", element_t::frameset, },
1375 {
"h1", element_t::h1, },
1376 {
"h2", element_t::h2, },
1377 {
"h3", element_t::h3, },
1378 {
"h4", element_t::h4, },
1379 {
"h5", element_t::h5, },
1380 {
"h6", element_t::h6, },
1381 {
"head", element_t::head, },
1382 {
"hr", element_t::hr, },
1383 {
"html", element_t::html, },
1384 {
"i", element_t::i, },
1385 {
"iframe", element_t::iframe, },
1386 {
"img", element_t::img, },
1387 {
"input", element_t::input, },
1388 {
"ins", element_t::ins, },
1389 {
"isindex", element_t::isindex, },
1390 {
"kbd", element_t::kbd, },
1391 {
"label", element_t::label, },
1392 {
"legend", element_t::legend, },
1393 {
"li", element_t::li, },
1394 {
"link", element_t::link, },
1395 {
"listing", element_t::listing, },
1396 {
"map", element_t::map, },
1397 {
"marquee", element_t::marquee, },
1398 {
"menu", element_t::menu, },
1399 {
"meta", element_t::meta, },
1400 {
"nextid", element_t::nextid, },
1401 {
"nobr", element_t::nobr, },
1402 {
"noembed", element_t::noembed, },
1403 {
"noframes", element_t::noframes, },
1404 {
"noscript", element_t::noscript, },
1405 {
"object", element_t::object, },
1406 {
"ol", element_t::ol, },
1407 {
"optgroup", element_t::optgroup, },
1408 {
"option", element_t::option, },
1409 {
"p", element_t::p, },
1410 {
"param", element_t::param, },
1411 {
"plaintext", element_t::plaintext, },
1412 {
"pre", element_t::pre, },
1413 {
"q", element_t::q, },
1414 {
"rt", element_t::rt, },
1415 {
"ruby", element_t::ruby, },
1416 {
"s", element_t::s, },
1417 {
"samp", element_t::samp, },
1418 {
"script", element_t::script, },
1419 {
"select", element_t::select, },
1420 {
"small", element_t::small, },
1421 {
"span", element_t::span, },
1422 {
"strike", element_t::strike, },
1423 {
"strong", element_t::strong, },
1424 {
"style", element_t::style, },
1425 {
"sub", element_t::sub, },
1426 {
"sup", element_t::sup, },
1427 {
"table", element_t::table, },
1428 {
"tbody", element_t::tbody, },
1429 {
"td", element_t::td, },
1430 {
"textarea", element_t::textarea, },
1431 {
"tfoot", element_t::tfoot, },
1432 {
"th", element_t::th, },
1433 {
"thead", element_t::thead, },
1434 {
"title", element_t::title, },
1435 {
"tr", element_t::tr, },
1436 {
"tt", element_t::tt, },
1437 {
"u", element_t::u, },
1438 {
"ul", element_t::ul, },
1439 {
"var", element_t::var, },
1440 {
"wbr", element_t::wbr, },
1441 {
"xmp", element_t::xmp, },
// The next two loops walk the whole table, the second one character by character through each
// name. Their bodies are not visible in this view (presumably sanity checks on ordering and
// letter case - confirm).
1445 for (
size_t i = 1; i < _countof(
mapping); i++)
1447 for (
size_t i = 0; i < _countof(
mapping); i++) {
1448 for (
size_t j = 0;
mapping[i].name[j]; j++)
// Bisection over the half-open index range [i, j); m is the probe position.
1452 for (
size_t i = 0, j = _countof(
mapping); i < j; ) {
1453 size_t m = (i + j) / 2;
// Character-by-character comparison of the probed table name (index i1) with the input (index i2).
1455 for (
size_t i1 = 0, i2 = 0;;) {
// r becomes 0 when the input has ended as well, -1 when input characters remain.
1457 r = i2 >= num_chars || !
name[i2] ? 0 : -1;
1460 if (i2 >= num_chars || !
name[i2]) {
// Input characters are lower-cased and narrowed to char before the comparison.
1465 auto chr =
static_cast<char>(stdex::tolower(
name[i2++]));
// Reached when the search finishes without returning a table entry.
1484 return element_t::unknown;
1519 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1520 code(element::element_code(src + tag.name.start, tag.name.size())),
1521 name(std::move(tag.name)),
1539 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1540 name(std::move(tag.name)),
1557 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1573 sequence(tag.type, tag.interval.start, tag.interval.end,
parent),
1584 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1594 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1600 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
1642 void append(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1644 _Assume_(
source || !num_chars);
1651 if (m_condition_end.match(
source, i, num_chars)) {
1653 m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
1664 if (m_condition_end.match(
source, i, num_chars)) {
1681 if (m_condition_start.match(
source, i, num_chars)) {
1682 auto condition_src(
replace_entities(
source + m_condition_start.condition.start, m_condition_start.condition.size()));
1683 if (stdex::strncmp(condition_src.data(), condition_src.size(),
"CDATA", SIZE_MAX) == 0)
1685 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"RCDATA", SIZE_MAX) == 0)
1689 else if (stdex::strncmp(condition_src.data(), condition_src.size(),
"IGNORE", SIZE_MAX) == 0)
1701 if (m_tag.match(
source, i, num_chars) &&
1702 m_tag.type == stdex::parser::html_sequence_t::element_end &&
1703 element::element_code(
source + m_tag.name.start, m_tag.name.size()) == parent->code)
1708 std::unique_ptr<element_end> e(
new element_end(std::move(m_tag),
source, parent->parent, parent));
1709 parent->end = e.get();
1718 if (m_tag.match(
source, i, num_chars)) {
1723 switch (m_tag.type) {
1724 case stdex::parser::html_sequence_t::element:
1725 case stdex::parser::html_sequence_t::element_start: {
1726 std::unique_ptr<element> e(
1727 m_tag.type == stdex::parser::html_sequence_t::element ?
new element(std::move(m_tag),
source) :
1728 m_tag.type == stdex::parser::html_sequence_t::element_start ?
new element_start(std::move(m_tag),
source) :
1734 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1736 e->parent = starting_tag;
1739 e->parent = starting_tag->parent;
1740 starting_tag->end = e.get();
1744 if (e->type == stdex::parser::html_sequence_t::element_start) {
1747 e_start->
end = e.get();
1751 case element_t::code:
1752 case element_t::comment:
1753 case element_t::script:
1754 case element_t::style:
1761 if (e->code == element_t::meta &&
m_charset == stdex::charset_id::system) {
1762 bool is_content_type =
false;
1764 for (
auto& attr : e->attributes) {
1765 if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"http-equiv", SIZE_MAX) &&
1766 !stdex::strnicmp(
source + attr.value.start, attr.value.size(),
"content-type", SIZE_MAX))
1767 is_content_type =
true;
1768 else if (!stdex::strnicmp(
source + attr.name.start, attr.name.size(),
"content", SIZE_MAX))
1769 content_attr = &attr;
1771 if (is_content_type && content_attr) {
1778 str.reserve(content.charset.size());
1779 for (
size_t j = content.charset.start; j < content.charset.end; ++j)
1780 str.push_back(
static_cast<char>(
source[j]));
1781 m_charset = stdex::charset_from_name(str);
1789 case stdex::parser::html_sequence_t::element_end: {
1794 _Assume_(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
1795 if (starting_tag->code == e->code ||
1796 starting_tag->code == element_t::unknown && e->code == element_t::unknown && !stdex::strnicmp(
source + starting_tag->name.start, starting_tag->name.size(),
source + e->name.start, e->name.size()))
1798 e->start = starting_tag;
1799 e->parent = starting_tag->parent;
1800 starting_tag->end = e.get();
1809 case stdex::parser::html_sequence_t::declaration:
1810 if (m_tag.attributes.size() > 3 &&
1811 !stdex::strnicmp(
source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(),
"entity", SIZE_MAX))
1813 if (!stdex::strncmp(
source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(),
"%", SIZE_MAX) &&
1814 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"SYSTEM", SIZE_MAX) &&
1815 stdex::strncmp(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(),
"PUBLIC", SIZE_MAX))
1818 e->name = m_tag.attributes[2].name;
1819 e->value = std::move(
replace_entities(
source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
1827 case stdex::parser::html_sequence_t::comment:
1830 case stdex::parser::html_sequence_t::instruction:
1834 throw std::invalid_argument(
"unknown tag type");
1841 if (m_any_char.match(
source, i, num_chars)) {
1865 void assign(_In_reads_or_z_opt_(num_chars)
const T*
source, _In_
size_t num_chars)
1877 friend class parser<T, TR, AX>;
1891 std::basic_string<T, TR, AX>
replace_entities(_In_reads_or_z_opt_(num_chars)
const T* input, _In_
size_t num_chars)
const
1893 _Assume_(input || !num_chars);
1894 const size_t num_entities =
m_entities.size();
1896 std::basic_string<T, TR, AX> output;
1897 for (
size_t i = 0; i < num_chars && input[i];) {
1898 if (input[i] ==
'%') {
1899 for (
size_t j = 0; j < num_entities; j++) {
1901 size_t entity_size = e->name.size();
1902 if (i + entity_size + 1 < num_chars &&
1903 !stdex::strncmp(input + i + 1,
source + e->name.start, entity_size) &&
1904 input[i + entity_size + 1] ==
';')
1907 i += entity_size + 2;
1911 throw std::runtime_error(
"undefined entity");
1913 output += input[i++];
1944 enum class token_t {
1955 constexpr size_t token_tag_max =
1964 constexpr char token_tag_start =
'\x12';
1970 constexpr char token_tag_end =
'\x13';
1978 token(_In_ token_t _type = token_t::root, _In_opt_
sequence* _sequence =
nullptr, _In_ uintptr_t _data = 0) :
1984 template<
class T,
class TR,
class AX>
1997 template<
class TR = std::
char_traits<
char>,
class AX = std::allocator<
char>>
1998 size_t append_tag(_Inout_ std::basic_string<char, TR, AX>& str)
const
2000 size_t n = str.size();
2002 stdex::appendf(str,
"%c%zX%c", stdex::locale_C, token_tag_start,
reinterpret_cast<uintptr_t
>(
this), token_tag_end);
2003 return str.size() - n;
2013 template<
class TR = std::
char_traits<
wchar_t>,
class AX = std::allocator<
wchar_t>>
2014 size_t append_tag(_Inout_ std::basic_string<wchar_t, TR, AX>& str)
const
2017 return stdex::appendf(str, L
"%c%zX%c", stdex::locale_C,
static_cast<wchar_t>(token_tag_start),
reinterpret_cast<uintptr_t
>(
this),
static_cast<wchar_t>(token_tag_end));
2021 static token* parse_tag(
const T* str,
size_t& offset)
2023 if (str[offset] !=
static_cast<T
>(token_tag_start))
2028 for (end = offset + 1; ; end++) {
2031 if (str[end] == token_tag_end)
2036 token* t =
reinterpret_cast<token*
>(stdex::strtouint<T, uintptr_t>(str + offset + 1, end - offset - 1,
nullptr, 16));
2038 throw std::invalid_argument(
"null token");
2049 using token_vector = std::vector<std::unique_ptr<token>>;
2050 using token_list = std::list<token*>;
2055 enum text_type_flag_t : uint32_t {
2056 has_tokens = 1 << 0,
2065 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2070 _In_ token_t
type = token_t::complete,
2071 _In_reads_or_z_opt_(num_chars)
const T* _text =
nullptr, _In_
size_t num_chars = 0,
2072 _In_ uint32_t _text_type = 0,
2075 text(_text, num_chars),
2079 friend class parser<T, TR, AX>;
2090 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2095 _In_reads_or_z_opt_(num_chars_text)
const T* _text =
nullptr, _In_
size_t num_chars_text = 0,
2096 _In_reads_or_z_opt_(num_chars_name)
const T* _name =
nullptr, _In_
size_t num_chars_name = 0,
2100 _In_ uintptr_t
data = 0) :
2102 name(_name, num_chars_name),
2106 friend class parser<T, TR, AX>;
2116 enum class token_url_t {
2125 template<
class T,
class TR = std::
char_traits<T>,
class AX = std::allocator<T>>
2130 _In_reads_or_z_opt_(num_chars)
const T* _url =
nullptr, _In_
size_t num_chars = 0,
2131 token_url_t _encoding = token_url_t::plain,
2134 url(_url, num_chars),
2138 friend class parser<T, TR, AX>;
2141 std::basic_string<T, TR, AX>
url;
2155 using inserted_token_list = std::list<inserted_token>;
2157 template<
class T,
class TR,
class AX>
2163 _In_reads_or_z_opt_(num_chars)
const stdex::schar_t* url =
nullptr, _In_
size_t num_chars = 0,
2166 m_url(url, stdex::strnlen(url, num_chars)),
2199 t->type == token_t::complete ||
2200 t->type == token_t::starting ||
2201 t->type == token_t::ending ||
2202 t->type == token_t::root);
2204 if (t->text_type & has_tokens) {
2205 const T* root = t->text.data();
2206 for (
size_t i = 0, num_chars = t->text.size(); i < num_chars && root[i];) {
2207 _Assume_(root[i] != token_tag_end);
2208 const token* t2 = token::parse_tag(root, i);
2211 case token_t::complete:
2212 case token_t::starting:
2213 case token_t::ending:
2217 case token_t::url: {
2219 switch (t2_url->encoding) {
2220 case token_url_t::plain:
2221 source += t2_url->
url;
2223 case token_url_t::sgml:
2224 escape(source, t2_url->url.data(), t2_url->url.size());
2226 case token_url_t::css:
2227 css_escape(source, t2_url->url.data(), t2_url->url.size());
2230 throw std::invalid_argument(
"unsupported URL encoding");
2235 throw std::invalid_argument(
"unsupported token type");
2238 else if (t->text_type & has_text) {
2239 escape_min(source, root[i]);
2243 source += root[i++];
2246 else if (t->text_type & has_text) {
2248 escape_min(source, t->text.data(), t->text.size());
2262 static void start_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens, _In_ token_list::const_iterator from)
2264 for (; from != new_tokens.cend(); ++from) {
2266 t->append_tag(source);
2267 active_tokens.push_back(t);
2280 token_list::const_iterator
end_tokens(_Inout_ std::basic_string<T, TR, AX>& source, _Inout_ token_list& active_tokens, _In_
const token_list& new_tokens)
2283 token_list::const_iterator i1, i2;
2284 for (i1 = active_tokens.cbegin(), i2 = new_tokens.cbegin(); i1 != active_tokens.cend(); ++i1, ++i2) {
2285 if (i2 == new_tokens.cend() || *i1 != *i2) {
2288 for (
auto i = active_tokens.cend(); i != active_tokens.cbegin(); ) {
2290 _Assume_(t1 && t1->type == token_t::starting);
2293 t2->text.reserve(t1->name.size() + 3);
2296 t2->text += t1->name;
2302 active_tokens.erase(i);
2305 active_tokens.erase(i);
2306 i = active_tokens.cend();
2324 _In_
size_t word_index, _In_
bool after_word,
2325 _Inout_ token_list& active_tokens)
2327 for (
auto i = inserted_tokens.begin(); i != inserted_tokens.end(); ) {
2330 if (t.word_index == word_index && t.after_word == after_word) {
2331 if (t.token->type != token_t::ending)
2332 start_tokens(source, active_tokens, t.active_tokens,
end_tokens(source, active_tokens, t.active_tokens));
2333 t.token->append_tag(source);
2334 inserted_tokens.erase(i++);
2347 static void merge(_Inout_ token_list& a, _In_
const token_list& b)
2349 for (
auto i2 = b.begin(); i2 != b.end(); ++i2) {
2351 for (
auto i1 = a.begin(); i1 != a.end(); ++i1) {
2352 if (i1 == a.end()) {
2368 _Unreferenced_(rel);
2389 template <
class T_token>
2394 auto t =
token.get();
2407 template <
class T_token>
2408 size_t append_token(_Inout_ std::unique_ptr<T_token>&&
token, _Inout_ std::basic_string<T, TR, AX>& source)
2445 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_start)) == stdex::npos &&
2446 stdex::strnchr(
m_source + s->interval.start, s->interval.size(),
static_cast<T
>(token_tag_end)) == stdex::npos);
2448 if (s->type == stdex::parser::html_sequence_t::text) {
2449 rel.from = s->interval.start;
2450 token->mapping.push_back(rel);
2451 stdex::sgml2strcat(
token->text,
m_source + s->interval.start, s->interval.size(), 0, rel, &
token->mapping);
2452 rel.to =
token->text.size();
2453 if (!(
token->text_type & has_text) &&
2454 !stdex::isblank(
m_source + s->interval.start, s->interval.size()))
2455 token->text_type |= has_text;
2458 else if (s->type == stdex::parser::html_sequence_t::element || s->type == stdex::parser::html_sequence_t::element_start) {
2461 const element_start* s_el_start = s->type == stdex::parser::html_sequence_t::element_start ?
static_cast<const element_start*
>(s.get()) :
nullptr;
2463 throw std::invalid_argument(
"<frameset> detected");
2466 size_t offset = s->interval.start;
2467 std::unique_ptr<text_token<T, TR, AX>> t(s->type == stdex::parser::html_sequence_t::element ||
element_traits::span(s_el_start->
code) == element_span_t::immediate ?
2473 if (a.value.empty() ||
2474 stdex::isblank(
m_source + a.value.start, a.value.size()))
2478 t->text.append(
m_source + offset, a.value.start - offset);
2483 stdex::sgml2strcat(t_url->url,
m_source + a.value.start, a.value.size());
2485 t->text_type |= has_tokens;
2486 offset = a.value.end;
2489 t->text.append(
m_source + offset, a.value.start - offset);
2493 has_text | is_title,
2496 t_value->mapping.push_back(rel_value);
2497 stdex::sgml2strcat(t_value->text,
m_source + a.value.start, a.value.size(), 0, rel_value, &t_value->mapping);
2499 t->text_type |= has_tokens;
2500 offset = a.value.end;
2504 t->text.append(
m_source + offset, s->interval.end - offset);
2505 rel.from = s->interval.start;
2506 token->mapping.push_back(rel);
2508 token->text_type |= has_tokens;
2513 if (s_el_start->
code == element_t::address ||
2514 s_el_start->
code == element_t::code ||
2515 s_el_start->
code == element_t::comment ||
2516 s_el_start->
code == element_t::cite ||
2517 s_el_start->
code == element_t::kbd ||
2518 s_el_start->
code == element_t::samp ||
2519 s_el_start->
code == element_t::script ||
2520 s_el_start->
code == element_t::style)
2523 auto s_end = s_el_start->
end;
2526 if (s->interval.end < s_end->interval.start) {
2527 if (s_el_start->
code != element_t::style) {
2528 rel.from = s->interval.start;
2529 token->mapping.push_back(rel);
2533 m_source + s->interval.end, s_end->interval.start - s->interval.end,
2540 auto t =
parse_css(s->interval.end, s_end->interval.start);
2542 rel.from = s->interval.start;
2543 token->mapping.push_back(rel);
2544 rel.to += t->append_tag(
token->text);
2546 token->text_type |= has_tokens;
2553 while (limit != end && limit->get() != s_el_start->
end)
2555 auto t =
parse(limit,
2558 rel.from = s->interval.start;
2559 token->mapping.push_back(rel);
2560 rel.to += t->append_tag(
token->text);
2561 token->text_type |= has_tokens;
2565 else if (s->type == stdex::parser::html_sequence_t::element_end) {
2566 rel.from = s->interval.start;
2567 token->mapping.push_back(rel);
2571 m_source + s->interval.start, s->interval.size(),
2575 token->text_type |= has_tokens;
2580 rel.from = s->interval.start;
2581 token->mapping.push_back(rel);
2585 m_source + s->interval.start, s->interval.size(),
2589 token->text_type |= has_tokens;
2603 std::unique_ptr<text_token<T, TR, AX>>
token(
2611 if (m_css_comment.match(
m_source, start, end)) {
2615 else if (m_css_cdo.match(
m_source, start, end)) {
2619 else if (m_css_cdc.match(
m_source, start, end)) {
2624 m_css_import.match(
m_source, start, end) && (section = m_css_import.
interval, content = m_css_import.content,
true) ||
2625 m_css_uri.match(
m_source, start, end) && (section = m_css_uri.
interval, content = m_css_uri.content,
true))
2627 std::unique_ptr<url_token<T, TR, AX>> t_url(
2636 token->text_type |= has_tokens;
2637 start = section.
end;
2639 else if (m_any_char.match(
m_source, start, end)) {
HTML declaration.
Definition html.hpp:1535
stdex::interval< size_t > name
Declaration name position in source.
Definition html.hpp:1545
std::vector< stdex::parser::html_attribute > attributes
Declaration attribute positions in source.
Definition html.hpp:1546
HTML document.
Definition html.hpp:1602
bool m_is_rcdata
Inside of RCDATA?
Definition html.hpp:1928
const std::basic_string< T, TR, AX > & source() const
Returns document HTML source code.
Definition html.hpp:1875
void append(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML source code by chunks.
Definition html.hpp:1642
size_t m_num_valid_conditions
Number of started valid conditions.
Definition html.hpp:1925
size_t m_num_invalid_conditions
Number of started invalid conditions.
Definition html.hpp:1926
bool m_is_cdata
Inside of CDATA?
Definition html.hpp:1927
stdex::charset_id m_charset
Document charset.
Definition html.hpp:1922
sequence_store m_sequences
Store of sequences.
Definition html.hpp:1936
element_start * active_element() const
Returns starting tag of currently active element or nullptr if no element is known to be started.
Definition html.hpp:1883
size_t m_num_parsed
Number of characters already parsed.
Definition html.hpp:1921
std::vector< element_start * > m_element_stack
LIFO stack of started elements.
Definition html.hpp:1937
void finalize()
Finalizes document when no more appending is planned.
Definition html.hpp:1853
std::basic_string< T, TR, AX > replace_entities(_In_reads_or_z_opt_(num_chars) const T *input, size_t num_chars) const
Replaces entities with their content.
Definition html.hpp:1891
void assign(_In_reads_or_z_opt_(num_chars) const T *source, size_t num_chars)
Parses HTML document source code.
Definition html.hpp:1865
bool m_is_special_element
Inside of a special element (<SCRIPT>, <STYLE>, ...)?
Definition html.hpp:1938
std::vector< std::unique_ptr< entity< T, TR, AX > > > m_entities
Array of entities.
Definition html.hpp:1932
void clear()
Empties document.
Definition html.hpp:1621
std::basic_string< T, TR, AX > m_source
Document HTML source code.
Definition html.hpp:1920
Ending tag of an HTML element </...>
Definition html.hpp:1515
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1527
element_start * start
Corresponding starting tag.
Definition html.hpp:1528
element_t code
Element code.
Definition html.hpp:1526
Starting tag of an HTML element <...>
Definition html.hpp:1499
sequence * end
Corresponding ending tag of type element_end; When element is ended by a start of another element,...
Definition html.hpp:1508
HTML element <.../>
Definition html.hpp:1320
stdex::interval< size_t > name
Element name position in source.
Definition html.hpp:1489
std::vector< stdex::parser::html_attribute > attributes
Element attribute positions in source.
Definition html.hpp:1490
element_t code
Element code.
Definition html.hpp:1488
HTML instruction.
Definition html.hpp:1569
stdex::interval< size_t > content
Instruction content position in source.
Definition html.hpp:1578
HTML parser.
Definition html.hpp:2159
token_vector m_tokens
HTML token storage.
Definition html.hpp:2656
void append_inserted_tokens(std::basic_string< T, TR, AX > &source, inserted_token_list &inserted_tokens, size_t word_index, bool after_word, token_list &active_tokens)
Adds matching inserted tokens before/after the given word in source code.
Definition html.hpp:2323
text_token< T, TR, AX > * parse(const sequence_store::const_iterator &end, uint32_t text_type=0)
Recursively parses HTML document.
Definition html.hpp:2425
const stdex::sstring m_url
Absolute document URL.
Definition html.hpp:2652
text_token< T, TR, AX > * parse()
Parses HTML document.
Definition html.hpp:2175
const document< T, TR, AX > & m_document
Document being analyzed.
Definition html.hpp:2651
token_list::const_iterator end_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens)
Pops ending tokens from the active token list and appends their tags to the source code string.
Definition html.hpp:2280
static void merge(token_list &a, const token_list &b)
Adds tokens from list b to list a, creating a union.
Definition html.hpp:2347
text_token< T, TR, AX > * parse_css(size_t start, size_t end)
Parses CSS.
Definition html.hpp:2600
static void start_tokens(std::basic_string< T, TR, AX > &source, token_list &active_tokens, const token_list &new_tokens, token_list::const_iterator from)
Pushes tokens to the active token list and appends their tags to the source code string.
Definition html.hpp:2262
static void link(std::basic_string< T, TR, AX > &source, const text_token< T, TR, AX > *t)
Rebuilds HTML source code from the token tree.
Definition html.hpp:2195
T_token * append_token(std::unique_ptr< T_token > &&token)
Adds token to the collection.
Definition html.hpp:2390
sequence_store::const_iterator m_offset
Index of active section.
Definition html.hpp:2657
const T * m_source
HTML source code.
Definition html.hpp:2655
stdex::progress< size_t > * m_progress
Progress indicator.
Definition html.hpp:2654
const bool m_parse_frames
Parse frames.
Definition html.hpp:2653
void make_absolute_url(std::basic_string< T, TR, AX > &rel)
Converts URL to absolute.
Definition html.hpp:2366
size_t append_token(std::unique_ptr< T_token > &&token, std::basic_string< T, TR, AX > &source)
Adds token to the collection and appends its tag to the source code string.
Definition html.hpp:2408
const token_vector & tokens() const
Returns collection of tokens.
Definition html.hpp:2379
Base class for HTML sequences.
Definition html.hpp:1301
stdex::interval< size_t > interval
Sequence position in source.
Definition html.hpp:1304
stdex::parser::html_sequence_t type
Sequence type. Enum is used for performance reasons (vs. dynamic_cast)
Definition html.hpp:1303
sequence * parent
Parent sequence.
Definition html.hpp:1305
Token representing start HTML tag.
Definition html.hpp:2092
stdex::html::sequence * end_sequence
Ending tag sequence.
Definition html.hpp:2110
std::basic_string< T, TR, AX > name
Element name allowing later recreation of ending </tag>
Definition html.hpp:2109
Token representing part of HTML text.
Definition html.hpp:2067
stdex::mapping_vector< size_t > mapping
Mapping between source and text positions.
Definition html.hpp:2084
uint32_t text_type
Mask of text_type_flag_t to specify text content.
Definition html.hpp:2083
std::basic_string< T, TR, AX > text
Token text.
Definition html.hpp:2082
HTML token base class.
Definition html.hpp:1976
sequence * sequence
Pointer to the sequence this token represents or nullptr when it doesn't trivially represent one sequ...
Definition html.hpp:2045
uintptr_t data
Any user-supplied data.
Definition html.hpp:2046
size_t append_tag(std::basic_string< wchar_t, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:2014
token_t type
Token type.
Definition html.hpp:2044
size_t append_tag(std::basic_string< char, TR, AX > &str) const
Appends token tag to the source code.
Definition html.hpp:1998
HTML token representing a URL.
Definition html.hpp:2127
token_url_t encoding
URL encoding.
Definition html.hpp:2142
std::basic_string< T, TR, AX > url
URL.
Definition html.hpp:2141
stdex::interval< size_t > interval
Region of the last match.
Definition parser.hpp:116
Test for given string.
Definition parser.hpp:831
Progress indicator base class.
Definition progress.hpp:22
virtual bool cancel()
Query whether user requested abort.
Definition progress.hpp:68
virtual void set(T value)
Set current progress.
Definition progress.hpp:50
virtual void set_range(T start, T end)
Set progress range extent.
Definition progress.hpp:40
User cancelled exception.
Definition exception.hpp:17
Describes attributes associated with a HTML element.
Definition html.hpp:685
static bool is_group(element_t code)
Does element represent a separate part of text?
Definition html.hpp:1090
static bool is_flow(element_t code)
Does element typically represent text?
Definition html.hpp:1004
static bool is_heading(element_t code)
Does element represent a heading?
Definition html.hpp:922
static bool is_head_content(element_t code)
Is element part of the document head?
Definition html.hpp:1014
static bool is_fontstyle(element_t code)
Does element represent font styling?
Definition html.hpp:810
static bool is_block(element_t code)
Is element typically displayed as a stand-alone section of text?
Definition html.hpp:973
static bool is_head_misc(element_t code)
May element be a part of document head?
Definition html.hpp:1031
static bool is_list(element_t code)
Does element represent a list of items?
Definition html.hpp:941
static bool is_uri(element_t code, _In_reads_or_z_opt_(num_chars) const T *attr_name, size_t num_chars)
Checks if the expected element attribute value is a URI.
Definition html.hpp:1220
static bool is_preformatted(element_t code)
Does element represent preformatted text, source code etc.?
Definition html.hpp:958
static bool is_localizable(element_t code, const T *attr_name, size_t num_chars)
Checks if expected element attribute value is localizable.
Definition html.hpp:1275
static bool is_special(element_t code)
Does element represent non-textual item in the document?
Definition html.hpp:856
static bool is_pre_exclusion(element_t code)
May element be a part of <pre></pre>?
Definition html.hpp:1049
static bool is_inline(element_t code)
Is element typically displayed inline with text?
Definition html.hpp:907
static bool is_html_content(element_t code)
Does element represent the document body?
Definition html.hpp:1074
static bool is_formctrl(element_t code)
Does element represent a form control?
Definition html.hpp:889
static bool is_phrase(element_t code)
Does element represent a phrase-of-speech?
Definition html.hpp:832
static bool may_contain(element_t parent, element_t child)
Checks if one element may nest inside another.
Definition html.hpp:1120
static element_span_t span(element_t code)
Returns expected element span in HTML code.
Definition html.hpp:691
HTML entity.
Definition html.hpp:1586
std::basic_string< T, TR, AX > value
Entity value.
Definition html.hpp:1588
stdex::interval< size_t > name
Name position in source.
Definition html.hpp:1587
Inserted HTML token.
Definition html.hpp:2148
bool after_word
true if token is anchored after the word; false if anchored before the word
Definition html.hpp:2152
std::list< stdex::html::token * > active_tokens
List of started tokens at inserted token.
Definition html.hpp:2150
size_t word_index
Index of the word, token is anchored to.
Definition html.hpp:2151
token * token
Points to the token.
Definition html.hpp:2149
Numerical interval.
Definition interval.hpp:18
T size() const
Returns interval size.
Definition interval.hpp:47
T end
interval end
Definition interval.hpp:20
T start
interval start
Definition interval.hpp:19
Maps index in source string to index in destination string.
Definition mapping.hpp:17
Tag attribute.
Definition parser.hpp:8040
stdex::interval< size_t > value
attribute value position in source
Definition parser.hpp:8042