/* SPDX-License-Identifier: MIT Copyright © 2016-2026 Amebis */ #pragma once #include "assert.hpp" #include "compat.hpp" #include "exception.hpp" #include "interval.hpp" #include "mapping.hpp" #include "parser.hpp" #include "progress.hpp" #include "sgml.hpp" #include "string.hpp" #include "system.hpp" #include "unicode.hpp" #include #include #include #include #include #include #include #include #ifdef _WIN32 #undef small #endif #if defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunknown-pragmas" #endif namespace stdex { namespace html { /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case '&': dst += "&"; break; case ';': dst += ";"; break; case '\"': dst += """; break; case '\'': dst += "'"; break; case '<': dst += "<"; break; case '>': dst += ">"; break; case 0x00a0: dst += " "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. 
default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case L'&': dst += L"&"; break; case L';': dst += L";"; break; case L'\"': dst += L"""; break; case L'\'': dst += L"'"; break; case L'<': dst += L"<"; break; case L'>': dst += L">"; break; case L'\u00a0': dst += L" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char16_t* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case L'&': dst += u"&"; break; case L';': dst += u";"; break; case L'\"': dst += u"""; break; case L'\'': dst += u"'"; break; case L'<': dst += u"<"; break; case L'>': dst += u">"; break; case L'\u00a0': dst += u" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. 
default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void escape( _Inout_ std::basic_string& dst, _In_ const T (&src)[N]) { escape(dst, src, N); } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator, class TR_src = std::char_traits, class AX_src = std::allocator> void escape( _Inout_ std::basic_string& dst, _In_ const std::basic_string& src) { escape(dst, src.data(), src.size()); } /// /// Appends HTML escaped character /// /// \param[in,out] dst String to append to /// \param[in] chr Source character /// template, class AX = std::allocator> void escape_min(_Inout_ std::basic_string& dst, _In_ char chr) { switch (chr) { case '&': dst += "&"; break; case '<': dst += "<"; break; case '>': dst += ">"; break; case 0x00a0: dst += " "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. default: dst += chr; break; } } /// /// Appends HTML escaped character /// /// \param[in,out] dst String to append to /// \param[in] chr Source character /// template, class AX = std::allocator> void escape_min(_Inout_ std::basic_string& dst, _In_ wchar_t chr) { switch (chr) { case L'&': dst += L"&"; break; case L'<': dst += L"<"; break; case L'>': dst += L">"; break; case L'\u00a0': dst += L" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. 
default: dst += chr; break; } } /// /// Appends HTML escaped character /// /// \param[in,out] dst String to append to /// \param[in] chr Source character /// template, class AX = std::allocator> void escape_min(_Inout_ std::basic_string& dst, _In_ char16_t chr) { switch (chr) { case L'&': dst += u"&"; break; case L'<': dst += u"<"; break; case L'>': dst += u">"; break; case L'\u00a0': dst += u" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. default: dst += chr; break; } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape_min( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); stdex_assert(src || !num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case '&': dst += "&"; break; case '<': dst += "<"; break; case '>': dst += ">"; break; case 0x00a0: dst += " "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. 
default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape_min( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const wchar_t* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case L'&': dst += L"&"; break; case L'<': dst += L"<"; break; case L'>': dst += L">"; break; case L'\u00a0': dst += L" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void escape_min( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char16_t* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 2)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case L'&': dst += u"&"; break; case L'<': dst += u"<"; break; case L'>': dst += u">"; break; case L'\u00a0': dst += u" "; break; // No-break space must be escaped as SGML entity, otherwise browsers treat it as a normal space. 
default: dst += src[i]; break; } } } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void escape_min( _Inout_ std::basic_string& dst, _In_ const T (&src)[N]) { escape_min(dst, src, N); } /// /// Appends HTML escaped string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator, class TR_src = std::char_traits, class AX_src = std::allocator> void escape_min( _Inout_ std::basic_string& dst, _In_ const std::basic_string& src) { escape_min(dst, src.data(), src.size()); } /// /// Appends unescaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void url_unescape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars) { stdex_assert(src || !num_chars); for (size_t i = 0; i < num_chars && src[i];) { switch (src[i]) { case '+': dst += ' '; i++; break; case '%': { i++; char chr; if ('0' <= src[i] && src[i] <= '9') chr = static_cast((src[i++] - '0') << 4); else if ('A' <= src[i] && src[i] <= 'F') chr = static_cast((src[i++] - 'A' + 10) << 4); else if ('a' <= src[i] && src[i] <= 'f') chr = static_cast((src[i++] - 'a' + 10) << 4); else { dst += '%'; continue; } if ('0' <= src[i] && src[i] <= '9') chr |= static_cast((src[i++] - '0')); else if ('A' <= src[i] && src[i] <= 'F') chr |= static_cast((src[i++] - 'A' + 10)); else if ('a' <= src[i] && src[i] <= 'f') chr |= static_cast((src[i++] - 'a' + 10)); else { dst += '%'; dst += src[i - 1]; continue; } dst += chr; break; } default: dst += src[i++]; } } } /// /// Appends unescaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void url_unescape( _Inout_ std::basic_string& dst, _In_ const 
char (&src)[N]) { url_unescape(dst, src, N); } /// /// Appends unescaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator> void url_unescape( _Inout_ std::basic_string& dst, _In_ const std::basic_string_view> src) { url_unescape(dst, src.data(), src.size()); } /// /// Appends escaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void url_escape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const char* src, _In_ size_t num_chars) { stdex_assert(src || !num_chars); for (size_t i = 0; i < num_chars && src[i]; ++i) { switch (src[i]) { case ' ': dst += "%20"; break; case '<': dst += "%3C"; break; case '>': dst += "%3E"; break; case '#': dst += "%23"; break; case '%': dst += "%25"; break; case '{': dst += "%7B"; break; case '}': dst += "%7D"; break; case '|': dst += "%7C"; break; case '\\': dst += "%5C"; break; case '^': dst += "%5E"; break; case '~': dst += "%7E"; break; case '[': dst += "%5B"; break; case ']': dst += "%5D"; break; case '`': dst += "%60"; break; case ';': dst += "%3B"; break; case '+': dst += "%2B"; break; case '/': dst += "%2F"; break; case '?': dst += "%3F"; break; case ':': dst += "%3A"; break; case '@': dst += "%40"; break; case '=': dst += "%3D"; break; case '&': dst += "%26"; break; case '$': dst += "%24"; break; default: if (0x20 < static_cast(src[i]) && static_cast(src[i]) < 0x7f) dst += src[i]; else { dst += '%'; uint8_t n = (static_cast(src[i]) & 0xf0) >> 4; dst += n < 10 ? static_cast('0' + n) : static_cast('A' + n - 10); n = ((uint8_t)src[i] & 0x0f); dst += n < 10 ? 
static_cast('0' + n) : static_cast('A' + n - 10); } } } } /// /// Appends escaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void url_escape( _Inout_ std::basic_string& dst, _In_ const char (&src)[N]) { url_escape(dst, src, N); } /// /// Appends escaped URL string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator> void url_escape( _Inout_ std::basic_string& dst, _In_ const std::basic_string_view> src) { url_escape(dst, src.data(), src.size()); } /// /// Appends unescaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void css_unescape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars) { stdex_assert(src || !num_chars); for (size_t i = 0; i < num_chars && src[i];) { if (src[i] != '\\') dst += src[i++]; else if (i + 1 < num_chars) { i++; switch (src[i]) { // Classic escapes case 'n': dst += '\n'; i++; break; case 'r': dst += '\r'; i++; break; case 't': dst += '\t'; i++; break; // `\` at the end of the line case '\n': i++; break; // `\nnnn` escape case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'A': case 'a': case 'B': case 'b': case 'C': case 'c': case 'D': case 'd': case 'E': case 'e': case 'F': case 'f': { char32_t chr = 0; size_t end = std::min(num_chars, i + 6); for (; i < end; ++i) { if ('0' <= src[i] && src[i] <= '9') chr = chr * 0x10 + src[i] - '0'; else if ('A' <= src[i] && src[i] <= 'F') chr = chr * 0x10 + src[i] - 'A' + 10; else if ('a' <= src[i] && src[i] <= 'f') chr = chr * 0x10 + src[i] - 'a' + 10; else break; } dst += static_cast(chr); if (i < end && src[i] == ' ') { // Skip space after `\nnnn`. 
i++; } break; } default: dst += src[i++]; } } } } /// /// Appends unescaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void css_unescape( _Inout_ std::basic_string& dst, _In_ const T (&src)[N]) { css_unescape(dst, src, N); } /// /// Appends unescaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator, class TR_src = std::char_traits, class AX_src = std::allocator> void css_unescape( _Inout_ std::basic_string& dst, _In_ const std::basic_string& src) { css_unescape(dst, src.data(), src.size()); } /// /// Appends escaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// \param[in] num_chars Code unit limit in string `src` /// template, class AX = std::allocator> void css_escape( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(num_chars) const T* src, _In_ size_t num_chars) { num_chars = stdex::strnlen(src, num_chars); dst.reserve(dst.size() + num_chars + (num_chars >> 3)); for (size_t i = 0; i < num_chars; ++i) { switch (src[i]) { case '\\': dst += '\\'; dst+= '\\'; break; case '\n': dst += '\\'; dst+= 'n'; break; case '\r': dst += '\\'; dst+= 'r'; break; case '\t': dst += '\\'; dst+= 't'; break; case '\"': dst += '\\'; dst+= '"'; break; case '\'': dst += '\\'; dst+= '\''; break; default: dst += src[i]; break; } } } /// /// Appends escaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX = std::allocator> void css_escape( _Inout_ std::basic_string& dst, _In_ const T (&src)[N]) { css_escape(dst, src, N); } /// /// Appends escaped CSS string /// /// \param[in,out] dst String to append to /// \param[in] src Source string /// template, class AX_dst = std::allocator, class TR_src = std::char_traits, class AX_src = std::allocator> void css_escape( _Inout_ std::basic_string& dst, _In_ const 
std::basic_string& src) { css_escape(dst, src.data(), src.size()); } /// /// HTML element type /// enum class element_t { empty = 0, a, abbr, acronym, address, applet, area, b, base, basefont, bdo, bgsound, // Microsoft Specific big, blink, // Microsoft Specific blockquote, body, br, button, caption, center, cite, code, col, colgroup, comment, // Microsoft Specific dd, del, dfn, dir, div, dl, dt, em, embed, // Microsoft Specific fieldset, font, form, frame, frameset, h1, h2, h3, h4, h5, h6, head, hr, html, i, iframe, img, input, ins, isindex, kbd, label, legend, li, link, listing, // Microsoft Specific map, marquee, // Microsoft Specific menu, meta, nextid, // Microsoft Specific nobr, // Microsoft Specific noembed, // Microsoft Specific noframes, noscript, object, ol, optgroup, option, p, param, plaintext, // Microsoft Specific pre, q, rt, // Microsoft Specific ruby, // Microsoft Specific s, samp, script, select, small, span, strike, strong, style, sub, sup, table, tbody, td, textarea, tfoot, th, thead, title, tr, tt, u, ul, var, wbr, // Microsoft Specific xmp, // Microsoft Specific unknown = -1, PCDATA = -2, CDATA = -3, }; /// /// Expected pairing of and /// enum class element_span_t { needs_end = 0, ///< May start and end in a single ; otherwise, needs explicit end (e.g. `...`) end_optional, ///< End is optional. May not contain the same type child elements. (e.g. `

`) immediate, ///< Never spans. Only or forms. (e.g. `
`) }; /// /// Describes attributes associated with a HTML element /// struct element_traits { /// /// Returns expected element span in HTML code /// /// \param[in] code Element code /// static element_span_t span(_In_ element_t code) { static element_span_t lookup[] = { element_span_t::needs_end, // a element_span_t::needs_end, // abbr element_span_t::needs_end, // acronym element_span_t::needs_end, // address element_span_t::needs_end, // applet element_span_t::immediate, // area element_span_t::needs_end, // b element_span_t::immediate, // base element_span_t::immediate, // basefont element_span_t::needs_end, // bdo element_span_t::immediate, // bgsound element_span_t::needs_end, // big element_span_t::needs_end, // blink element_span_t::needs_end, // blockquote element_span_t::end_optional, // body element_span_t::immediate, // br element_span_t::needs_end, // button element_span_t::needs_end, // caption element_span_t::needs_end, // center element_span_t::needs_end, // cite element_span_t::needs_end, // code element_span_t::immediate, // col element_span_t::end_optional, // colgroup element_span_t::needs_end, // comment element_span_t::end_optional, // dd element_span_t::needs_end, // del element_span_t::needs_end, // dfn element_span_t::needs_end, // dir element_span_t::needs_end, // div element_span_t::needs_end, // dl element_span_t::end_optional, // dt element_span_t::needs_end, // em element_span_t::immediate, // embed element_span_t::needs_end, // fieldset element_span_t::needs_end, // font element_span_t::needs_end, // form element_span_t::immediate, // frame element_span_t::needs_end, // frameset element_span_t::needs_end, // h1 element_span_t::needs_end, // h2 element_span_t::needs_end, // h3 element_span_t::needs_end, // h4 element_span_t::needs_end, // h5 element_span_t::needs_end, // h6 element_span_t::end_optional, // head element_span_t::immediate, // hr element_span_t::end_optional, // html element_span_t::needs_end, // i 
element_span_t::needs_end, // iframe element_span_t::immediate, // img element_span_t::immediate, // input element_span_t::needs_end, // ins element_span_t::immediate, // isindex element_span_t::needs_end, // kbd element_span_t::needs_end, // label element_span_t::needs_end, // legend element_span_t::end_optional, // li element_span_t::immediate, // link element_span_t::needs_end, // listing element_span_t::needs_end, // map element_span_t::needs_end, // marquee element_span_t::needs_end, // menu element_span_t::immediate, // meta element_span_t::immediate, // nextid element_span_t::needs_end, // nobr element_span_t::needs_end, // noembed element_span_t::needs_end, // noframes element_span_t::needs_end, // noscript element_span_t::needs_end, // object element_span_t::needs_end, // ol element_span_t::needs_end, // optgroup element_span_t::end_optional, // option element_span_t::end_optional, // p element_span_t::immediate, // param element_span_t::end_optional, // plaintext element_span_t::needs_end, // pre element_span_t::needs_end, // q element_span_t::immediate, // rt element_span_t::needs_end, // ruby element_span_t::needs_end, // s element_span_t::needs_end, // samp element_span_t::needs_end, // script element_span_t::needs_end, // select element_span_t::needs_end, // small element_span_t::needs_end, // span element_span_t::needs_end, // strike element_span_t::needs_end, // strong element_span_t::needs_end, // style element_span_t::needs_end, // sub element_span_t::needs_end, // sup element_span_t::needs_end, // table element_span_t::end_optional, // tbody element_span_t::end_optional, // td element_span_t::needs_end, // textarea element_span_t::end_optional, // tfoot element_span_t::end_optional, // th element_span_t::end_optional, // thead element_span_t::needs_end, // title element_span_t::end_optional, // tr element_span_t::needs_end, // tt element_span_t::needs_end, // u element_span_t::needs_end, // ul element_span_t::needs_end, // var 
element_span_t::immediate, // wbr element_span_t::needs_end, // xmp }; return element_t::a <= code && code <= element_t::xmp ? lookup[static_cast(code) - static_cast(element_t::a)] : element_span_t::needs_end; } /// /// Does element represent font styling? /// /// \param[in] code Element code /// static bool is_fontstyle(_In_ element_t code) { switch (code) { case element_t::tt: case element_t::i: case element_t::b: case element_t::u: case element_t::s: case element_t::strike: case element_t::blink: case element_t::big: case element_t::small: return true; default: return false; }; } /// /// Does element represent a phrase-of-speech? /// /// \param[in] code Element code /// static bool is_phrase(_In_ element_t code) { switch (code) { case element_t::em: case element_t::strong: case element_t::dfn: case element_t::code: case element_t::samp: case element_t::kbd: case element_t::var: case element_t::cite: case element_t::abbr: case element_t::acronym: case element_t::xmp: return true; default: return false; }; } /// /// Does element represent non-textual item in the document? /// /// \param[in] code Element code /// static bool is_special(_In_ element_t code) { switch (code) { case element_t::a: case element_t::img: case element_t::applet: case element_t::object: case element_t::embed: case element_t::font: case element_t::basefont: case element_t::br: case element_t::wbr: case element_t::rt: case element_t::script: case element_t::map: case element_t::q: case element_t::sub: case element_t::sup: case element_t::ruby: case element_t::span: case element_t::bdo: case element_t::iframe: case element_t::nobr: return true; default: return false; }; } /// /// Does element represent a form control? 
/// /// \param[in] code Element code /// static bool is_formctrl(_In_ element_t code) { switch (code) { case element_t::input: case element_t::select: case element_t::textarea: case element_t::label: case element_t::button: return true; default: return false; }; } /// /// Is element typically displayed inline with text? /// /// \param[in] code Element code /// static bool is_inline(_In_ element_t code) { return code == element_t::PCDATA || is_fontstyle(code) || is_phrase(code) || is_special(code) || is_formctrl(code); } /// /// Does element represent a heading? /// /// \param[in] code Element code /// static bool is_heading(_In_ element_t code) { switch (code) { case element_t::h1: case element_t::h2: case element_t::h3: case element_t::h4: case element_t::h5: case element_t::h6: return true; default: return false; }; } /// /// Does element represent a list of items? /// /// \param[in] code Element code /// static bool is_list(_In_ element_t code) { switch (code) { case element_t::ul: case element_t::ol: case element_t::dir: case element_t::menu: return true; default: return false; }; } /// /// Does element represent preformatted text, source code etc.? /// /// \param[in] code Element code /// static bool is_preformatted(_In_ element_t code) { switch (code) { case element_t::pre: case element_t::listing: return true; default: return false; } } /// /// Is element typically displayed as a stand-alone section of text? 
/// /// \param[in] code Element code /// static bool is_block(_In_ element_t code) { if (is_heading(code) || is_list(code) || is_preformatted(code)) return true; switch (code) { case element_t::p: case element_t::dl: case element_t::div: case element_t::center: case element_t::marquee: case element_t::noscript: case element_t::noframes: case element_t::noembed: case element_t::blockquote: case element_t::form: case element_t::isindex: case element_t::hr: case element_t::table: case element_t::fieldset: case element_t::address: return true; default: return false; }; } /// /// Does element typically represent text? /// /// \param[in] code Element code /// static bool is_flow(_In_ element_t code) { return is_block(code) || is_inline(code); } /// /// Is element part of the document head? /// /// \param[in] code Element code /// static bool is_head_content(_In_ element_t code) { switch (code) { case element_t::title: case element_t::isindex: case element_t::base: case element_t::nextid: return true; default: return false; }; } /// /// May element be a part of document head? /// /// \param[in] code Element code /// static bool is_head_misc(_In_ element_t code) { switch (code) { case element_t::script: case element_t::style: case element_t::meta: case element_t::link: case element_t::object: return true; default: return false; }; } /// /// May element be a part of `

`?
			///
			/// \param[in] code  Element code
			///
			static bool is_pre_exclusion(_In_ element_t code)
			{
				// A `true` result means the element is on the exclusion list:
				// may_contain() rejects these as children of `pre`.
				static const element_t excluded[] = {
					element_t::img,
					element_t::object,
					element_t::applet,
					element_t::embed,
					element_t::big,
					element_t::small,
					element_t::sub,
					element_t::sup,
					element_t::ruby,
					element_t::font,
					element_t::basefont,
					element_t::nobr,
				};

				for (const element_t entry : excluded) {
					if (entry == code)
						return true;
				}
				return false;
			}

			///
			/// Is element a top-level section of the document (`head`, `body` or `frameset`)?
			///
			/// \param[in] code  Element code
			///
			static bool is_html_content(_In_ element_t code)
			{
				// Exactly the elements may_contain() accepts directly under `html`.
				return
					code == element_t::head ||
					code == element_t::body ||
					code == element_t::frameset;
			}

			///
			/// Does element represent a separate part of text?
			///
			/// \param[in] code  Element code
			///
			static bool is_group(_In_ element_t code)
			{
				// Anything already classified as a block, a document section or
				// head content is a group.
				if (is_block(code))
					return true;
				if (is_html_content(code))
					return true;
				if (is_head_content(code))
					return true;

				// Structural elements the classifiers above do not report.
				// `dir` stays listed for parity, although is_block() already
				// covers it through is_list().
				return
					code == element_t::col ||
					code == element_t::colgroup ||
					code == element_t::dd ||
					code == element_t::dir ||
					code == element_t::dt ||
					code == element_t::frame ||
					code == element_t::iframe ||
					code == element_t::legend ||
					code == element_t::td ||
					code == element_t::th ||
					code == element_t::tr;
			}

			///
			/// Checks if one element may nest inside another
			///
			/// \param[in] parent  Parent element code
			/// \param[in] child   Child element code
			///
			/// \returns `true` if `child` may nest in `parent`; `false` otherwise
			///
			static bool may_contain(_In_ element_t parent, _In_ element_t child)
			{
				// Unknown elements and comments are accepted under any parent.
				if (child == element_t::unknown || child == element_t::comment)
					return true;

				// Font-style, phrase and heading elements take inline content only.
				if (is_fontstyle(parent) || is_phrase(parent) || is_heading(parent))
					return is_inline(child);

				// The remaining parents are grouped by the kind of content they accept.
				switch (parent) {
				// --- Inline content only ---
				case element_t::bdo:
				case element_t::caption:
				case element_t::dt:
				case element_t::font:
				case element_t::legend:
				case element_t::p:
				case element_t::q:
				case element_t::ruby:
				case element_t::span:
				case element_t::sub:
				case element_t::sup:
					return is_inline(child);

				// --- Inline content with an extra restriction or addition ---
				case element_t::a:
					// No nested anchors.
					return is_inline(child) && child != element_t::a;
				case element_t::label:
					// No nested labels.
					return is_inline(child) && child != element_t::label;
				case element_t::pre:
					return is_inline(child) && !is_pre_exclusion(child);
				case element_t::address:
					return is_inline(child) || child == element_t::p;
				case element_t::nobr:
					return is_inline(child) || child == element_t::wbr;

				// --- Any flow (block or inline) content ---
				case element_t::blockquote:
				case element_t::center:
				case element_t::dd:
				case element_t::del:
				case element_t::div:
				case element_t::iframe:
				case element_t::ins:
				case element_t::li:
				case element_t::marquee:
				case element_t::noscript:
				case element_t::noembed:
				case element_t::plaintext:
				case element_t::td:
				case element_t::th:
					return is_flow(child);

				// --- Flow content plus `param` ---
				case element_t::applet:
				case element_t::embed:
				case element_t::object:
					return is_flow(child) || child == element_t::param;

				// --- Flow content with an extra restriction or addition ---
				case element_t::body:
					return is_flow(child) || child == element_t::ins || child == element_t::del;
				case element_t::button:
					return
						is_flow(child) &&
						!is_formctrl(child) &&
						child != element_t::a &&
						child != element_t::form &&
						child != element_t::isindex &&
						child != element_t::fieldset &&
						child != element_t::iframe;
				case element_t::fieldset:
					return is_flow(child) || child == element_t::legend || child == element_t::PCDATA;
				case element_t::form:
					// No nested forms.
					return is_flow(child) && child != element_t::form;
				case element_t::noframes:
					return (is_flow(child) || child == element_t::body) && child != element_t::noframes;

				// --- List containers: `li` only ---
				case element_t::dir:
				case element_t::menu:
				case element_t::ol:
				case element_t::ul:
					return child == element_t::li;

				// --- `CDATA` only ---
				case element_t::comment:
				case element_t::listing:
				case element_t::script:
				case element_t::style:
					return child == element_t::CDATA;

				// --- `PCDATA` only ---
				case element_t::option:
				case element_t::textarea:
				case element_t::title:
					return child == element_t::PCDATA;

				// --- Table structure ---
				case element_t::table:
					return
						child == element_t::caption ||
						child == element_t::col ||
						child == element_t::colgroup ||
						child == element_t::thead ||
						child == element_t::tfoot ||
						child == element_t::tbody;
				case element_t::thead:
				case element_t::tbody:
				case element_t::tfoot:
					return child == element_t::tr;
				case element_t::tr:
					return child == element_t::td || child == element_t::th;
				case element_t::colgroup:
					return child == element_t::col;

				// --- Other fixed child sets ---
				case element_t::dl:
					return child == element_t::dt || child == element_t::dd;
				case element_t::frameset:
					return child == element_t::frameset || child == element_t::frame || child == element_t::noframes;
				case element_t::head:
					return is_head_content(child) || is_head_misc(child);
				case element_t::html:
					return is_html_content(child);
				case element_t::map:
					return is_block(child) || child == element_t::area;
				case element_t::optgroup:
					return child == element_t::option;
				case element_t::select:
					return child == element_t::optgroup || child == element_t::option;

				// --- No children (beyond the unknown/comment children accepted above) ---
				case element_t::area:
				case element_t::base:
				case element_t::basefont:
				case element_t::br:
				case element_t::col:
				case element_t::frame:
				case element_t::hr:
				case element_t::img:
				case element_t::input:
				case element_t::isindex:
				case element_t::link:
				case element_t::meta:
				case element_t::param:
				case element_t::rt:
				case element_t::wbr:
					return false;

				// An unknown parent accepts every child.
				case element_t::unknown:
					return true;

				// Every parent not listed above rejects all children.
				default:
					return false;
				}
			}

			///
			/// Checks if expected element attribute value is URI
			///
			/// \param[in] code       Element code
			/// \param[in] attr_name  Attribute name
			/// \param[in] num_chars  Code unit limit in `attr_name`
			///
			template 
			static bool is_uri(_In_ element_t code, _In_reads_or_z_opt_(num_chars) const T* attr_name, _In_ size_t num_chars)
			{
				stdex_assert(attr_name || !num_chars);
				switch (code) {
				case element_t::a:          return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
				case element_t::applet:     return stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::area:       return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
				case element_t::base:       return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
				case element_t::bgsound:    return stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::blockquote: return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
				case element_t::body:       return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
				case element_t::comment:    return stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) == 0;
				case element_t::del:        return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
				case element_t::embed:      return stdex::strnicmp(attr_name, num_chars, "pluginspage", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::form:       return stdex::strnicmp(attr_name, num_chars, "action", SIZE_MAX) == 0;
				case element_t::frame:      return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::head:       return stdex::strnicmp(attr_name, num_chars, "profile", SIZE_MAX) == 0;
				case element_t::iframe:     return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::img:        return stdex::strnicmp(attr_name, num_chars, "longdesc", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
				case element_t::input:      return stdex::strnicmp(attr_name, num_chars, "lowsrc", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
				case element_t::ins:        return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
				case element_t::link:       return stdex::strnicmp(attr_name, num_chars, "href", SIZE_MAX) == 0;
				case element_t::object:     return stdex::strnicmp(attr_name, num_chars, "basehref", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "classid", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "code", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "codebase", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "data", SIZE_MAX) == 0 ||
					stdex::strnicmp(attr_name, num_chars, "usemap", SIZE_MAX) == 0;
				case element_t::q:          return stdex::strnicmp(attr_name, num_chars, "cite", SIZE_MAX) == 0;
				case element_t::script:     return stdex::strnicmp(attr_name, num_chars, "src", SIZE_MAX) == 0;
				case element_t::table:      return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
				case element_t::td:         return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
				case element_t::th:         return stdex::strnicmp(attr_name, num_chars, "background", SIZE_MAX) == 0;
				default:                    return false;
				}
			}

			///
			/// Checks if expected element attribute value is localizable
			///
			/// \param[in] code       Element code
			/// \param[in] attr_name  Attribute name
			/// \param[in] num_chars  Code unit limit in `attr_name`
			///
			template 
			static bool is_localizable(element_t code, const T* attr_name, size_t num_chars)
			{
				stdex_assert(attr_name || !num_chars);
				if (stdex::strnicmp(attr_name, num_chars, "title", SIZE_MAX) == 0)
					return true;
				switch (code) {
				case element_t::applet: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
				case element_t::area:   return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
				case element_t::img:    return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
				case element_t::input:  return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
				case element_t::object: return stdex::strnicmp(attr_name, num_chars, "alt", SIZE_MAX) == 0;
				case element_t::table:  return stdex::strnicmp(attr_name, num_chars, "summary", SIZE_MAX) == 0;
				case element_t::td:     return stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX) == 0;
				case element_t::th:     return stdex::strnicmp(attr_name, num_chars, "abbr", SIZE_MAX) == 0;
				default:                return false;
				}
			}
		};

		class sequence;

		// Owning store of parsed sequences. Elements are held through pointers to the
		// polymorphic base, so derived sequence kinds can be kept in one container.
		using sequence_store = std::vector<std::unique_ptr<sequence>>;

		///
		/// Base class for HTML sequences
		///
		class sequence
		{
		public:
			stdex::parser::html_sequence_t type; ///< Sequence type. Enum is used for performance reasons (vs. `dynamic_cast`)
			stdex::interval<size_t> interval;    ///< Sequence position in source
			sequence* parent;                    ///< Parent sequence

			///
			/// Constructs sequence
			///
			/// \param[in] _type    Sequence type
			/// \param[in] start    Start offset of the sequence in source
			/// \param[in] end      End offset of the sequence in source
			/// \param[in] _parent  Parent sequence; nullptr when there is none
			///
			sequence(_In_ stdex::parser::html_sequence_t _type = stdex::parser::html_sequence_t::unknown, _In_ size_t start = 0, _In_ size_t end = 0, _In_opt_ sequence* _parent = nullptr) :
				type(_type),
				interval(start, end),
				parent(_parent)
			{}

			// Virtual so derived sequences can be destroyed through a pointer to this base.
			virtual ~sequence() = default;
		};

		///
		/// HTML element `<.../>`
		///
		class element : public sequence
		{
		public:
			template 
			element(_Inout_ stdex::parser::basic_html_tag&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr) :
				sequence(tag.type, tag.interval.start, tag.interval.end, parent),
				code(element_code(src + tag.name.start, tag.name.size())),
				name(std::move(tag.name)),
				attributes(std::move(tag.attributes))
			{}

			template 
			static element_t element_code(_In_reads_z_(num_chars) const T* name, size_t num_chars)
			{
				static const struct {
					const char* name;
					element_t code;
				} mapping[] = {
					{ "a",          element_t::a,          },
					{ "abbr",       element_t::abbr,       },
					{ "acronym",    element_t::acronym,    },
					{ "address",    element_t::address,    },
					{ "applet",     element_t::applet,     },
					{ "area",       element_t::area,       },
					{ "b",          element_t::b,          },
					{ "base",       element_t::base,       },
					{ "basefont",   element_t::basefont,   },
					{ "bdo",        element_t::bdo,        },
					{ "bgsound",    element_t::bgsound,    },
					{ "big",        element_t::big,        },
					{ "blink",      element_t::blink,      },
					{ "blockquote", element_t::blockquote, },
					{ "body",       element_t::body,       },
					{ "br",         element_t::br,         },
					{ "button",     element_t::button,     },
					{ "caption",    element_t::caption,    },
					{ "center",     element_t::center,     },
					{ "cite",       element_t::cite,       },
					{ "code",       element_t::code,       },
					{ "col",        element_t::col,        },
					{ "colgroup",   element_t::colgroup,   },
					{ "comment",    element_t::comment,    },
					{ "dd",         element_t::dd,         },
					{ "del",        element_t::del,        },
					{ "dfn",        element_t::dfn,        },
					{ "dir",        element_t::dir,        },
					{ "div",        element_t::div,        },
					{ "dl",         element_t::dl,         },
					{ "dt",         element_t::dt,         },
					{ "em",         element_t::em,         },
					{ "embed",      element_t::embed,      },
					{ "fieldset",   element_t::fieldset,   },
					{ "font",       element_t::font,       },
					{ "form",       element_t::form,       },
					{ "frame",      element_t::frame,      },
					{ "frameset",   element_t::frameset,   },
					{ "h1",         element_t::h1,         },
					{ "h2",         element_t::h2,         },
					{ "h3",         element_t::h3,         },
					{ "h4",         element_t::h4,         },
					{ "h5",         element_t::h5,         },
					{ "h6",         element_t::h6,         },
					{ "head",       element_t::head,       },
					{ "hr",         element_t::hr,         },
					{ "html",       element_t::html,       },
					{ "i",          element_t::i,          },
					{ "iframe",     element_t::iframe,     },
					{ "img",        element_t::img,        },
					{ "input",      element_t::input,      },
					{ "ins",        element_t::ins,        },
					{ "isindex",    element_t::isindex,    },
					{ "kbd",        element_t::kbd,        },
					{ "label",      element_t::label,      },
					{ "legend",     element_t::legend,     },
					{ "li",         element_t::li,         },
					{ "link",       element_t::link,       },
					{ "listing",    element_t::listing,    },
					{ "map",        element_t::map,        },
					{ "marquee",    element_t::marquee,    },
					{ "menu",       element_t::menu,       },
					{ "meta",       element_t::meta,       },
					{ "nextid",     element_t::nextid,     },
					{ "nobr",       element_t::nobr,       },
					{ "noembed",    element_t::noembed,    },
					{ "noframes",   element_t::noframes,   },
					{ "noscript",   element_t::noscript,   },
					{ "object",     element_t::object,     },
					{ "ol",         element_t::ol,         },
					{ "optgroup",   element_t::optgroup,   },
					{ "option",     element_t::option,     },
					{ "p",          element_t::p,          },
					{ "param",      element_t::param,      },
					{ "plaintext",  element_t::plaintext,  },
					{ "pre",        element_t::pre,        },
					{ "q",          element_t::q,          },
					{ "rt",         element_t::rt,         },
					{ "ruby",       element_t::ruby,       },
					{ "s",          element_t::s,          },
					{ "samp",       element_t::samp,       },
					{ "script",     element_t::script,     },
					{ "select",     element_t::select,     },
					{ "small",      element_t::small,      },
					{ "span",       element_t::span,       },
					{ "strike",     element_t::strike,     },
					{ "strong",     element_t::strong,     },
					{ "style",      element_t::style,      },
					{ "sub",        element_t::sub,        },
					{ "sup",        element_t::sup,        },
					{ "table",      element_t::table,      },
					{ "tbody",      element_t::tbody,      },
					{ "td",         element_t::td,         },
					{ "textarea",   element_t::textarea,   },
					{ "tfoot",      element_t::tfoot,      },
					{ "th",         element_t::th,         },
					{ "thead",      element_t::thead,      },
					{ "title",      element_t::title,      },
					{ "tr",         element_t::tr,         },
					{ "tt",         element_t::tt,         },
					{ "u",          element_t::u,          },
					{ "ul",         element_t::ul,         },
					{ "var",        element_t::var,        },
					{ "wbr",        element_t::wbr,        },
					{ "xmp",        element_t::xmp,        },
				};
#ifndef NDEBUG
				// The mapping table MUST be sorted and all names in lowercase.
				for (size_t i = 1; i < _countof(mapping); i++)
					stdex_assert(stdex::strcmp(mapping[i - 1].name, mapping[i].name) <= 0);
				for (size_t i = 0; i < _countof(mapping); i++) {
					for (size_t j = 0; mapping[i].name[j]; j++)
						stdex_assert(stdex::islower(mapping[i].name[j]) | stdex::isdigit(mapping[i].name[j]));
				}
#endif
				for (size_t i = 0, j = _countof(mapping); i < j; ) {
					size_t m = (i + j) / 2;
					int r = 0;
					for (size_t i1 = 0, i2 = 0;;) {
						if (!mapping[m].name[i1]) {
							r = i2 >= num_chars || !name[i2] ? 0 : -1;
							break;
						}
						if (i2 >= num_chars || !name[i2]) {
							r = 1;
							break;
						}

						auto chr = static_cast(stdex::tolower(name[i2++]));
						if (mapping[m].name[i1] > chr) {
							r = 1;
							break;
						}
						if (mapping[m].name[i1] < chr) {
							r = -1;
							break;
						}
						i1++;
					}

					if (r < 0)
						i = m + 1;
					else if (r > 0)
						j = m;
					else
						return mapping[m].code;
				}
				return element_t::unknown;
			}

		public:
			element_t code;                                        ///< Element code
			stdex::interval name;                          ///< Element name position in source
			std::vector attributes; ///< Element attribute positions in source
		};

		class element_end;

		///
		/// Starting tag of an HTML element `<...>`
		///
		class element_start : public element
		{
		public:
			///
			/// Constructs starting tag from a parsed tag
			///
			/// \param[in,out] tag     Parsed tag; forwarded to the `element` constructor
			/// \param[in]     src     Source the positions in `tag` refer to
			/// \param[in]     parent  Parent sequence
			/// \param[in]     _end    Sequence that ends this element; nullptr while not known
			///
			template <class T>
			element_start(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ sequence* _end = nullptr) :
				element(std::move(tag), src, parent),
				end(_end)
			{}

		public:
			sequence* end; ///< Corresponding ending tag of type `element_end`; When element is ended by a start of another element, this points to the another element start.
		};

		///
		/// Ending tag of an HTML element `</...>`
		///
		class element_end : public sequence
		{
		public:
			///
			/// Constructs ending tag from a parsed tag
			///
			/// \param[in,out] tag     Parsed tag; its name is moved into this object
			/// \param[in]     src     Source the positions in `tag` refer to
			/// \param[in]     parent  Parent sequence
			/// \param[in]     _start  Corresponding starting tag; nullptr while not known
			///
			template <class T>
			element_end(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_z_ const T* src, _In_opt_ sequence* parent = nullptr, _In_opt_ element_start* _start = nullptr) :
				sequence(tag.type, tag.interval.start, tag.interval.end, parent),
				code(element::element_code(src + tag.name.start, tag.name.size())), // Reads tag.name, so it must run before tag.name is moved from.
				name(std::move(tag.name)),
				start(_start)
			{}

		public:
			element_t code;                    ///< Element code
			stdex::interval<size_t> name;      ///< Element name position in source
			element_start* start;              ///< Corresponding starting tag
		};

		///
		/// HTML declaration
		///
		class declaration : public sequence
		{
		public:
			template 
			declaration(_Inout_ stdex::parser::basic_html_tag&& tag, _In_opt_ sequence* parent = nullptr) :
				sequence(tag.type, tag.interval.start, tag.interval.end, parent),
				name(std::move(tag.name)),
				attributes(std::move(tag.attributes))
			{}

		public:
			stdex::interval name;                          ///< Declaration name position in source
			std::vector attributes; ///< Declaration attribute positions in source
		};

		///
		/// HTML comment
		///
		class comment : public sequence
		{
		public:
			///
			/// Constructs comment from a parsed tag
			///
			/// \param[in,out] tag     Parsed tag; its `name` interval is taken over as the comment content position
			/// \param[in]     parent  Parent sequence
			///
			template <class T>
			comment(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
				sequence(tag.type, tag.interval.start, tag.interval.end, parent),
				content(std::move(tag.name))
			{}

		public:
			stdex::interval<size_t> content; ///< Comment content position in source
		};

		///
		/// HTML instruction
		///
		class instruction : public sequence
		{
		public:
			///
			/// Constructs instruction from a parsed tag
			///
			/// \param[in,out] tag     Parsed tag; its `name` interval is taken over as the instruction content position
			/// \param[in]     parent  Parent sequence
			///
			template <class T>
			instruction(_Inout_ stdex::parser::basic_html_tag<T>&& tag, _In_opt_ sequence* parent = nullptr) :
				sequence(tag.type, tag.interval.start, tag.interval.end, parent),
				content(std::move(tag.name))
			{}

		public:
			stdex::interval<size_t> content; ///< Instruction content position in source
		};

		///
		/// HTML entity
		///
		template, class AX = std::allocator>
		struct entity
		{
			stdex::interval name;       ///< Name position in source
			std::basic_string value; ///< Entity value
		};

		///
		/// HTML parser
		///
		// Forward declaration only; the default template arguments are supplied here.
		template<class T, class TR = std::char_traits<T>, class AX = std::allocator<T>>
		class parser;

		///
		/// HTML document
		///
		template, class AX = std::allocator>
		class document
		{
		public:
			document() :
				// Nothing parsed yet; charset stays at the system default until set otherwise.
				m_num_parsed{0},
				m_charset{stdex::charset_id::system},

				// Conditional-section state: no open conditions, not inside CDATA/RCDATA.
				m_num_valid_conditions{0},
				m_num_invalid_conditions{0},
				m_is_cdata{false},
				m_is_rcdata{false},

				// Element state: not inside a special element.
				m_is_special_element{false}
			{}

			///
			/// Empties document
			///
			void clear()
			{
				// Source text, parsing position and charset
				m_source.clear();
				m_num_parsed = 0;
				m_charset = stdex::charset_id::system;

				// Conditional-section state and declared entities
				m_num_valid_conditions = 0;
				m_num_invalid_conditions = 0;
				m_is_cdata = false;
				m_is_rcdata = false;
				m_entities.clear();

				// Parsed sequences and the stack of started elements
				m_sequences.clear();
				m_element_stack.clear();
				m_is_special_element = false;
			}

			///
			/// Parses HTML source code by chunks
			///
			void append(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars = SIZE_MAX)
			{
				// Appends the chunk to m_source and resumes parsing at m_num_parsed.
				// Text following the last completed sequence is left unparsed so that a tag
				// split across chunks can be completed by a later append() or flushed by finalize().
				//
				// \param[in] source     Chunk of HTML source
				// \param[in] num_chars  Code unit limit in `source`
				//
				// Throws std::invalid_argument on a tag of unrecognized type, and whatever
				// replace_entities() throws for a condition or entity value it cannot resolve.
				//
				// NOTE(review): several template argument lists in this function look lost
				// (e.g. `std::unique_ptr(new sequence(...))`, `static_cast(e.get())`) - verify against the upstream file.
				stdex_assert(source || !num_chars);
				m_source.append(source, stdex::strnlen(source, num_chars));
				// From here on `source`/`num_chars` describe the whole accumulated buffer,
				// so every offset stored in a sequence is relative to m_source.
				source = m_source.data();
				num_chars = m_source.size();

				for (size_t i = m_num_parsed; i < num_chars;) {
					if (m_is_cdata || m_is_rcdata) {
						// Inside CDATA/RCDATA only the condition end is recognized;
						// everything before it becomes a single CDATA or PCDATA sequence.
						if (m_condition_end.match(source, i, num_chars)) {
							m_sequences.push_back(std::move(std::unique_ptr(new sequence(
								m_is_cdata ? stdex::parser::html_sequence_t::CDATA : stdex::parser::html_sequence_t::PCDATA,
								m_num_parsed, i,
								active_element()))));
							m_is_cdata = m_is_rcdata = false;
							i = m_num_parsed = m_condition_end.interval.end;
							continue;
						}
						goto next_char;
					}

					if (m_num_invalid_conditions) {
						// Inside an ignored condition: content is dropped (no sequence is created) up to the condition end.
						if (m_condition_end.match(source, i, num_chars)) {
							m_num_invalid_conditions--;
							i = m_num_parsed = m_condition_end.interval.end;
							continue;
						}
						goto next_char;
					}

					if (m_num_valid_conditions && m_condition_end.match(source, i, num_chars)) {
						// End of a valid condition: flush the text collected before it.
						if (m_num_parsed < i)
							m_sequences.push_back(std::move(std::unique_ptr(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));

						m_num_valid_conditions--;
						i = m_num_parsed = m_condition_end.interval.end;
						continue;
					}

					if (m_condition_start.match(source, i, num_chars)) {
						// The condition keyword may itself be written using entities, so expand them first.
						auto condition_src(replace_entities(source + m_condition_start.condition.start, m_condition_start.condition.size()));
						if (stdex::strncmp(condition_src.data(), condition_src.size(), "CDATA", SIZE_MAX) == 0)
							m_is_cdata = true;
						else if (stdex::strncmp(condition_src.data(), condition_src.size(), "RCDATA", SIZE_MAX) == 0)
							m_is_rcdata = true;
						// NOTE(review): m_num_invalid_conditions is always zero here (the branch above either
						// continues or jumps to next_char while it is non-zero), so the first test below looks unreachable.
						// NOTE(review): a CDATA/RCDATA condition also increments m_num_valid_conditions here, while the
						// CDATA/RCDATA branch at the top of the loop never decrements it - confirm this is intended.
						if (m_num_invalid_conditions)
							m_num_invalid_conditions++;
						else if (stdex::strncmp(condition_src.data(), condition_src.size(), "IGNORE", SIZE_MAX) == 0)
							m_num_invalid_conditions++;
						else
							m_num_valid_conditions++;

						i = m_num_parsed = m_condition_start.interval.end;
						continue;
					}

					if (m_is_special_element) {
						// Inside a special element only its own ending tag is recognized; any other tag is treated as text.
						auto parent = active_element();
						stdex_assert(parent);
						if (m_tag.match(source, i, num_chars) &&
							m_tag.type == stdex::parser::html_sequence_t::element_end &&
							element::element_code(source + m_tag.name.start, m_tag.name.size()) == parent->code)
						{
							if (m_num_parsed < i)
								m_sequences.push_back(std::move(std::unique_ptr(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, parent))));
							// Position is taken from m_tag before m_tag is moved from.
							i = m_num_parsed = m_tag.interval.end;
							std::unique_ptr e(new element_end(std::move(m_tag), source, parent->parent, parent));
							parent->end = e.get();
							m_sequences.push_back(std::move(e));
							m_element_stack.pop_back();
							m_is_special_element = false;
							continue;
						}
						goto next_char;
					}

					if (m_tag.match(source, i, num_chars)) {
						// Flush the text between the previous sequence and this tag.
						if (m_num_parsed < i)
							m_sequences.push_back(std::move(std::unique_ptr(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
						i = m_num_parsed = m_tag.interval.end;

						switch (m_tag.type) {
						case stdex::parser::html_sequence_t::element:
						case stdex::parser::html_sequence_t::element_start: {
							// The nullptr arm cannot be reached under these two case labels.
							std::unique_ptr e(
								m_tag.type == stdex::parser::html_sequence_t::element ? new element(std::move(m_tag), source) :
								m_tag.type == stdex::parser::html_sequence_t::element_start ? new element_start(std::move(m_tag), source) :
								nullptr);

							// Does this tag end any of the started elements?
							// Walk the stack from the innermost element: the first one that may contain the
							// new element becomes its parent; every element passed on the way is ended by it.
							for (size_t j = m_element_stack.size(); j--; ) {
								auto starting_tag = m_element_stack[j];
								stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
								if (element_traits::may_contain(starting_tag->code, e->code)) {
									e->parent = starting_tag;
									break;
								}
								e->parent = starting_tag->parent;
								starting_tag->end = e.get();
								m_element_stack.resize(j);
							}

							if (e->type == stdex::parser::html_sequence_t::element_start) {
								auto e_start = static_cast(e.get());
								if (element_traits::span(e->code) == element_span_t::immediate)
									// Element with immediate span ends itself and is never pushed on the stack.
									e_start->end = e.get();
								else {
									m_element_stack.push_back(e_start);
									// These elements switch the parser to the special-element mode handled above.
									switch (e->code) {
									case element_t::code:
									case element_t::comment:
									case element_t::script:
									case element_t::style:
										m_is_special_element = true;
										break;
									default:;
									}
								}
							}

							// Charset detection: only while the charset is still at its initial `system` value.
							if (e->code == element_t::meta && m_charset == stdex::charset_id::system) {
								bool is_content_type = false;
								stdex::parser::html_attribute* content_attr = nullptr;
								for (auto& attr : e->attributes) {
									if (stdex::strnicmp(source + attr.name.start, attr.name.size(), "http-equiv", SIZE_MAX) == 0 &&
										stdex::strnicmp(source + attr.value.start, attr.value.size(), "content-type", SIZE_MAX) == 0)
										is_content_type = true;
									else if (stdex::strnicmp(source + attr.name.start, attr.name.size(), "content", SIZE_MAX) == 0)
										content_attr = &attr;
								}
								if (is_content_type && content_attr) {
									// <meta http-equiv="content-type" content="..."> found.
									stdex::parser::basic_mime_type content;
									if (content.match(source, content_attr->value.start, content_attr->value.end) &&
										content.charset)
									{
										// Copy the charset name code unit by code unit into a std::string for the lookup.
										std::string str;
										str.reserve(content.charset.size());
										for (size_t j = content.charset.start; j < content.charset.end; ++j)
											str.push_back(static_cast(source[j]));
										m_charset = stdex::charset_from_name(str);
									}
								}
							}

							m_sequences.push_back(std::move(e));
							break;
						}
						case stdex::parser::html_sequence_t::element_end: {
							std::unique_ptr e(new element_end(std::move(m_tag), source, active_element()));

							// Find the innermost started element this tag closes. Known elements match by code,
							// unknown ones by case-insensitive name. Closing it also drops everything nested
							// inside it from the stack. When nothing matches, the ending tag is kept with the
							// parent given above and no starting tag assigned.
							for (size_t j = m_element_stack.size(); j--; ) {
								auto starting_tag = m_element_stack[j];
								stdex_assert(starting_tag && starting_tag->type == stdex::parser::html_sequence_t::element_start);
								if (starting_tag->code == e->code ||
									(starting_tag->code == element_t::unknown && e->code == element_t::unknown && stdex::strnicmp(source + starting_tag->name.start, starting_tag->name.size(), source + e->name.start, e->name.size()) == 0))
								{
									e->start = starting_tag;
									e->parent = starting_tag->parent;
									starting_tag->end = e.get();
									m_element_stack.resize(j);
									break;
								}
							}

							m_sequences.push_back(std::move(e));
							break;
						}
						case stdex::parser::html_sequence_t::declaration:
							// Entity declaration: more than three attributes, the first one named "entity".
							if (m_tag.attributes.size() > 3 &&
								stdex::strnicmp(source + m_tag.attributes[0].name.start, m_tag.attributes[0].name.size(), "entity", SIZE_MAX) == 0)
							{
								// Only "%" entities whose fourth attribute is neither SYSTEM nor PUBLIC are recorded:
								// the third attribute is the entity name, the fourth its value.
								if (stdex::strncmp(source + m_tag.attributes[1].name.start, m_tag.attributes[1].name.size(), "%", SIZE_MAX) == 0 &&
									stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "SYSTEM", SIZE_MAX) &&
									stdex::strncmp(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size(), "PUBLIC", SIZE_MAX))
								{
									std::unique_ptr> e(new entity());
									e->name = m_tag.attributes[2].name;
									e->value = std::move(replace_entities(source + m_tag.attributes[3].name.start, m_tag.attributes[3].name.size()));
									m_entities.push_back(std::move(e));
								}

								// TODO: Parse & entities and entities in SYSTEM and PUBLIC external files.
							}
							m_sequences.push_back(std::move(std::unique_ptr(new declaration(std::move(m_tag), active_element()))));
							break;
						case stdex::parser::html_sequence_t::comment:
							m_sequences.push_back(std::move(std::unique_ptr(new comment(std::move(m_tag), active_element()))));
							break;
						case stdex::parser::html_sequence_t::instruction:
							m_sequences.push_back(std::move(std::unique_ptr(new instruction(std::move(m_tag), active_element()))));
							break;
						default:
							throw std::invalid_argument("unknown tag type");
						}

						continue;
					}

				next_char:
					if (m_any_char.match(source, i, num_chars)) {
						// Skip any character, but don't declare it as parsed yet. It might be a part of unfinished tag.
						i = m_any_char.interval.end;
					}
					else
						// Nothing more can be matched at this position; stop and wait for more input.
						break;
				}
			}

			///
			/// Finalizes document when no more appending is planned
			///
			void finalize()
			{
				size_t i = m_source.size();
				if (m_num_parsed < i)
					m_sequences.push_back(std::move(std::unique_ptr(new sequence(stdex::parser::html_sequence_t::text, m_num_parsed, i, active_element()))));
				m_num_parsed = i;
				m_element_stack.clear();
			}

			///
			/// Parses HTML document source code
			///
			void assign(_In_reads_or_z_opt_(num_chars) const T* source, _In_ size_t num_chars = SIZE_MAX)
			{
				// Start from an empty document, parse the whole input as a single chunk,
				// then flush the trailing text and drop the stack of started elements.
				this->clear();
				this->append(source, num_chars);
				this->finalize();
			}

			///
			/// Returns document HTML source code
			///
			const std::basic_string& source() const { return m_source; }

			friend class parser;

		protected:
			///
			/// Returns starting tag of currently active element or nullptr if no element is known to be started.
			///
			element_start* active_element() const
			{
				// No element has been started (or all of them have already been ended).
				if (m_element_stack.empty())
					return nullptr;

				// The innermost started element is on top of the stack.
				return m_element_stack.back();
			}

			///
			/// Replaces entities with their content
			///
			std::basic_string replace_entities(_In_reads_or_z_opt_(num_chars) const T* input, _In_ size_t num_chars) const
			{
				stdex_assert(input || !num_chars);
				const size_t num_entities = m_entities.size();
				const T* source = m_source.data();
				std::basic_string output;
				for (size_t i = 0; i < num_chars && input[i];) {
					if (input[i] == '%') {
						for (size_t j = 0; j < num_entities; j++) {
							auto& e = m_entities[j];
							size_t entity_size = e->name.size();
							if (i + entity_size + 1 < num_chars &&
								stdex::strncmp(input + i + 1, source + e->name.start, entity_size) == 0 &&
								input[i + entity_size + 1] == ';')
							{
								output += e->value;
								i += entity_size + 2;
								goto next_char;
							}
						}
						throw std::runtime_error("undefined entity");
					}
					output += input[i++];
				next_char:;
				}
				return output;
			}

		protected:
			std::basic_string m_source;       ///< Document HTML source code
			size_t m_num_parsed;                         ///< Number of characters already parsed
			stdex::charset_id m_charset;                 ///< Document charset

			// Declaration parsing data
			size_t m_num_valid_conditions;               ///< Number of started valid conditions
			size_t m_num_invalid_conditions;             ///< Number of started invalid conditions
			bool m_is_cdata;                             ///< Inside of CDATA?
			bool m_is_rcdata;                            ///< Inside of RCDATA?
			stdex::parser::basic_html_declaration_condition_start m_condition_start;
			stdex::parser::basic_html_declaration_condition_end m_condition_end;
			stdex::parser::basic_any_cu m_any_char;
			std::vector>> m_entities; ///< Array of entities

			// Element parsing data
			stdex::parser::basic_html_tag m_tag;
			sequence_store m_sequences;                  ///< Store of sequences
			std::vector m_element_stack; ///< LIFO stack of started elements
			bool m_is_special_element;                   ///< Inside of a special element (