diff --git a/UnitTests/UnitTests.vcxproj b/UnitTests/UnitTests.vcxproj
index 52f7d4f59..4570edd78 100644
--- a/UnitTests/UnitTests.vcxproj
+++ b/UnitTests/UnitTests.vcxproj
@@ -115,6 +115,7 @@
+
Create
diff --git a/UnitTests/UnitTests.vcxproj.filters b/UnitTests/UnitTests.vcxproj.filters
index 3c41d8037..a2ef46a40 100644
--- a/UnitTests/UnitTests.vcxproj.filters
+++ b/UnitTests/UnitTests.vcxproj.filters
@@ -21,6 +21,9 @@
Source Files
+
+ Source Files
+
diff --git a/UnitTests/parser.cpp b/UnitTests/parser.cpp
new file mode 100644
index 000000000..fd7c353ed
--- /dev/null
+++ b/UnitTests/parser.cpp
@@ -0,0 +1,225 @@
+/*
+ SPDX-License-Identifier: MIT
+ Copyright © 2023 Amebis
+*/
+
+#include "pch.h"
+
+using namespace std;
+using namespace stdex;
+using namespace stdex::parser;
+using namespace Microsoft::VisualStudio::CppUnitTestFramework;
+
+namespace UnitTests
+{
+ TEST_CLASS(parser)
+ {
+ public:
+ TEST_METHOD(wtest)
+ {
+     // Runs every wide-character tester against one fixed two-line sample and
+     // checks both the match result and the interval the tester reports.
+     // Offsets of interest: 4 = first space, 14 = '.', 15 = '\n', 16 = 'S'.
+     static const wchar_t text[] = L"This is a test.\nSecond line.";
+
+     // wnoop: succeeds at offset 0 with an empty interval.
+     {
+         wnoop t;
+         Assert::IsTrue(t.match(text));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)0, t.interval.end);
+     }
+
+     // wcu: 't' only matches the leading 'T' when case is ignored.
+     {
+         wcu t(L't');
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)1, t.interval.end);
+     }
+
+     // wspace_cu: the first space is at offset 4.
+     {
+         wspace_cu t;
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 4));
+         Assert::AreEqual((size_t)4, t.interval.start);
+         Assert::AreEqual((size_t)5, t.interval.end);
+     }
+
+     // wpunct_cu: the full stop is at offset 14.
+     {
+         wpunct_cu t;
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 14));
+         Assert::AreEqual((size_t)14, t.interval.start);
+         Assert::AreEqual((size_t)15, t.interval.end);
+     }
+
+     // wspace_or_punct_cu: accepts both the space and the full stop.
+     {
+         wspace_or_punct_cu t;
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 4));
+         Assert::AreEqual((size_t)4, t.interval.start);
+         Assert::AreEqual((size_t)5, t.interval.end);
+         Assert::IsTrue(t.match(text, 14));
+         Assert::AreEqual((size_t)14, t.interval.start);
+         Assert::AreEqual((size_t)15, t.interval.end);
+     }
+
+     // wbol: zero-width match at offset 0 and right after the '\n' (offset 16).
+     {
+         wbol t;
+         Assert::IsTrue(t.match(text));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)0, t.interval.end);
+         Assert::IsFalse(t.match(text, 1));
+         Assert::IsFalse(t.match(text, 15));
+         Assert::IsTrue(t.match(text, 16));
+         Assert::AreEqual((size_t)16, t.interval.start);
+         Assert::AreEqual((size_t)16, t.interval.end);
+     }
+
+     // weol: zero-width match on the '\n' itself (offset 15).
+     {
+         weol t;
+         Assert::IsFalse(t.match(text));
+         Assert::IsFalse(t.match(text, 1));
+         Assert::IsTrue(t.match(text, 15));
+         Assert::AreEqual((size_t)15, t.interval.start);
+         Assert::AreEqual((size_t)15, t.interval.end);
+         Assert::IsFalse(t.match(text, 16));
+     }
+
+     // wcu_set: hit_offset is the index of the matching entry inside "abcD".
+     {
+         wcu_set t(L"abcD");
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 8));
+         Assert::AreEqual((size_t)8, t.interval.start);
+         Assert::AreEqual((size_t)9, t.interval.end);
+         Assert::AreEqual((size_t)0, t.hit_offset);
+         Assert::IsFalse(t.match(text, 21));
+         Assert::IsTrue(t.match(text, 21, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)21, t.interval.start);
+         Assert::AreEqual((size_t)22, t.interval.end);
+         Assert::AreEqual((size_t)3, t.hit_offset);
+     }
+
+     // wstring: "this" matches "This" only when case is ignored.
+     {
+         stdex::parser::wstring t(L"this");
+         Assert::IsFalse(t.match(text));
+         // `end` is a count of code units, so use _countof() rather than sizeof(),
+         // which would give the size of the wchar_t array in bytes.
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)4, t.interval.end);
+     }
+
+     // witerations with an upper bound: stops after five code units.
+     {
+         wany_cu chr;
+         witerations t(make_shared_no_delete(&chr), 1, 5);
+         Assert::IsTrue(t.match(text));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)5, t.interval.end);
+     }
+
+     // witerations over "not a space": consumes the first word.
+     {
+         wspace_cu nospace(true);
+         witerations t(make_shared_no_delete(&nospace), 1);
+         Assert::IsTrue(t.match(text));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)4, t.interval.end);
+     }
+
+     // wsequence: 't','h','i','s',space in that order.
+     {
+         wcu chr_t(L't'), chr_h(L'h'), chr_i(L'i'), chr_s(L's');
+         wspace_cu space;
+         wsequence t({
+             make_shared_no_delete>(&chr_t),
+             make_shared_no_delete>(&chr_h),
+             make_shared_no_delete>(&chr_i),
+             make_shared_no_delete>(&chr_s),
+             make_shared_no_delete>(&space) });
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)5, t.interval.end);
+     }
+
+     // wbranch: the third alternative ("this", index 2) is the one that hits.
+     {
+         stdex::parser::wstring apple(L"apple"), orange(L"orange"), _this(L"this");
+         wspace_cu space;
+         wbranch t({
+             make_shared_no_delete>(&apple),
+             make_shared_no_delete>(&orange),
+             make_shared_no_delete>(&_this),
+             make_shared_no_delete>(&space) });
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)2, t.hit_offset);
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)4, t.interval.end);
+     }
+
+     // wstring_branch: same alternatives given as a nullptr-terminated list.
+     {
+         wstring_branch t(L"apple", L"orange", L"this", nullptr);
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)2, t.hit_offset);
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)4, t.interval.end);
+     }
+
+     // wpermutation: the four letters listed as s,h,i,t are found as "This".
+     {
+         wcu chr_s(L's'), chr_h(L'h'), chr_i(L'i'), chr_t(L't');
+         wpermutation t({
+             make_shared_no_delete>(&chr_s),
+             make_shared_no_delete>(&chr_h),
+             make_shared_no_delete>(&chr_i),
+             make_shared_no_delete>(&chr_t) });
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)4, t.interval.end);
+     }
+ }
+
+ TEST_METHOD(sgml_test)
+ {
+     // Runs the SGML testers against a sample whose non-ASCII characters are
+     // written as SGML entities. Each entity counts as one code point but spans
+     // several chars, which is what the asserted intervals rely on:
+     // "&zcaron;" occupies 4..12, "&nbsp;" occupies 79..85 and the word
+     // "ko&zcaron;u&scaron;&ccaron;ku" occupies 2..31.
+     static const char text[] = "V ko&zcaron;u&scaron;&ccaron;ku zlobnega mizarja stopiclja fant\nin kli&ccaron;e&nbsp;1234567890.";
+
+     // sgml_noop: succeeds at offset 0 with an empty interval.
+     {
+         sgml_noop t;
+         Assert::IsTrue(t.match(text));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)0, t.interval.end);
+     }
+
+     // sgml_cp with a plain character: "v" matches 'V' only when case is ignored.
+     {
+         sgml_cp t("v");
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 0, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)0, t.interval.start);
+         Assert::AreEqual((size_t)1, t.interval.end);
+     }
+
+     // sgml_cp with an entity: upper-case Z-caron matches the lower-case entity
+     // in the text only when case is ignored; the interval covers the whole entity.
+     {
+         sgml_cp t("&Zcaron;");
+         Assert::IsFalse(t.match(text, 4));
+         Assert::IsTrue(t.match(text, 4, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)4, t.interval.start);
+         Assert::AreEqual((size_t)12, t.interval.end);
+     }
+
+     // sgml_space_cp: accepts a plain space (offset 1) and "&nbsp;" (offset 79).
+     {
+         sgml_space_cp t;
+         Assert::IsFalse(t.match(text));
+         Assert::IsTrue(t.match(text, 1));
+         Assert::AreEqual((size_t)1, t.interval.start);
+         Assert::AreEqual((size_t)2, t.interval.end);
+         Assert::IsTrue(t.match(text, 79));
+         Assert::AreEqual((size_t)79, t.interval.start);
+         Assert::AreEqual((size_t)85, t.interval.end);
+     }
+
+     // sgml_string_branch: the third alternative (index 2) hits when case is ignored.
+     {
+         sgml_string_branch t("apple", "orange", "Ko&Zcaron;u&Scaron;&ccaron;Ku", nullptr);
+         Assert::IsFalse(t.match(text, 2));
+         Assert::IsTrue(t.match(text, 2, _countof(text), match_case_insensitive));
+         Assert::AreEqual((size_t)2, t.hit_offset);
+         Assert::AreEqual((size_t)2, t.interval.start);
+         Assert::AreEqual((size_t)31, t.interval.end);
+     }
+ }
+ };
+}
diff --git a/UnitTests/pch.h b/UnitTests/pch.h
index 720e3e21d..63cd120af 100644
--- a/UnitTests/pch.h
+++ b/UnitTests/pch.h
@@ -15,7 +15,7 @@
#include
#include
#include
-//#include
+#include
#include
#include
#include
diff --git a/include/stdex/parser.h b/include/stdex/parser.h
new file mode 100644
index 000000000..9e7e888ca
--- /dev/null
+++ b/include/stdex/parser.h
@@ -0,0 +1,6531 @@
+/*
+ SPDX-License-Identifier: MIT
+ Copyright © 2023 Amebis
+*/
+
+#pragma once
+
+#include "interval.h"
+#include "memory.h"
+#include "sal.h"
+#include "sgml.h"
+#include "string.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _WIN32
+#include
+#else
+#include
+#include
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4100)
+#endif
+
+namespace stdex
+{
+ namespace parser
+ {
+ ///
+ /// Bit flags accepted by the `flags` parameter of basic_tester::match()
+ ///
+ /// match_default          - no flag set
+ /// match_case_insensitive - testers that compare characters fold case first
+ /// match_multiline        - space testers also accept line-break characters
+ ///
+ constexpr int match_default = 0x0;
+ constexpr int match_case_insensitive = 1 << 0;
+ constexpr int match_multiline = 1 << 1;
+
+ ///
+ /// Base template for all testers
+ ///
+ /// A tester inspects `text` at a given offset. Derived classes implement
+ /// match() and record the matched range in `interval`; invalidate() resets
+ /// that range to an invalid one (start > end).
+ ///
+ template
+ class basic_tester
+ {
+ public:
+     /// Creates the tester.
+     ///
+     /// \param[in] locale  Locale used for character classification and case folding. A copy is kept.
+     basic_tester(_In_ const std::locale& locale = std::locale()) : m_locale(locale) {}
+     virtual ~basic_tester() {}
+
+     /// Calls match() at every offset from `start` until `end` or the zero terminator is reached.
+     ///
+     /// \param[in] text   Text to search
+     /// \param[in] start  Offset of the first candidate position
+     /// \param[in] end    Offset one past the last code unit to consider
+     /// \param[in] flags  Combination of match_* flags, passed on to match()
+     ///
+     /// \returns `true` as soon as match() succeeds at some offset; `false` when no offset matched.
+     bool search(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         for (size_t i = start; i < end && text[i]; i++)
+             if (match(text, i, end, flags))
+                 return true;
+         return false;
+     }
+
+     /// Tests `text` at offset `start`. Implemented by each concrete tester.
+     ///
+     /// \param[in] text   Text to test
+     /// \param[in] start  Offset at which the match must begin
+     /// \param[in] end    Offset one past the last code unit to consider
+     /// \param[in] flags  Combination of match_* flags
+     ///
+     /// \returns `true` when the text matches at `start`
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default) = 0;
+
+     /// Convenience overload for string objects; `end` is clamped to the string size.
+     template
+     inline bool match(
+         const std::basic_string& text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         return match(text.c_str(), start, std::min(end, text.size()), flags);
+     }
+
+     /// Resets `interval` to an invalid range (start > end).
+     virtual void invalidate()
+     {
+         interval.start = 1;
+         interval.end = 0;
+     }
+
+ protected:
+     /// \cond internal
+     /// Decodes the character at `text[start]`, expanding an SGML entity if one starts there.
+     ///
+     /// \param[in]  text     SGML text
+     /// \param[in]  start    Offset of the character to decode
+     /// \param[in]  end      Offset one past the last char to consider
+     /// \param[out] chr_end  Offset just past the consumed input
+     /// \param[out] buf      Scratch buffer that holds the result unless it comes from sgml2uni()
+     ///
+     /// \returns Zero-terminated wide string for the decoded character. Anything that is
+     ///          not a complete, known entity yields the single char at `text[start]`.
+     const wchar_t* next_sgml_cp(_In_ const char* text, _In_ size_t start, _In_ size_t end, _Out_ size_t& chr_end, _Out_ wchar_t(&buf)[3])
+     {
+         if (text[start] == '&') {
+             // Potential entity start
+             const auto& ctype = std::use_facet>(m_locale);
+             for (chr_end = start + 1;; chr_end++) {
+                 if (chr_end >= end || text[chr_end] == 0) {
+                     // Unterminated entity
+                     break;
+                 }
+                 if (text[chr_end] == ';') {
+                     // Entity end
+                     size_t n = chr_end - start - 1;
+                     if (n >= 2 && text[start + 1] == '#') {
+                         // Numerical entity
+                         char32_t unicode;
+                         if (text[start + 2] == 'x' || text[start + 2] == 'X')
+                             unicode = strtou32(text + start + 3, n - 2, nullptr, 16);
+                         else
+                             unicode = strtou32(text + start + 2, n - 1, nullptr, 10);
+#ifdef _WIN32
+                         if (unicode < 0x10000) {
+                             buf[0] = (wchar_t)unicode;
+                             buf[1] = 0;
+                         }
+                         else {
+                             ucs4_to_surrogate_pair(buf, unicode);
+                             buf[2] = 0;
+                         }
+#else
+                         buf[0] = (wchar_t)unicode;
+                         buf[1] = 0;
+#endif
+                         chr_end++;
+                         return buf;
+                     }
+                     const wchar_t* entity_w = sgml2uni(text + start + 1, n);
+                     if (entity_w) {
+                         chr_end++;
+                         return entity_w;
+                     }
+                     // Unknown entity.
+                     break;
+                 }
+                 else if (text[chr_end] == '&' || ctype.is(ctype.space, text[chr_end])) {
+                     // This char cannot possibly be a part of entity.
+                     break;
+                 }
+             }
+         }
+         // Not an entity (or not a usable one): return the char itself.
+         buf[0] = text[start];
+         buf[1] = 0;
+         chr_end = start + 1;
+         return buf;
+     }
+     /// \endcond
+
+ public:
+     interval interval; ///< Test for interval
+
+ protected:
+     // Kept by value on purpose: constructors default the argument to std::locale(),
+     // a temporary that is gone once construction finishes, so a reference member
+     // would be left dangling.
+     std::locale m_locale;
+ };
+
+ using tester = basic_tester;
+ using wtester = basic_tester;
+#ifdef _UNICODE
+ using ttester = wtester;
+#else
+ using ttester = tester;
+#endif
+ using sgml_tester = basic_tester;
+
+ ///
+ /// Tester that consumes nothing
+ ///
+ /// Succeeds with an empty interval at `start` while `start` is still inside the
+ /// text (before `end` and before the zero terminator); fails otherwise.
+ ///
+ template
+ class basic_noop : public basic_tester
+ {
+ public:
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         const bool inside_text = start < end && text[start];
+         if (!inside_text) {
+             // Leave an invalid (start > end) interval behind.
+             interval.end = start;
+             interval.start = start + 1;
+             return false;
+         }
+         interval.end = start;
+         interval.start = start;
+         return true;
+     }
+ };
+
+ using noop = basic_noop;
+ using wnoop = basic_noop;
+#ifdef _UNICODE
+ using tnoop = wnoop;
+#else
+ using tnoop = noop;
+#endif
+ using sgml_noop = basic_noop;
+
+ ///
+ /// Tester that accepts any single code unit
+ ///
+ /// Succeeds with a one-unit interval at `start` while `start` is still inside
+ /// the text (before `end` and before the zero terminator); fails otherwise.
+ ///
+ template
+ class basic_any_cu : public basic_tester
+ {
+ public:
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         const bool inside_text = start < end && text[start];
+         if (!inside_text) {
+             // Leave an invalid (start > end) interval behind.
+             interval.end = start;
+             interval.start = start + 1;
+             return false;
+         }
+         interval.start = start;
+         interval.end = start + 1;
+         return true;
+     }
+ };
+
+ using any_cu = basic_any_cu;
+ using wany_cu = basic_any_cu;
+#ifdef _UNICODE
+ using tany_cu = wany_cu;
+#else
+ using tany_cu = any_cu;
+#endif
+
+ ///
+ /// Tester that accepts any single SGML code point
+ ///
+ /// A '&' followed by a ';' (with no '&' or space character in between) is
+ /// consumed as one entity; anything else is consumed as a single char.
+ ///
+ class sgml_any_cp : public basic_any_cu
+ {
+ public:
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start >= end || !text[start]) {
+             // Nothing left to consume: leave an invalid interval behind.
+             interval.end = start;
+             interval.start = start + 1;
+             return false;
+         }
+         if (text[start] == '&') {
+             // Possible SGML entity: look for the closing ';'.
+             const auto& ctype = std::use_facet>(m_locale);
+             for (size_t pos = start + 1; pos < end && text[pos]; ++pos) {
+                 const char c = text[pos];
+                 if (c == ';') {
+                     interval.start = start;
+                     interval.end = pos + 1;
+                     return true;
+                 }
+                 if (c == '&' || ctype.is(ctype.space, c))
+                     break; // cannot be part of an entity
+             }
+             // Unterminated entity: fall through and take the '&' alone.
+         }
+         interval.start = start;
+         interval.end = start + 1;
+         return true;
+     }
+ };
+
+ ///
+ /// Tester for one particular code unit
+ ///
+ /// Matches a single code unit equal to the one given at construction. With
+ /// match_case_insensitive both sides are lower-cased with the tester's locale
+ /// before comparing. `invert` makes the tester accept every code unit except
+ /// the given one.
+ ///
+ template
+ class basic_cu : public basic_tester
+ {
+ public:
+     basic_cu(T chr, bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_chr(chr),
+         m_invert(invert)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             bool same;
+             if (flags & match_case_insensitive) {
+                 const auto& facet = std::use_facet>(m_locale);
+                 same = facet.tolower(text[start]) == facet.tolower(m_chr);
+             }
+             else
+                 same = text[start] == m_chr;
+             // Succeed when the comparison result and the invert flag disagree.
+             if (same != m_invert) {
+                 interval.start = start;
+                 interval.end = start + 1;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     T m_chr;       ///< code unit to look for
+     bool m_invert; ///< accept everything but m_chr
+ };
+
+ using cu = basic_cu;
+ using wcu = basic_cu;
+#ifdef _UNICODE
+ using tcu = wcu;
+#else
+ using tcu = cu;
+#endif
+
+ ///
+ /// Tester for one particular SGML code point
+ ///
+ /// The code point is given as SGML text (a plain char or an entity) and is
+ /// decoded once at construction. match() decodes the text at `start` the same
+ /// way and compares the two wide strings.
+ ///
+ class sgml_cp : public sgml_tester
+ {
+ public:
+     sgml_cp(const char* chr, size_t count = (size_t)-1, bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         sgml_tester(locale),
+         m_invert(invert)
+     {
+         assert(chr || !count);
+         // With count == 0 there is nothing to decode and m_chr stays empty.
+         if (count) {
+             wchar_t scratch[3];
+             size_t consumed;
+             m_chr.assign(next_sgml_cp(chr, 0, count, consumed, scratch));
+         }
+     }
+
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             wchar_t scratch[3];
+             // next_sgml_cp() writes the end of the consumed input straight into interval.end.
+             const wchar_t* decoded = next_sgml_cp(text, start, end, interval.end, scratch);
+             bool equal;
+             if (flags & match_case_insensitive)
+                 equal = stdex::strnicmp(decoded, (size_t)-1, m_chr.c_str(), m_chr.size(), m_locale) == 0;
+             else
+                 equal = stdex::strncmp(decoded, (size_t)-1, m_chr.c_str(), m_chr.size()) == 0;
+             if (equal != m_invert) {
+                 interval.start = start;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     std::wstring m_chr; ///< decoded code point to look for
+     bool m_invert;      ///< accept everything but m_chr
+ };
+
+ ///
+ /// Tester for a single white-space code unit
+ ///
+ /// Uses the ctype facet of the tester's locale. Line-break characters are
+ /// rejected unless match_multiline is set. `invert` makes the tester accept
+ /// exactly the code units it would otherwise reject.
+ ///
+ template
+ class basic_space_cu : public basic_tester
+ {
+ public:
+     basic_space_cu(bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_invert(invert)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             bool is_space = false;
+             // Line breaks only qualify in multi-line mode.
+             if ((flags & match_multiline) || !islbreak(text[start]))
+                 is_space = std::use_facet>(m_locale).is(std::ctype_base::space, text[start]);
+             if (is_space != m_invert) {
+                 interval.start = start;
+                 interval.end = start + 1;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     bool m_invert; ///< accept non-space code units instead
+ };
+
+ using space_cu = basic_space_cu;
+ using wspace_cu = basic_space_cu;
+#ifdef _UNICODE
+ using tspace_cu = wspace_cu;
+#else
+ using tspace_cu = space_cu;
+#endif
+
+ ///
+ /// Tester for a single white-space SGML code point
+ ///
+ /// The text at `start` is decoded with next_sgml_cp() first, so an entity
+ /// counts as one code point. Line breaks are rejected unless match_multiline
+ /// is set.
+ ///
+ class sgml_space_cp : public basic_space_cu
+ {
+ public:
+     sgml_space_cp(_In_ bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_space_cu(invert, locale)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             wchar_t scratch[3];
+             // next_sgml_cp() writes the end of the consumed input straight into interval.end.
+             const wchar_t* const cp = next_sgml_cp(text, start, end, interval.end, scratch);
+             const wchar_t* const cp_end = cp + stdex::strlen(cp);
+             bool all_space = false;
+             if ((flags & match_multiline) || !islbreak(cp, (size_t)-1)) {
+                 // scan_not() reaching cp_end means every decoded unit is white space.
+                 all_space = std::use_facet>(m_locale).scan_not(std::ctype_base::space, cp, cp_end) == cp_end;
+             }
+             if (all_space != m_invert) {
+                 interval.start = start;
+                 return true;
+             }
+         }
+
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+ };
+
+ ///
+ /// Tester for a single punctuation code unit
+ ///
+ /// Uses the ctype facet of the tester's locale. `invert` makes the tester
+ /// accept exactly the code units it would otherwise reject.
+ ///
+ template
+ class basic_punct_cu : public basic_tester
+ {
+ public:
+     basic_punct_cu(bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_invert(invert)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             const bool is_punct = std::use_facet>(m_locale).is(std::ctype_base::punct, text[start]);
+             if (is_punct != m_invert) {
+                 interval.start = start;
+                 interval.end = start + 1;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     bool m_invert; ///< accept non-punctuation code units instead
+ };
+
+ using punct_cu = basic_punct_cu;
+ using wpunct_cu = basic_punct_cu;
+#ifdef _UNICODE
+ using tpunct_cu = wpunct_cu;
+#else
+ using tpunct_cu = punct_cu;
+#endif
+
+ ///
+ /// Test for any SGML punctuation code point
+ ///
+ /// The text at `start` is decoded with next_sgml_cp() first, so an entity
+ /// counts as one code point. The match succeeds when every decoded unit is
+ /// punctuation according to the tester's locale (or, with `invert`, when that
+ /// is not the case).
+ ///
+ class sgml_punct_cp : public basic_punct_cu
+ {
+ public:
+     /// Creates the tester.
+     ///
+     /// \param[in] invert  Accept non-punctuation code points instead
+     /// \param[in] locale  Locale used for character classification
+     sgml_punct_cp(_In_ bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_punct_cu(invert, locale)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             wchar_t buf[3];
+             // next_sgml_cp() writes the end of the consumed input straight into interval.end.
+             const wchar_t* chr = next_sgml_cp(text, start, end, interval.end, buf);
+             const wchar_t* chr_end = chr + stdex::strlen(chr);
+             // scan_not() reaching chr_end means every decoded unit is punctuation.
+             bool r = std::use_facet>(m_locale).scan_not(std::ctype_base::punct, chr, chr_end) == chr_end;
+             if (r && !m_invert || !r && m_invert) {
+                 interval.start = start;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.start = (interval.end = start) + 1;
+         return false;
+     }
+ };
+
+ ///
+ /// Tester for a single white-space or punctuation code unit
+ ///
+ /// Uses the ctype facet of the tester's locale. Line-break characters are
+ /// rejected unless match_multiline is set. `invert` makes the tester accept
+ /// exactly the code units it would otherwise reject.
+ ///
+ template
+ class basic_space_or_punct_cu : public basic_tester
+ {
+ public:
+     basic_space_or_punct_cu(bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_invert(invert)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             bool hit = false;
+             // Line breaks only qualify in multi-line mode.
+             if ((flags & match_multiline) || !islbreak(text[start]))
+                 hit = std::use_facet>(m_locale).is(std::ctype_base::space | std::ctype_base::punct, text[start]);
+             if (hit != m_invert) {
+                 interval.start = start;
+                 interval.end = start + 1;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     bool m_invert; ///< accept code units that are neither space nor punctuation instead
+ };
+
+ using space_or_punct_cu = basic_space_or_punct_cu;
+ using wspace_or_punct_cu = basic_space_or_punct_cu;
+#ifdef _UNICODE
+ using tspace_or_punct_cu = wspace_or_punct_cu;
+#else
+ using tspace_or_punct_cu = space_or_punct_cu;
+#endif
+
+ ///
+ /// Test for any SGML space or punctuation code point
+ ///
+ /// The text at `start` is decoded with next_sgml_cp() first, so an entity
+ /// counts as one code point. Line breaks are rejected unless match_multiline
+ /// is set.
+ ///
+ class sgml_space_or_punct_cp : public basic_space_or_punct_cu
+ {
+ public:
+     /// Creates the tester.
+     ///
+     /// \param[in] invert  Accept code points that are neither space nor punctuation instead
+     /// \param[in] locale  Locale used for character classification
+     sgml_space_or_punct_cp(_In_ bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_space_or_punct_cu(invert, locale)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             wchar_t buf[3];
+             // next_sgml_cp() writes the end of the consumed input straight into interval.end.
+             const wchar_t* chr = next_sgml_cp(text, start, end, interval.end, buf);
+             const wchar_t* chr_end = chr + stdex::strlen(chr);
+             // scan_not() reaching chr_end means every decoded unit is space or punctuation.
+             bool r =
+                 ((flags & match_multiline) || !islbreak(chr, (size_t)-1)) &&
+                 std::use_facet>(m_locale).scan_not(std::ctype_base::space | std::ctype_base::punct, chr, chr_end) == chr_end;
+             if (r && !m_invert || !r && m_invert) {
+                 interval.start = start;
+                 return true;
+             }
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.start = (interval.end = start) + 1;
+         return false;
+     }
+ };
+
+ ///
+ /// Tester for the beginning of a line
+ ///
+ /// Zero-width: succeeds with an empty interval at `start` when `start` is 0 or
+ /// the code unit just before `start` is a line break. `invert` flips the outcome.
+ ///
+ template
+ class basic_bol : public basic_tester
+ {
+ public:
+     basic_bol(bool invert = false) : m_invert(invert) {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         bool at_line_start = true;
+         if (start != 0) {
+             // Look at the preceding code unit only when start does not exceed end.
+             at_line_start = (start <= end) && islbreak(text[start - 1]);
+         }
+         if (at_line_start != m_invert) {
+             interval.start = start;
+             interval.end = start;
+             return true;
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     bool m_invert; ///< succeed everywhere except at the beginning of a line
+ };
+
+ using bol = basic_bol;
+ using wbol = basic_bol;
+#ifdef _UNICODE
+ using tbol = wbol;
+#else
+ using tbol = bol;
+#endif
+ using sgml_bol = basic_bol;
+
+ ///
+ /// Test for end of line
+ ///
+ /// Zero-width: succeeds with an empty interval at `start` when the code unit
+ /// at `start` is a line break. `invert` flips the outcome.
+ ///
+ template
+ class basic_eol : public basic_tester
+ {
+ public:
+     basic_eol(bool invert = false) : m_invert(invert) {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         // Read text[start] only when it lies before `end`: the assertion above
+         // permits a null `text` when start >= end, and anything at or past
+         // `end` is outside the range the caller asked us to look at.
+         bool r = start < end && islbreak(text[start]);
+         if (r != m_invert) {
+             interval.end = interval.start = start;
+             return true;
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.start = (interval.end = start) + 1;
+         return false;
+     }
+
+ protected:
+     bool m_invert; ///< succeed everywhere except at the end of a line
+ };
+
+ using eol = basic_eol;
+ using weol = basic_eol;
+#ifdef _UNICODE
+ using teol = weol;
+#else
+ using teol = eol;
+#endif
+ using sgml_eol = basic_eol;
+
+ ///
+ /// Common base for testers that look a character up in a set
+ ///
+ /// Adds `hit_offset`, which derived testers fill in on a match, and the
+ /// `invert` switch. match() stays pure virtual.
+ ///
+ template
+ class basic_set : public basic_tester
+ {
+ public:
+     basic_set(bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         hit_offset(static_cast<size_t>(-1)),
+         m_invert(invert)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default) = 0;
+
+     /// Resets the interval and forgets the last hit.
+     virtual void invalidate()
+     {
+         basic_tester::invalidate();
+         hit_offset = static_cast<size_t>(-1);
+     }
+
+ public:
+     size_t hit_offset; ///< offset of the matching entry in the set, (size_t)-1 when there is none
+
+ protected:
+     bool m_invert; ///< accept characters that are not in the set instead
+ };
+
+ ///
+ /// Tester for any code unit out of a given string of code units
+ ///
+ /// On a match `hit_offset` is the position of the found code unit inside the
+ /// set. With `invert` the tester accepts code units that are not in the set
+ /// and `hit_offset` is (size_t)-1.
+ ///
+ template
+ class basic_cu_set : public basic_set
+ {
+ public:
+     basic_cu_set(
+         _In_reads_or_z_(count) const T* set,
+         _In_ size_t count = (size_t)-1,
+         _In_ bool invert = false,
+         _In_ const std::locale& locale = std::locale()) :
+         basic_set(invert, locale)
+     {
+         // A null set leaves m_set empty.
+         if (set)
+             m_set.assign(set, set + stdex::strnlen(set, count));
+     }
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             const T* const first = m_set.c_str();
+             const T* hit;
+             if (flags & match_case_insensitive)
+                 hit = stdex::strnichr(first, text[start], m_set.size(), m_locale);
+             else
+                 hit = stdex::strnchr(first, text[start], m_set.size());
+             const bool found = hit != nullptr;
+             if (found != m_invert) {
+                 hit_offset = found ? hit - first : (size_t)-1;
+                 interval.start = start;
+                 interval.end = start + 1;
+                 return true;
+             }
+         }
+         // No match: forget the hit and leave an invalid interval behind.
+         hit_offset = (size_t)-1;
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     std::basic_string m_set; ///< code units to look for
+ };
+
+ using cu_set = basic_cu_set;
+ using wcu_set = basic_cu_set;
+#ifdef _UNICODE
+ using tcu_set = wcu_set;
+#else
+ using tcu_set = cu_set;
+#endif
+
+ ///
+ /// Tester for any SGML code point out of a given string of SGML code points
+ ///
+ /// The set is converted with sgml2str() at construction. match() decodes the
+ /// text at `start` with next_sgml_cp() and searches the converted set for it;
+ /// on a match `hit_offset` is the position found inside the converted set.
+ ///
+ class sgml_cp_set : public basic_set
+ {
+ public:
+     sgml_cp_set(const char* set, size_t count = (size_t)-1, bool invert = false, _In_ const std::locale& locale = std::locale()) :
+         basic_set(invert, locale)
+     {
+         // A null set leaves m_set empty.
+         if (set)
+             m_set = sgml2str(set, count);
+     }
+
+     virtual bool match(
+         _In_reads_or_z_(end) const char* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         if (start < end && text[start]) {
+             wchar_t scratch[3];
+             // next_sgml_cp() writes the end of the consumed input straight into interval.end.
+             const wchar_t* const cp = next_sgml_cp(text, start, end, interval.end, scratch);
+             const wchar_t* const first = m_set.c_str();
+             const wchar_t* hit;
+             if (flags & match_case_insensitive)
+                 hit = stdex::strnistr(first, cp, m_set.size(), m_locale);
+             else
+                 hit = stdex::strnstr(first, cp, m_set.size());
+             const bool found = hit != nullptr;
+             if (found != m_invert) {
+                 hit_offset = found ? hit - first : (size_t)-1;
+                 interval.start = start;
+                 return true;
+             }
+         }
+         // No match: forget the hit and leave an invalid interval behind.
+         hit_offset = (size_t)-1;
+         interval.end = start;
+         interval.start = start + 1;
+         return false;
+     }
+
+ protected:
+     std::wstring m_set; ///< converted set of code points to look for
+ };
+
+ ///
+ /// Test for given string
+ ///
+ /// Compares the text at `start` with the string given at construction, using
+ /// stdex::strnicmp() when match_case_insensitive is set and stdex::strncmp()
+ /// otherwise. At most `end - start` code units of the text take part.
+ ///
+ template
+ class basic_string : public basic_tester
+ {
+ public:
+     /// Creates the tester.
+     ///
+     /// \param[in] str     String to look for
+     /// \param[in] count   Maximum number of code units to take from `str`
+     /// \param[in] locale  Locale used for case-insensitive comparison
+     basic_string(
+         _In_reads_or_z_(count) const T* str,
+         _In_ size_t count = (size_t)-1,
+         _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_str(str, str + stdex::strnlen(str, count))
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         const size_t m = m_str.size();
+         // `end - start` is unsigned: with start > end it would wrap around to a
+         // huge value and let the comparison run past `end`. Treat that case as
+         // "no text available".
+         const size_t n = start < end ? std::min(end - start, m) : 0;
+         bool r = ((flags & match_case_insensitive) ?
+             stdex::strnicmp(text + start, n, m_str.c_str(), m, m_locale) :
+             stdex::strncmp(text + start, n, m_str.c_str(), m)) == 0;
+         if (r) {
+             interval.end = (interval.start = start) + n;
+             return true;
+         }
+         // No match: leave an invalid (start > end) interval behind.
+         interval.start = (interval.end = start) + 1;
+         return false;
+     }
+
+ protected:
+     std::basic_string m_str; ///< string to look for
+ };
+
+ using string = basic_string;
+ using wstring = basic_string;
+#ifdef _UNICODE
+ using tstring = wstring;
+#else
+ using tstring = string;
+#endif
+
+ /// Test for a given string in SGML text.
+ /// The pattern is converted with sgml2str() at construction; match() decodes the text one character or entity at a time with next_sgml_cp() and compares it with the pattern.
+ /// With match_case_insensitive both sides are lower-cased with the ctype facet of the tester's locale before they are compared.
+ class sgml_string : public sgml_tester
+ {
+ public:
+ sgml_string(const char* str, size_t count = (size_t)-1, _In_ const std::locale& locale = std::locale()) :
+ sgml_tester(locale),
+ m_str(sgml2str(str, count)) // pattern is stored already converted
+ {}
+
+ virtual bool match(
+ _In_reads_or_z_(end) const char* text,
+ _In_ size_t start = 0,
+ _In_ size_t end = (size_t)-1,
+ _In_ int flags = match_default)
+ {
+ assert(text || start >= end);
+ const wchar_t* str = m_str.c_str(); // read position in the pattern
+ const bool case_insensitive = flags & match_case_insensitive ? true : false;
+ const auto& ctype = std::use_facet>(m_locale);
+ for (interval.end = start;;) { // interval.end doubles as the read position in the text
+ if (!*str) { // whole pattern consumed: match
+ interval.start = start;
+ return true;
+ }
+ if (interval.end >= end || !text[interval.end]) { // text ran out before the pattern did
+ interval.start = (interval.end = start) + 1; // invalid interval (start > end)
+ return false;
+ }
+ wchar_t buf[3];
+ const wchar_t* chr = next_sgml_cp(text, interval.end, end, interval.end, buf); // decodes at interval.end and moves it past the consumed input
+ for (; *chr; ++str, ++chr) { // compare every unit of the decoded character with the pattern
+ if (!*str || // pattern ends in the middle of the decoded character
+ (case_insensitive ? ctype.tolower(*str) != ctype.tolower(*chr) : *str != *chr))
+ {
+ interval.start = (interval.end = start) + 1; // invalid interval (start > end)
+ return false;
+ }
+ }
+ }
+ }
+
+ protected:
+ std::wstring m_str; // pattern as produced by sgml2str()
+ };
+
+ ///
+ /// Tester that repeats another tester
+ ///
+ /// Applies the element again and again, each time where the previous
+ /// repetition ended. Succeeds once at least `min_iterations` repetitions
+ /// matched; a greedy tester keeps going up to `max_iterations`, a non-greedy
+ /// one stops as soon as the minimum is reached.
+ ///
+ template
+ class basic_iterations : public basic_tester
+ {
+ public:
+     basic_iterations(const std::shared_ptr>& el, size_t min_iterations = 0, size_t max_iterations = (size_t)-1, bool greedy = true) :
+         m_el(el),
+         m_min_iterations(min_iterations),
+         m_max_iterations(max_iterations),
+         m_greedy(greedy)
+     {}
+
+     virtual bool match(
+         _In_reads_or_z_(end) const T* text,
+         _In_ size_t start = 0,
+         _In_ size_t end = (size_t)-1,
+         _In_ int flags = match_default)
+     {
+         assert(text || start >= end);
+         interval.start = start;
+         interval.end = start;
+         size_t done = 0; // repetitions matched so far
+         for (;;) {
+             const bool enough = done >= m_min_iterations;
+             if ((!m_greedy && enough) || done >= m_max_iterations)
+                 return true;
+             if (!m_el->match(text, interval.end, end, flags)) {
+                 if (enough)
+                     return true;
+                 // Too few repetitions: leave an invalid (start > end) interval behind.
+                 interval.end = start;
+                 interval.start = start + 1;
+                 return false;
+             }
+             if (m_el->interval.end == interval.end) {
+                 // The element matched without consuming anything. Stop here rather than loop forever.
+                 return true;
+             }
+             interval.end = m_el->interval.end;
+             ++done;
+         }
+     }
+
+ protected:
+     std::shared_ptr> m_el; ///< repeating element
+     size_t m_min_iterations; ///< minimum number of iterations
+     size_t m_max_iterations; ///< maximum number of iterations
+     bool m_greedy; ///< try to match as long sequence as possible
+ };
+
+ using iterations = basic_iterations;
+ using witerations = basic_iterations;
+#ifdef _UNICODE
+ using titerations = witerations;
+#else
+ using titerations = iterations;
+#endif
+ using sgml_iterations = basic_iterations;
+
+ ///
+ /// Common base for testers that own a list of other testers
+ ///
+ /// Stores the elements and forwards invalidate() to each of them. match() is
+ /// left to the derived classes.
+ ///
+ template
+ class tester_collection : public basic_tester
+ {
+ protected:
+     /// Creates an empty collection; derived classes fill it themselves.
+     tester_collection(_In_ const std::locale& locale) : basic_tester(locale) {}
+
+ public:
+     /// Copies `count` elements from the array `el`.
+     tester_collection(
+         _In_count_(count) const std::shared_ptr>* el,
+         _In_ size_t count,
+         _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale)
+     {
+         assert(el || !count);
+         m_collection.reserve(count);
+         for (size_t idx = 0; idx != count; ++idx)
+             m_collection.push_back(el[idx]);
+     }
+
+     /// Takes over an already built list of elements.
+     tester_collection(
+         _Inout_ std::vector>>&& collection,
+         _In_ const std::locale& locale = std::locale()) :
+         basic_tester(locale),
+         m_collection(std::move(collection))
+     {}
+
+     /// Invalidates every element, then this tester itself.
+     virtual void invalidate()
+     {
+         for (auto& el : m_collection)
+             el->invalidate();
+         basic_tester::invalidate();
+     }
+
+ protected:
+     std::vector>> m_collection; ///< owned elements, in the order given
+ };
+
+ /// Test for a sequence of testers.
+ /// Runs the collected testers one after another, each starting where the previous one ended; succeeds only when all of them match.
+ /// On success `interval` spans from `start` to the end of the last element; an empty collection matches with an empty interval.
+ template
+ class basic_sequence : public tester_collection
+ {
+ public:
+ basic_sequence(
+ _In_count_(count) const std::shared_ptr>* el = nullptr,
+ _In_ size_t count = 0,
+ _In_ const std::locale& locale = std::locale()) :
+ tester_collection(el, count, locale)
+ {}
+
+ basic_sequence(
+ _Inout_ std::vector>>&& collection,
+ _In_ const std::locale& locale = std::locale()) :
+ tester_collection(std::move(collection), locale)
+ {}
+
+ virtual bool match(
+ _In_reads_or_z_(end) const T* text,
+ _In_ size_t start = 0,
+ _In_ size_t end = (size_t)-1,
+ _In_ int flags = match_default)
+ {
+ assert(text || start >= end);
+ interval.end = start; // interval.end is the position where the next element has to match
+ for (auto i = m_collection.begin(); i != m_collection.end(); ++i) {
+ if (!(*i)->match(text, interval.end, end, flags)) {
+ for (++i; i != m_collection.end(); ++i) // elements after the failing one were never tried: invalidate them
+ (*i)->invalidate();
+ interval.start = (interval.end = start) + 1; // invalid interval (start > end)
+ return false;
+ }
+ interval.end = (*i)->interval.end; // continue right after this element
+ }
+ interval.start = start;
+ return true;
+ }
+ };
+
+ using sequence = basic_sequence;
+ using wsequence = basic_sequence;
+#ifdef _UNICODE
+ using tsequence = wsequence;
+#else
+ using tsequence = sequence;
+#endif
+ using sgml_sequence = basic_sequence;
+
+ /// Test for any: the elements of m_collection are tried in order, all at the same start offset,
+ /// and the first element that matches decides the result.
+ /// hit_offset holds the index of that element, or (size_t)-1 when there is no hit.
+ template
+ class basic_branch : public tester_collection
+ {
+ protected:
+ basic_branch(_In_ const std::locale& locale) : // locale-only constructor; protected, so only derived classes can use it
+ tester_collection(locale),
+ hit_offset((size_t)-1) // no hit yet
+ {}
+
+ public:
+ basic_branch( // Builds the branch from an array of count elements
+ _In_count_(count) const std::shared_ptr>* el = nullptr, // array of elements, forwarded to the base
+ _In_ size_t count = 0, // number of entries in el
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to the base
+ tester_collection(el, count, locale),
+ hit_offset((size_t)-1)
+ {}
+
+ basic_branch( // Builds the branch by handing an existing vector of elements to the base
+ _Inout_ std::vector>>&& collection, // passed on with std::move, so the caller's vector is left moved-from
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to the base
+ tester_collection(std::move(collection), locale),
+ hit_offset((size_t)-1)
+ {}
+
+ virtual bool match( // Tries each element at start; succeeds with the first element that matches
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset at which every element is tried
+ _In_ size_t end = (size_t)-1, // end offset handed to every element
+ _In_ int flags = match_default) // flags handed unchanged to every element
+ { // Returns true with interval copied from the matching element, false when no element matches
+ assert(text || start >= end);
+ hit_offset = 0; // index of the element being tried; advanced together with the iterator
+ for (auto i = m_collection.begin(); i != m_collection.end(); ++i, ++hit_offset) {
+ if ((*i)->match(text, start, end, flags)) {
+ interval = (*i)->interval; // the branch takes over the winning element's interval
+ for (++i; i != m_collection.end(); ++i) // the remaining elements were not tried in this call...
+ (*i)->invalidate(); // ...so invalidate them
+ return true; // returning here skips the loop increment, so hit_offset stays at the winner's index
+ }
+ }
+ hit_offset = (size_t)-1; // nothing matched
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ virtual void invalidate() // Clears the recorded hit, then delegates to tester_collection::invalidate()
+ {
+ hit_offset = (size_t)-1;
+ tester_collection::invalidate();
+ }
+
+ public:
+ size_t hit_offset; ///< Index in m_collection of the element that matched; (size_t)-1 when there is none
+ };
+
+ using branch = basic_branch;
+ using wbranch = basic_branch;
+#ifdef _UNICODE // tbranch is wbranch when _UNICODE is defined...
+ using tbranch = wbranch;
+#else // ...and branch otherwise
+ using tbranch = branch;
+#endif
+ using sgml_branch = basic_branch;
+
+ /// Test for any string: a basic_branch whose elements are created here, one per string (via std::make_shared).
+ /// The strings are given either as one buffer of consecutive zero-terminated strings,
+ /// or as a variadic list of string pointers that must end with a null pointer.
+ template >
+ class basic_string_branch : public basic_branch
+ {
+ public:
+ inline basic_string_branch( // Builds the elements from a buffer of consecutive zero-terminated strings
+ _In_reads_(count) const T* str_z = nullptr, // buffer with the strings; may be null only when count is 0 (asserted in build)
+ _In_ size_t count = 0, // size of the buffer, in characters
+ _In_ const std::locale& locale = std::locale()) : // locale passed to the base and, through m_locale, to every element
+ basic_branch(locale)
+ {
+ build(str_z, count);
+ }
+
+ inline basic_string_branch(_In_z_ const T* str, ...) : // Variadic form using a default-constructed locale; the last argument must be a null const T*
+ basic_branch(std::locale())
+ {
+ va_list params;
+ va_start(params, str);
+ build(str, params);
+ va_end(params);
+ }
+
+ inline basic_string_branch(_In_ const std::locale& locale, _In_z_ const T* str, ...) : // Variadic form with an explicit locale; the last argument must be a null const T*
+ basic_branch(locale)
+ {
+ va_list params;
+ va_start(params, str);
+ build(str, params);
+ va_end(params);
+ }
+
+ protected:
+ void build(_In_reads_(count) const T* str_z, _In_ size_t count) // Appends one element per string found in the buffer
+ {
+ assert(str_z || !count);
+ if (count) {
+ size_t offset, n;
+ for ( // first pass: only count the strings (n) so the vector can be reserved once
+ offset = n = 0;
+ offset < count && str_z[offset]; // stop at the end of the buffer, or where a string would begin with a zero character
+ offset += stdex::strnlen(str_z + offset, count - offset) + 1, ++n); // + 1 steps over the terminator - assumes stdex::strnlen returns the length without it (TODO confirm)
+ m_collection.reserve(n);
+ for ( // second pass: same walk, this time creating the elements
+ offset = 0;
+ offset < count && str_z[offset];
+ offset += stdex::strnlen(str_z + offset, count - offset) + 1)
+ m_collection.push_back(std::move(std::make_shared(str_z + offset, count - offset, m_locale))); // the element is given the rest of the buffer as its length limit, not the string's own length
+ }
+ }
+
+ void build(_In_z_ const T* str, _In_ va_list params) // Appends an element for str and for every following argument, up to the first null pointer
+ {
+ const T* p;
+ for ( // the whole job is done in the loop header; the loop body is empty
+ m_collection.push_back(std::move(std::make_shared(str, (size_t)-1, m_locale))); // init: the first string is added unconditionally
+ (p = va_arg(params, const T*)) != nullptr; // condition: fetch the next argument; a null pointer ends the list
+ m_collection.push_back(std::move(std::make_shared(p, (size_t)-1, m_locale)))); // step: add the string just fetched
+ }
+ };
+
+ using string_branch = basic_string_branch;
+ using wstring_branch = basic_string_branch;
+#ifdef _UNICODE // tstring_branch is wstring_branch when _UNICODE is defined...
+ using tstring_branch = wstring_branch;
+#else // ...and string_branch otherwise
+ using tstring_branch = string_branch;
+#endif
+ using sgml_string_branch = basic_string_branch;
+
+ /// Test for permutation: succeeds when every element of m_collection matches once,
+ /// one right after another beginning at start, in any order.
+ /// The possible orders are explored recursively with backtracking (see match_recursively).
+ template
+ class basic_permutation : public tester_collection
+ {
+ public:
+ basic_permutation( // Builds the permutation from an array of count elements
+ _In_count_(count) const std::shared_ptr>* el = nullptr, // array of elements, forwarded to the base
+ _In_ size_t count = 0, // number of entries in el
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to the base
+ tester_collection(el, count, locale)
+ {}
+
+ basic_permutation( // Builds the permutation by handing an existing vector of elements to the base
+ _Inout_ std::vector>>&& collection, // passed on with std::move, so the caller's vector is left moved-from
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to the base
+ tester_collection(std::move(collection), locale)
+ {}
+
+ virtual bool match( // Tests whether all elements match back to back from start, in some order
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset where the first matched element must begin
+ _In_ size_t end = (size_t)-1, // end offset handed to every element
+ _In_ int flags = match_default) // flags handed unchanged to every element
+ { // Returns true and sets interval on success; returns false otherwise
+ assert(text || start >= end);
+ for (auto i = m_collection.begin(); i != m_collection.end(); ++i) // start clean: match_recursively reads a false interval as "not placed yet"
+ (*i)->invalidate();
+ if (match_recursively(text, start, end, flags)) {
+ interval.start = start; // interval.end was already set by the innermost match_recursively call
+ return true;
+ }
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ protected:
+ bool match_recursively( // Tries to place every not-yet-matched element, beginning at start; returns true when all elements end up matched
+ _In_reads_or_z_(end) const T* text, // text to test
+ _In_ size_t start = 0, // offset where the next element must begin
+ _In_ size_t end = (size_t)-1, // end offset handed to every element
+ _In_ int flags = match_default) // flags handed unchanged to every element
+ {
+ bool all_matched = true; // stays true only if no element is found with a false interval
+ for (auto i = m_collection.begin(); i != m_collection.cend(); ++i) {
+ if (!(*i)->interval) {
+ // Element was not matched in permutation yet.
+ all_matched = false;
+ if ((*i)->match(text, start, end, flags)) {
+ // Element matched for the first time.
+ if (match_recursively(text, (*i)->interval.end, end, flags)) {
+ // Rest of the elements matched too.
+ return true;
+ }
+ (*i)->invalidate(); // backtrack: release this element so that another one can be tried at this position
+ }
+ }
+ }
+ if (all_matched) {
+ interval.end = start; // every element is placed: start is where the last one ended
+ return true;
+ }
+ return false; // some element is still unplaced and none of the unplaced ones led to a full match from here
+ }
+ };
+
+ using permutation = basic_permutation;
+ using wpermutation = basic_permutation;
+#ifdef _UNICODE // tpermutation is wpermutation when _UNICODE is defined...
+ using tpermutation = wpermutation;
+#else // ...and permutation otherwise
+ using tpermutation = permutation;
+#endif
+ using sgml_permutation = basic_permutation;
+
+ /// Base class for integer testing.
+ /// Adds a public value member to basic_tester: it is initialized to 0 and invalidate() sets it back to 0.
+ /// This class does not define match() itself.
+ template
+ class basic_integer : public basic_tester
+ {
+ public:
+ basic_integer(_In_ const std::locale& locale = std::locale()) : // locale is forwarded to basic_tester
+ basic_tester(locale),
+ value(0)
+ {}
+
+ virtual void invalidate() // Resets value, then delegates to basic_tester::invalidate()
+ {
+ value = 0;
+ basic_tester::invalidate();
+ }
+
+ public:
+ size_t value; ///< Calculated value of the numeral
+ };
+
+ /// Test for decimal integer: consumes consecutive digits beginning at start and accumulates them into value in base 10.
+ /// Each of the digits 0..9 is recognized by its own tester (m_digit_0 .. m_digit_9); at least one digit is required.
+ /// NOTE(review): value is a size_t and the accumulation is not checked for overflow.
+ template
+ class basic_integer10 : public basic_integer
+ {
+ public:
+ basic_integer10( // Stores one tester per decimal digit; match() dereferences all of them without a null check
+ _In_ const std::shared_ptr>& digit_0,
+ _In_ const std::shared_ptr>& digit_1,
+ _In_ const std::shared_ptr>& digit_2,
+ _In_ const std::shared_ptr>& digit_3,
+ _In_ const std::shared_ptr>& digit_4,
+ _In_ const std::shared_ptr>& digit_5,
+ _In_ const std::shared_ptr>& digit_6,
+ _In_ const std::shared_ptr>& digit_7,
+ _In_ const std::shared_ptr>& digit_8,
+ _In_ const std::shared_ptr>& digit_9,
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to basic_integer
+ basic_integer(locale),
+ m_digit_0(digit_0),
+ m_digit_1(digit_1),
+ m_digit_2(digit_2),
+ m_digit_3(digit_3),
+ m_digit_4(digit_4),
+ m_digit_5(digit_5),
+ m_digit_6(digit_6),
+ m_digit_7(digit_7),
+ m_digit_8(digit_8),
+ m_digit_9(digit_9)
+ {}
+
+ virtual bool match( // Reads as many digits as possible beginning at start
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset of the first digit
+ _In_ size_t end = (size_t)-1, // offset at which reading stops at the latest
+ _In_ int flags = match_default) // flags handed unchanged to the digit testers
+ { // Returns true when at least one digit was read; value then holds the number
+ assert(text || start >= end);
+ for (interval.end = start, value = 0; interval.end < end && text[interval.end];) { // stops at end or at a zero character; interval.end is advanced in the body
+ size_t dig;
+ if (m_digit_0->match(text, interval.end, end, flags)) { dig = 0; interval.end = m_digit_0->interval.end; }
+ else if (m_digit_1->match(text, interval.end, end, flags)) { dig = 1; interval.end = m_digit_1->interval.end; }
+ else if (m_digit_2->match(text, interval.end, end, flags)) { dig = 2; interval.end = m_digit_2->interval.end; }
+ else if (m_digit_3->match(text, interval.end, end, flags)) { dig = 3; interval.end = m_digit_3->interval.end; }
+ else if (m_digit_4->match(text, interval.end, end, flags)) { dig = 4; interval.end = m_digit_4->interval.end; }
+ else if (m_digit_5->match(text, interval.end, end, flags)) { dig = 5; interval.end = m_digit_5->interval.end; }
+ else if (m_digit_6->match(text, interval.end, end, flags)) { dig = 6; interval.end = m_digit_6->interval.end; }
+ else if (m_digit_7->match(text, interval.end, end, flags)) { dig = 7; interval.end = m_digit_7->interval.end; }
+ else if (m_digit_8->match(text, interval.end, end, flags)) { dig = 8; interval.end = m_digit_8->interval.end; }
+ else if (m_digit_9->match(text, interval.end, end, flags)) { dig = 9; interval.end = m_digit_9->interval.end; }
+ else break; // no digit at this position: the number ends here
+ value = value * 10 + dig; // append the digit in base 10
+ }
+ if (start < interval.end) { // at least one digit was consumed
+ interval.start = start;
+ return true;
+ }
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ protected:
+ std::shared_ptr> // testers for the individual digits, named after the digit value they stand for
+ m_digit_0,
+ m_digit_1,
+ m_digit_2,
+ m_digit_3,
+ m_digit_4,
+ m_digit_5,
+ m_digit_6,
+ m_digit_7,
+ m_digit_8,
+ m_digit_9;
+ };
+
+ using integer10 = basic_integer10;
+ using winteger10 = basic_integer10;
+#ifdef _UNICODE // tinteger10 is winteger10 when _UNICODE is defined...
+ using tinteger10 = winteger10;
+#else // ...and integer10 otherwise
+ using tinteger10 = integer10;
+#endif
+ using sgml_integer10 = basic_integer10;
+
+ /// Test for decimal integer possibly containing thousand separators: a leading group of digits, optionally followed
+ /// by (separator, exactly three digits) groups - those are only looked for when the leading group has at most three digits.
+ /// All separators of one number must be the same alternative of m_separator (equal hit_offset).
+ template
+ class basic_integer10ts : public basic_integer
+ {
+ public:
+ basic_integer10ts( // Stores the digit-group tester and the separator tester; match() dereferences both without a null check
+ _In_ const std::shared_ptr>& digits, // tester for one group of digits; match() reads its value and interval
+ _In_ const std::shared_ptr>& separator, // tester for the thousand separator; match() reads its interval and hit_offset
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to basic_integer
+ basic_integer(locale),
+ digit_count(0),
+ has_separators(false),
+ m_digits(digits),
+ m_separator(separator)
+ {}
+
+ virtual bool match( // Reads a digit group at start and then as many (separator, three digits) groups as follow
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset of the first digit
+ _In_ size_t end = (size_t)-1, // end offset handed to the digit and separator testers
+ _In_ int flags = match_default) // flags handed unchanged to the digit and separator testers
+ { // Returns true with value, digit_count, has_separators and interval set; on failure returns false with value, digit_count and has_separators reset
+ assert(text || start >= end);
+ if (m_digits->match(text, start, end, flags)) {
+ // Leading part match.
+ value = m_digits->value;
+ digit_count = m_digits->interval.size();
+ has_separators = false;
+ interval.start = start;
+ interval.end = m_digits->interval.end;
+ if (m_digits->interval.size() <= 3) {
+ // Maybe separated with thousand separators?
+ size_t hit_offset = (size_t)-1; // separator alternative used so far; (size_t)-1 until the first group is accepted
+ while (m_separator->match(text, interval.end, end, flags) &&
+ (hit_offset == (size_t)-1 || hit_offset == m_separator->hit_offset) && // All separators must be the same, no mixing.
+ m_digits->match(text, m_separator->interval.end, end, flags) && // m_digits is re-run for every group, so afterwards it describes the last group tried, not the whole number
+ m_digits->interval.size() == 3)
+ {
+ // Thousand separator and three-digit integer followed.
+ value = value * 1000 + m_digits->value;
+ digit_count += 3;
+ has_separators = true;
+ interval.end = m_digits->interval.end;
+ hit_offset = m_separator->hit_offset;
+ }
+ }
+
+ return true;
+ }
+ value = 0; digit_count = 0; has_separators = false; // failure: reset all results, so nothing stale from an earlier successful match remains
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ virtual void invalidate() // Resets digit_count and has_separators, then delegates to basic_integer::invalidate()
+ {
+ digit_count = 0;
+ has_separators = false;
+ basic_integer::invalidate();
+ }
+
+ public:
+ size_t digit_count; ///< Total number of digits in integer
+ bool has_separators; ///< Did integer have any separators?
+
+ protected:
+ std::shared_ptr> m_digits; // tester for one group of digits
+ std::shared_ptr> m_separator; // tester for the thousand separator
+ };
+
+ using integer10ts = basic_integer10ts;
+ using winteger10ts = basic_integer10ts;
+#ifdef _UNICODE // tinteger10ts is winteger10ts when _UNICODE is defined...
+ using tinteger10ts = winteger10ts;
+#else // ...and integer10ts otherwise
+ using tinteger10ts = integer10ts;
+#endif
+ using sgml_integer10ts = basic_integer10ts;
+
+ /// Test for hexadecimal integer: consumes consecutive digits beginning at start and accumulates them into value in base 16.
+ /// Each of the digit values 0..15 is recognized by its own tester (m_digit_0 .. m_digit_15); at least one digit is required.
+ /// NOTE(review): value is a size_t and the accumulation is not checked for overflow.
+ template
+ class basic_integer16 : public basic_integer
+ {
+ public:
+ basic_integer16( // Stores one tester per hexadecimal digit value; match() dereferences all of them without a null check
+ _In_ const std::shared_ptr>& digit_0,
+ _In_ const std::shared_ptr>& digit_1,
+ _In_ const std::shared_ptr>& digit_2,
+ _In_ const std::shared_ptr>& digit_3,
+ _In_ const std::shared_ptr>& digit_4,
+ _In_ const std::shared_ptr>& digit_5,
+ _In_ const std::shared_ptr>& digit_6,
+ _In_ const std::shared_ptr>& digit_7,
+ _In_ const std::shared_ptr>& digit_8,
+ _In_ const std::shared_ptr>& digit_9,
+ _In_ const std::shared_ptr>& digit_10,
+ _In_ const std::shared_ptr>& digit_11,
+ _In_ const std::shared_ptr>& digit_12,
+ _In_ const std::shared_ptr>& digit_13,
+ _In_ const std::shared_ptr>& digit_14,
+ _In_ const std::shared_ptr>& digit_15,
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to basic_integer
+ basic_integer(locale),
+ m_digit_0(digit_0),
+ m_digit_1(digit_1),
+ m_digit_2(digit_2),
+ m_digit_3(digit_3),
+ m_digit_4(digit_4),
+ m_digit_5(digit_5),
+ m_digit_6(digit_6),
+ m_digit_7(digit_7),
+ m_digit_8(digit_8),
+ m_digit_9(digit_9),
+ m_digit_10(digit_10),
+ m_digit_11(digit_11),
+ m_digit_12(digit_12),
+ m_digit_13(digit_13),
+ m_digit_14(digit_14),
+ m_digit_15(digit_15)
+ {}
+
+ virtual bool match( // Reads as many digits as possible beginning at start
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset of the first digit
+ _In_ size_t end = (size_t)-1, // offset at which reading stops at the latest
+ _In_ int flags = match_default) // flags handed unchanged to the digit testers
+ { // Returns true when at least one digit was read; value then holds the number
+ assert(text || start >= end);
+ for (interval.end = start, value = 0; interval.end < end && text[interval.end];) { // stops at end or at a zero character; interval.end is advanced in the body
+ size_t dig;
+ if (m_digit_0->match(text, interval.end, end, flags)) { dig = 0; interval.end = m_digit_0->interval.end; }
+ else if (m_digit_1->match(text, interval.end, end, flags)) { dig = 1; interval.end = m_digit_1->interval.end; }
+ else if (m_digit_2->match(text, interval.end, end, flags)) { dig = 2; interval.end = m_digit_2->interval.end; }
+ else if (m_digit_3->match(text, interval.end, end, flags)) { dig = 3; interval.end = m_digit_3->interval.end; }
+ else if (m_digit_4->match(text, interval.end, end, flags)) { dig = 4; interval.end = m_digit_4->interval.end; }
+ else if (m_digit_5->match(text, interval.end, end, flags)) { dig = 5; interval.end = m_digit_5->interval.end; }
+ else if (m_digit_6->match(text, interval.end, end, flags)) { dig = 6; interval.end = m_digit_6->interval.end; }
+ else if (m_digit_7->match(text, interval.end, end, flags)) { dig = 7; interval.end = m_digit_7->interval.end; }
+ else if (m_digit_8->match(text, interval.end, end, flags)) { dig = 8; interval.end = m_digit_8->interval.end; }
+ else if (m_digit_9->match(text, interval.end, end, flags)) { dig = 9; interval.end = m_digit_9->interval.end; }
+ else if (m_digit_10->match(text, interval.end, end, flags)) { dig = 10; interval.end = m_digit_10->interval.end; }
+ else if (m_digit_11->match(text, interval.end, end, flags)) { dig = 11; interval.end = m_digit_11->interval.end; }
+ else if (m_digit_12->match(text, interval.end, end, flags)) { dig = 12; interval.end = m_digit_12->interval.end; }
+ else if (m_digit_13->match(text, interval.end, end, flags)) { dig = 13; interval.end = m_digit_13->interval.end; }
+ else if (m_digit_14->match(text, interval.end, end, flags)) { dig = 14; interval.end = m_digit_14->interval.end; }
+ else if (m_digit_15->match(text, interval.end, end, flags)) { dig = 15; interval.end = m_digit_15->interval.end; }
+ else break; // no digit at this position: the number ends here
+ value = value * 16 + dig; // append the digit in base 16
+ }
+ if (start < interval.end) { // at least one digit was consumed
+ interval.start = start;
+ return true;
+ }
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ protected:
+ std::shared_ptr> // testers for the individual digits, named after the digit value they stand for
+ m_digit_0,
+ m_digit_1,
+ m_digit_2,
+ m_digit_3,
+ m_digit_4,
+ m_digit_5,
+ m_digit_6,
+ m_digit_7,
+ m_digit_8,
+ m_digit_9,
+ m_digit_10,
+ m_digit_11,
+ m_digit_12,
+ m_digit_13,
+ m_digit_14,
+ m_digit_15;
+ };
+
+ using integer16 = basic_integer16;
+ using winteger16 = basic_integer16;
+#ifdef _UNICODE // tinteger16 is winteger16 when _UNICODE is defined...
+ using tinteger16 = winteger16;
+#else // ...and integer16 otherwise
+ using tinteger16 = integer16;
+#endif
+ using sgml_integer16 = basic_integer16;
+
+ /// Test for Roman numeral: reads digits worth 1, 5, 10, 50, 100, 500, 1000, 5000 and 10000, each recognized by its own tester,
+ /// and accumulates the numeral into value, handling subtractive pairs (a smaller digit right in front of a larger one).
+ /// A digit tester may be null, in which case that digit is never recognized. The match succeeds when value ends up non-zero.
+ template
+ class basic_roman_numeral : public basic_integer
+ {
+ public:
+ basic_roman_numeral( // Stores one tester per Roman digit; the parameter names give the digit's value, and null testers are tolerated by match()
+ _In_ const std::shared_ptr>& digit_1,
+ _In_ const std::shared_ptr>& digit_5,
+ _In_ const std::shared_ptr>& digit_10,
+ _In_ const std::shared_ptr>& digit_50,
+ _In_ const std::shared_ptr>& digit_100,
+ _In_ const std::shared_ptr>& digit_500,
+ _In_ const std::shared_ptr>& digit_1000,
+ _In_ const std::shared_ptr>& digit_5000,
+ _In_ const std::shared_ptr>& digit_10000,
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to basic_integer
+ basic_integer(locale),
+ m_digit_1(digit_1),
+ m_digit_5(digit_5),
+ m_digit_10(digit_10),
+ m_digit_50(digit_50),
+ m_digit_100(digit_100),
+ m_digit_500(digit_500),
+ m_digit_1000(digit_1000),
+ m_digit_5000(digit_5000),
+ m_digit_10000(digit_10000)
+ {}
+
+ virtual bool match( // Reads Roman digits beginning at start for as long as they form an acceptable numeral
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset of the first digit
+ _In_ size_t end = (size_t)-1, // offset at which reading stops at the latest
+ _In_ int flags = match_default) // flags handed unchanged to the digit testers
+ { // Returns true when the accepted digits add up to a non-zero value
+ assert(text || start >= end);
+ size_t
+ dig[5] = { (size_t)-1, (size_t)-1, (size_t)-1, (size_t)-1, (size_t)-1 }, // dig[0] = current digit, dig[1..3] = the three digits before it (newest first), dig[4] = first digit of the numeral; (size_t)-1 = not set
+ end2; // end offset of the digit recognized in the current iteration
+
+ for (interval.end = start, value = 0; interval.end < end && text[interval.end]; dig[3] = dig[2], dig[2] = dig[1], dig[1] = dig[0], interval.end = end2) { // after every accepted digit: shift the history and move interval.end behind the digit
+ if (m_digit_1 && m_digit_1->match(text, interval.end, end, flags)) { dig[0] = 1; end2 = m_digit_1->interval.end; }
+ else if (m_digit_5 && m_digit_5->match(text, interval.end, end, flags)) { dig[0] = 5; end2 = m_digit_5->interval.end; }
+ else if (m_digit_10 && m_digit_10->match(text, interval.end, end, flags)) { dig[0] = 10; end2 = m_digit_10->interval.end; }
+ else if (m_digit_50 && m_digit_50->match(text, interval.end, end, flags)) { dig[0] = 50; end2 = m_digit_50->interval.end; }
+ else if (m_digit_100 && m_digit_100->match(text, interval.end, end, flags)) { dig[0] = 100; end2 = m_digit_100->interval.end; }
+ else if (m_digit_500 && m_digit_500->match(text, interval.end, end, flags)) { dig[0] = 500; end2 = m_digit_500->interval.end; }
+ else if (m_digit_1000 && m_digit_1000->match(text, interval.end, end, flags)) { dig[0] = 1000; end2 = m_digit_1000->interval.end; }
+ else if (m_digit_5000 && m_digit_5000->match(text, interval.end, end, flags)) { dig[0] = 5000; end2 = m_digit_5000->interval.end; }
+ else if (m_digit_10000 && m_digit_10000->match(text, interval.end, end, flags)) { dig[0] = 10000; end2 = m_digit_10000->interval.end; }
+ else break; // no digit at this position: the numeral ends here
+
+ // Store first digit.
+ if (dig[4] == (size_t)-1) dig[4] = dig[0];
+
+ if (dig[3] == dig[2] && dig[2] == dig[1] && dig[1] == dig[0] && dig[0] != dig[4]) {
+ // Same digit repeated four times. No-go, unless first digit. E.g. XIIII vs. XIV. MMMMMCD allowed, IIII also...
+ break;
+ }
+ if (dig[0] <= dig[1]) { // also taken for the very first digit, because dig[1] is still (size_t)-1 then
+ // Digit is less or equal previous one: add.
+ value += dig[0];
+ }
+ else if ( // && binds tighter than ||: each line below lists the larger digits that may follow one particular smaller digit
+ dig[1] == 1 && (dig[0] == 5 || dig[0] == 10) ||
+ dig[1] == 10 && (dig[0] == 50 || dig[0] == 100) ||
+ dig[1] == 100 && (dig[0] == 500 || dig[0] == 1000) ||
+ dig[1] == 1000 && (dig[0] == 5000 || dig[0] == 10000))
+ {
+ // Digit is up to two orders bigger than previous one: subtract. But...
+ if (dig[2] < dig[0]) { // never true while dig[2] is still (size_t)-1, i.e. when there is no pre-previous digit
+ // Digit is also bigger than pre-previous one. E.g. VIX (V < X => invalid)
+ break;
+ }
+ value -= dig[1]; // Cancel addition in the previous step.
+ dig[0] -= dig[1]; // Combine last two digits.
+ dig[1] = dig[2]; // The true previous digit is now pre-previous one. :)
+ dig[2] = dig[3]; // The true pre-previous digit is now pre-pre-previous one. :)
+ value += dig[0]; // Add combined value.
+ }
+ else {
+ // New digit is too big than the previous one. E.g. VX (V < X => invalid)
+ break;
+ }
+ }
+ if (value) { // non-zero: at least one digit was accepted
+ interval.start = start; // interval.end already points behind the last accepted digit (a break skips the loop's step expression)
+ return true;
+ }
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ protected:
+ std::shared_ptr> // testers for the individual digits, named after the digit value they stand for; any of them may be null
+ m_digit_1,
+ m_digit_5,
+ m_digit_10,
+ m_digit_50,
+ m_digit_100,
+ m_digit_500,
+ m_digit_1000,
+ m_digit_5000,
+ m_digit_10000;
+ };
+
+ using roman_numeral = basic_roman_numeral;
+ using wroman_numeral = basic_roman_numeral;
+#ifdef _UNICODE // troman_numeral is wroman_numeral when _UNICODE is defined...
+ using troman_numeral = wroman_numeral;
+#else // ...and roman_numeral otherwise
+ using troman_numeral = roman_numeral;
+#endif
+ using sgml_roman_numeral = basic_roman_numeral;
+
+ /// Test for fraction: numerator, fraction line and denominator, matched one right after another beginning at start.
+ /// The three testers are public members, so their individual results can be read after a match.
+ /// When the fraction does not match, all three are invalidated - including those that did match.
+ template
+ class basic_fraction : public basic_tester
+ {
+ public:
+ basic_fraction( // Stores the three part testers; match() and invalidate() dereference all of them without a null check
+ _In_ const std::shared_ptr>& _numerator, // tester for the part in front of the fraction line
+ _In_ const std::shared_ptr>& _fraction_line, // tester for the fraction line itself
+ _In_ const std::shared_ptr>& _denominator, // tester for the part after the fraction line
+ _In_ const std::locale& locale = std::locale()) : // locale forwarded to basic_tester
+ basic_tester(locale),
+ numerator(_numerator),
+ fraction_line(_fraction_line),
+ denominator(_denominator)
+ {}
+
+ virtual bool match( // Tests for numerator, fraction line and denominator in a row
+ _In_reads_or_z_(end) const T* text, // text to test; may be null only when start >= end (see the assert)
+ _In_ size_t start = 0, // offset where the numerator must begin
+ _In_ size_t end = (size_t)-1, // end offset handed to all three testers
+ _In_ int flags = match_default) // flags handed unchanged to all three testers
+ { // Returns true with interval running from start to the end of the denominator; false otherwise
+ assert(text || start >= end);
+ if (numerator->match(text, start, end, flags) &&
+ fraction_line->match(text, numerator->interval.end, end, flags) && // each part starts where the previous one ended
+ denominator->match(text, fraction_line->interval.end, end, flags))
+ {
+ interval.start = start;
+ interval.end = denominator->interval.end;
+ return true;
+ }
+ numerator->invalidate(); // failure: drop partial results, e.g. a numerator that matched without a fraction line after it
+ fraction_line->invalidate();
+ denominator->invalidate();
+ interval.start = (interval.end = start) + 1; // failure: interval gets end = start and start = start + 1 (start > end)
+ return false;
+ }
+
+ virtual void invalidate() // Invalidates the three parts, then delegates to basic_tester::invalidate()
+ {
+ numerator->invalidate();
+ fraction_line->invalidate();
+ denominator->invalidate();
+ basic_tester::invalidate();
+ }
+
+ public:
+ std::shared_ptr> numerator; ///< Tester for the numerator
+ std::shared_ptr> fraction_line; ///< Tester for the fraction line
+ std::shared_ptr> denominator; ///< Tester for the denominator
+ };
+
+ using fraction = basic_fraction;
+ using wfraction = basic_fraction;
+#ifdef _UNICODE // tfraction is wfraction when _UNICODE is defined...
+ using tfraction = wfraction;
+#else // ...and fraction otherwise
+ using tfraction = fraction;
+#endif
+ using sgml_fraction = basic_fraction