From f41564a3e2ecdf88e6ea4ca2d309c1288173a3b7 Mon Sep 17 00:00:00 2001
From: Vadim Zeitlin
Date: Fri, 18 Jun 2021 23:56:09 +0200
Subject: [PATCH] Add wxRegEx::ConvertFromBasic() helper

This will be used to implement support for BREs using PCRE which doesn't
support them directly in the upcoming commits.
---
 include/wx/regex.h                |   3 +
 interface/wx/regex.h              |  10 ++
 misc/suppressions/codespell-lines |   3 +
 src/common/regex.cpp              | 290 ++++++++++++++++++++++++++++++
 tests/regex/wxregextest.cpp       |  16 ++
 5 files changed, 322 insertions(+)

diff --git a/include/wx/regex.h b/include/wx/regex.h
index ac70ff3907..dc3b34c4a5 100644
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -144,6 +144,9 @@ public:
 
     static wxString QuoteMeta(const wxString& str);
 
+    // return the extended RE corresponding to the given basic RE
+    static wxString ConvertFromBasic(const wxString& bre);
+
     // dtor not virtual, don't derive from this class
     ~wxRegEx();
 
diff --git a/interface/wx/regex.h b/interface/wx/regex.h
index 0a6f24558c..cc3dc92acf 100644
--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -276,5 +276,15 @@ public:
         @since 3.1.3
     */
     static wxString QuoteMeta(const wxString& str);
+
+    /**
+        Converts a basic regular expression to an extended regex syntax.
+
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+        syntax to default (extended) syntax.
+
+        @since 3.1.6
+     */
+    static wxString ConvertFromBasic(const wxString& bre);
 };
 
diff --git a/misc/suppressions/codespell-lines b/misc/suppressions/codespell-lines
index 44401db6a6..3e1c2e8461 100644
--- a/misc/suppressions/codespell-lines
+++ b/misc/suppressions/codespell-lines
@@ -24,3 +24,6 @@ expressions (BRE). EREs are roughly those of the traditional @e egrep,
 // 2019), i.e. SEH translator seems to work just fine without /EHa too, so
 // Purpose: helpers for the structured exception handling (SEH) under Win32
 * MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
+    static wxString ConvertFromBasic(const wxString& bre);
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+    static wxString ConvertFromBasic(const wxString& bre);
diff --git a/src/common/regex.cpp b/src/common/regex.cpp
index 38089f9835..ed86b31c0c 100644
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -258,6 +258,296 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
     return szError;
 }
 
+// Helper function for processing bracket expressions inside a regex.
+//
+// Advance the iterator until the closing bracket matching the opening one the
+// iterator currently points to, i.e.:
+//
+//  Precondition: *it == '['
+//  Postcondition: *it == ']' or it == end if failed to find matching ']'
+static
+wxString::const_iterator
+SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
+{
+    wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );
+
+    // Initial ']', possibly after the preceding '^', is different because it
+    // stands for a literal ']' and not the end of the bracket expression, so
+    // check for it first.
+    ++it;
+    if ( it != end && *it == '^' )
+        ++it;
+    if ( it != end && *it == ']' )
+        ++it;
+
+    // Any ']' from now on ends the bracket expression.
+    for ( ; it != end; ++it )
+    {
+        const wxUniChar c = *it;
+
+        if ( c == ']' )
+            break;
+
+        if ( c != '[' )
+            continue;
+
+        // Bare '[' on its own is not special, but collating elements and
+        // character classes are, so check for them and advance past them
+        // if necessary to avoid misinterpreting the matching closing ']'.
+        //
+        // Only peek at the next character here: if it doesn't start one of
+        // those constructs, it must still be examined by the next iteration
+        // as it can be the ']' ending the bracket expression, as in "[[]".
+        wxString::const_iterator next = it;
+        if ( ++next == end )
+            return end;
+
+        const wxUniChar delim = *next;
+        if ( delim != ':' && delim != '.' && delim != '=' )
+            continue;
+
+        // Find the matching "delim]" sequence, again taking care to not
+        // consume the character following a delimiter not followed by ']'
+        // as it can be a delimiter itself, as in "[[...]]".
+        it = next;
+        for ( ;; )
+        {
+            if ( ++it == end )
+                return end;
+
+            if ( *it != delim )
+                continue;
+
+            next = it;
+            if ( ++next == end )
+                return end;
+
+            if ( *next == ']' )
+            {
+                // Leave the iterator at the ']' closing this element, it
+                // will be skipped by the outer loop increment.
+                it = next;
+                break;
+            }
+        }
+    }
+
+    return it;
+}
+
+/* static */
+wxString wxRegEx::ConvertFromBasic(const wxString& bre)
+{
+    /*
+        Quoting regex(7):
+
+        Obsolete ("basic") regular expressions differ in several respects.
+        '|', '+', and '?' are ordinary characters and there is no equivalent
+        for their functionality. The delimiters for bounds are "\{" and "\}",
+        with '{' and '}' by themselves ordinary characters. The parentheses
+        for nested subexpressions are "\(" and "\)", with '(' and ')' by
+        themselves ordinary characters. '^' is an ordinary character except at
+        the beginning of the RE or(!) the beginning of a parenthesized
+        subexpression, '$' is an ordinary character except at the end of the RE
+        or(!) the end of a parenthesized subexpression, and '*' is an ordinary
+        character if it appears at the beginning of the RE or the beginning of
+        a parenthesized subexpression (after a possible leading '^').
+
+        Finally, there is one new type of atom, a back reference: '\' followed
+        by a nonzero decimal digit d matches the same sequence of characters
+        matched by the dth parenthesized subexpression [...]
+     */
+    wxString ere;
+    ere.reserve(bre.length());
+
+    enum SinceStart
+    {
+        SinceStart_None,        // Just at the beginning.
+        SinceStart_OnlyCaret,   // Had just "^" since the beginning.
+        SinceStart_Some         // Had something else since the beginning.
+    };
+
+    struct State
+    {
+        explicit State(SinceStart sinceStart_)
+        {
+            isBackslash = false;
+            sinceStart = sinceStart_;
+        }
+
+        bool isBackslash;
+        SinceStart sinceStart;
+    };
+
+    State previous(SinceStart_None);
+    for ( wxString::const_iterator it = bre.begin(),
+                                  end = bre.end();
+          it != end;
+          ++it )
+    {
+        const wxUniChar c = *it;
+
+        // What should be done with the current character?
+        enum Disposition
+        {
+            Disposition_Skip,   // Nothing.
+            Disposition_Append, // Append to output.
+            Disposition_Escape  // ... after escaping it with backslash.
+        } disposition = Disposition_Append;
+
+        State current(SinceStart_Some);
+
+        if ( previous.isBackslash )
+        {
+            // By default, keep the backslash present in the BRE, it's still
+            // needed in the ERE too.
+            disposition = Disposition_Escape;
+
+            switch ( c.GetValue() )
+            {
+                case '(':
+                    // It's the start of a new subexpression.
+                    current.sinceStart = SinceStart_None;
+                    wxFALLTHROUGH;
+
+                case ')':
+                case '{':
+                case '}':
+                    // Do not escape to ensure they remain special in the ERE
+                    // as the escaped versions were special in the BRE.
+                    disposition = Disposition_Append;
+                    break;
+            }
+        }
+        else // This character is not escaped.
+        {
+            switch ( c.GetValue() )
+            {
+                case '\\':
+                    current.isBackslash = true;
+
+                    // Don't do anything with it yet, we'll deal with it later.
+                    disposition = Disposition_Skip;
+                    break;
+
+                case '^':
+                    // Escape unless it appears at the start.
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                            // Don't escape, but do update the state.
+                            current.sinceStart = SinceStart_OnlyCaret;
+                            break;
+
+                        case SinceStart_OnlyCaret:
+                        case SinceStart_Some:
+                            disposition = Disposition_Escape;
+                            break;
+                    }
+                    break;
+
+                case '*':
+                    // Escape unless it appears at the start or right after "^".
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                        case SinceStart_OnlyCaret:
+                            disposition = Disposition_Escape;
+                            break;
+
+                        case SinceStart_Some:
+                            break;
+                    }
+                    break;
+
+                case '$':
+                    // Escape unless it appears at the end or just before "\)".
+                    disposition = Disposition_Escape;
+                    {
+                        wxString::const_iterator next = it;
+                        ++next;
+                        if ( next == end )
+                        {
+                            // It is at the end, so has special meaning.
+                            disposition = Disposition_Append;
+                        }
+                        else // Not at the end, but maybe at subexpression end?
+                        {
+                            if ( *next == '\\' )
+                            {
+                                ++next;
+                                if ( next != end && *next == ')' )
+                                    disposition = Disposition_Append;
+                            }
+                        }
+                    }
+                    break;
+
+                case '|':
+                case '+':
+                case '?':
+                case '(':
+                case ')':
+                case '{':
+                case '}':
+                    // Escape these characters which are not special in a BRE,
+                    // but would be special in a ERE if left unescaped.
+                    disposition = Disposition_Escape;
+                    break;
+
+                case '[':
+                    // Rules are very different for the characters inside the
+                    // bracket expressions and we don't have to change anything
+                    // for them as the syntax is the same for BREs and EREs, so
+                    // just process the entire expression at once.
+                    {
+                        const wxString::const_iterator start = it;
+                        it = SkipBracketExpression(it, end);
+
+                        // Copy everything inside without any changes.
+                        ere += wxString(start, it);
+
+                        if ( it == end )
+                        {
+                            // If we reached the end without finding the
+                            // matching ']' there is nothing remaining anyhow.
+                            return ere;
+                        }
+
+                        // Note that default Disposition_Append here is fine,
+                        // we'll append the closing ']' to "ere" below.
+                    }
+                    break;
+            }
+        }
+
+        switch ( disposition )
+        {
+            case Disposition_Skip:
+                break;
+
+            case Disposition_Escape:
+                ere += '\\';
+                wxFALLTHROUGH;
+
+            case Disposition_Append:
+                // Note: don't use "c" here, iterator may have been advanced
+                // inside the loop.
+                ere += *it;
+                break;
+        }
+
+        previous = current;
+    }
+
+    // It's an error if a RE ends with a backslash, but we still need to
+    // preserve this error in the resulting RE.
+    if ( previous.isBackslash )
+        ere += '\\';
+
+    return ere;
+}
+
 bool wxRegExImpl::Compile(const wxString& expr, int flags)
 {
     Reinit();
diff --git a/tests/regex/wxregextest.cpp b/tests/regex/wxregextest.cpp
index eaaefe2a5d..3d9780d12d 100644
--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -170,4 +170,20 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
     CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
 }
 
+TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
+{
+    CHECK( wxRegEx::ConvertFromBasic("\\(a\\)b") == "(a)b" );
+    CHECK( wxRegEx::ConvertFromBasic("a\\{0,1\\}b") == "a{0,1}b" );
+    CHECK( wxRegEx::ConvertFromBasic("*") == "\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("**") == "\\**" );
+    CHECK( wxRegEx::ConvertFromBasic("^*") == "^\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("^^") == "^\\^" );
+    CHECK( wxRegEx::ConvertFromBasic("x$y") == "x\\$y" );
+    CHECK( wxRegEx::ConvertFromBasic("$$") == "\\$$" );
+    CHECK( wxRegEx::ConvertFromBasic("\\(x$\\)") == "(x$)" );
+    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
+    CHECK( wxRegEx::ConvertFromBasic("[[]+") == "[[]\\+" );
+    CHECK( wxRegEx::ConvertFromBasic("[[...]]+") == "[[...]]\\+" );
+}
+
 #endif // wxUSE_REGEX