From f41564a3e2ecdf88e6ea4ca2d309c1288173a3b7 Mon Sep 17 00:00:00 2001
From: Vadim Zeitlin <vadim@wxwidgets.org>
Date: Fri, 18 Jun 2021 23:56:09 +0200
Subject: [PATCH] Add wxRegEx::ConvertFromBasic() helper

This will be used to implement support for BREs using PCRE which doesn't
support them directly in the upcoming commits.
---
 include/wx/regex.h                |   3 +
 interface/wx/regex.h              |  10 ++
 misc/suppressions/codespell-lines |   3 +
 src/common/regex.cpp              | 275 ++++++++++++++++++++++++++++++
 tests/regex/wxregextest.cpp       |  14 ++
 5 files changed, 305 insertions(+)
diff --git a/include/wx/regex.h b/include/wx/regex.h
index ac70ff3907..dc3b34c4a5 100644
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -144,6 +144,9 @@ public:
 
     static wxString QuoteMeta(const wxString& str);
 
+    // return the extended RE corresponding to the given basic RE
+    static wxString ConvertFromBasic(const wxString& bre);
+
     // dtor not virtual, don't derive from this class
     ~wxRegEx();
 
diff --git a/interface/wx/regex.h b/interface/wx/regex.h
index 0a6f24558c..cc3dc92acf 100644
--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -276,5 +276,15 @@ public:
         @since 3.1.3
     */
     static wxString QuoteMeta(const wxString& str);
+
+    /**
+        Converts a basic regular expression to an extended regex syntax.
+
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+        syntax to default (extended) syntax.
+
+        @since 3.1.6
+     */
+    static wxString ConvertFromBasic(const wxString& bre);
 };
 
diff --git a/misc/suppressions/codespell-lines b/misc/suppressions/codespell-lines
index 44401db6a6..3e1c2e8461 100644
--- a/misc/suppressions/codespell-lines
+++ b/misc/suppressions/codespell-lines
@@ -24,3 +24,6 @@ expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
     // 2019), i.e. SEH translator seems to work just fine without /EHa too, so
 // Purpose:     helpers for the structured exception handling (SEH) under Win32
 * MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
+    static wxString ConvertFromBasic(const wxString& bre);
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+    static wxString ConvertFromBasic(const wxString& bre);
diff --git a/src/common/regex.cpp b/src/common/regex.cpp
index 38089f9835..ed86b31c0c 100644
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -258,6 +258,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
     return szError;
 }
 
+// Helper function for processing bracket expressions inside a regex.
+//
+// Advance the iterator until the closing bracket matching the opening one the
+// iterator currently points to, i.e.:
+//
+// Precondition: *it == '['
+// Postcondition: *it == ']' or it == end if failed to find matching ']'
+static
+wxString::const_iterator
+SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
+{
+    wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );
+
+    // Initial ']', possibly after the preceding '^', is different because it
+    // stands for a literal ']' and not the end of the bracket expression, so
+    // check for it first.
+    ++it;
+    if ( it != end && *it == '^' )
+        ++it;
+    if ( it != end && *it == ']' )
+        ++it;
+
+    // Any ']' from now on ends the bracket expression.
+    for ( ; it != end; ++it )
+    {
+        const wxUniChar c = *it;
+
+        if ( c == ']' )
+            break;
+
+        if ( c == '[' )
+        {
+            // Bare '[' on its own is not special, but collating elements and
+            // character classes are, so check for them and advance past them
+            // if necessary to avoid misinterpreting the matching closing ']'.
+            if ( ++it == end )
+                break;
+
+            const wxUniChar c = *it;
+            if ( c == ':' || c == '.' || c == '=' )
+            {
+                for ( ++it; it != end; ++it )
+                {
+                    if ( *it == c )
+                    {
+                        if ( ++it == end )
+                            break;
+
+                        if ( *it == ']' )
+                            break;
+                    }
+                }
+
+                if ( it == end )
+                    break;
+            }
+        }
+    }
+
+    return it;
+}
+
+/* static */
+wxString wxRegEx::ConvertFromBasic(const wxString& bre)
+{
+    /*
+        Quoting regex(7):
+
+        Obsolete ("basic") regular expressions differ in several respects.
+        '|', '+', and '?' are ordinary characters and there is no equivalent
+        for their functionality. The delimiters for bounds are "\{" and "\}",
+        with '{' and '}' by themselves ordinary characters. The parentheses
+        for nested subexpressions are "\(" and "\)", with '(' and ')' by
+        themselves ordinary characters. '^' is an ordinary character except at
+        the beginning of the RE or(!) the beginning of a parenthesized
+        subexpression, '$' is an ordinary character except at the end of the RE
+        or(!) the end of a parenthesized subexpression, and '*' is an ordinary
+        character if it appears at the beginning of the RE or the beginning of
+        a parenthesized subexpression (after a possible leading '^').
+
+        Finally, there is one new type of atom, a back reference: '\' followed
+        by a nonzero decimal digit d matches the same sequence of characters
+        matched by the dth parenthesized subexpression [...]
+     */
+    wxString ere;
+    ere.reserve(bre.length());
+
+    enum SinceStart
+    {
+        SinceStart_None,        // Just at the beginning.
+        SinceStart_OnlyCaret,   // Had just "^" since the beginning.
+        SinceStart_Some         // Had something else since the beginning.
+    };
+
+    struct State
+    {
+        explicit State(SinceStart sinceStart_)
+        {
+            isBackslash = false;
+            sinceStart = sinceStart_;
+        }
+
+        bool isBackslash;
+        SinceStart sinceStart;
+    };
+
+    State previous(SinceStart_None);
+    for ( wxString::const_iterator it = bre.begin(),
+                                  end = bre.end();
+          it != end;
+          ++it )
+    {
+        const wxUniChar c = *it;
+
+        // What should be done with the current character?
+        enum Disposition
+        {
+            Disposition_Skip,   // Nothing.
+            Disposition_Append, // Append to output.
+            Disposition_Escape  // ... after escaping it with backslash.
+        } disposition = Disposition_Append;
+
+        State current(SinceStart_Some);
+
+        if ( previous.isBackslash )
+        {
+            // By default, keep the backslash present in the BRE, it's still
+            // needed in the ERE too.
+            disposition = Disposition_Escape;
+
+            switch ( c.GetValue() )
+            {
+                case '(':
+                    // It's the start of a new subexpression.
+                    current.sinceStart = SinceStart_None;
+                    wxFALLTHROUGH;
+
+                case ')':
+                case '{':
+                case '}':
+                    // Do not escape to ensure they remain special in the ERE
+                    // as the escaped versions were special in the BRE.
+                    disposition = Disposition_Append;
+                    break;
+            }
+        }
+        else // This character is not escaped.
+        {
+            switch ( c.GetValue() )
+            {
+                case '\\':
+                    current.isBackslash = true;
+
+                    // Don't do anything with it yet, we'll deal with it later.
+                    disposition = Disposition_Skip;
+                    break;
+
+                case '^':
+                    // Escape unless it appears at the start.
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                            // Don't escape, but do update the state.
+                            current.sinceStart = SinceStart_OnlyCaret;
+                            break;
+
+                        case SinceStart_OnlyCaret:
+                        case SinceStart_Some:
+                            disposition = Disposition_Escape;
+                            break;
+                    }
+                    break;
+
+                case '*':
+                    // Escape unless it appears at the start or right after "^".
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                        case SinceStart_OnlyCaret:
+                            disposition = Disposition_Escape;
+                            break;
+
+                        case SinceStart_Some:
+                            break;
+                    }
+                    break;
+
+                case '$':
+                    // Escape unless it appears at the end or just before "\)".
+                    disposition = Disposition_Escape;
+                    {
+                        wxString::const_iterator next = it;
+                        ++next;
+                        if ( next == end )
+                        {
+                            // It is at the end, so has special meaning.
+                            disposition = Disposition_Append;
+                        }
+                        else // Not at the end, but maybe at subexpression end?
+                        {
+                            if ( *next == '\\' )
+                            {
+                                ++next;
+                                if ( next != end && *next == ')' )
+                                    disposition = Disposition_Append;
+                            }
+                        }
+                    }
+                    break;
+
+                case '|':
+                case '+':
+                case '?':
+                case '(':
+                case ')':
+                case '{':
+                case '}':
+                    // Escape these characters which are not special in a BRE,
+                    // but would be special in a ERE if left unescaped.
+                    disposition = Disposition_Escape;
+                    break;
+
+                case '[':
+                    // Rules are very different for the characters inside the
+                    // bracket expressions and we don't have to change anything
+                    // for them as the syntax is the same for BREs and EREs, so
+                    // just process the entire expression at once.
+                    {
+                        const wxString::const_iterator start = it;
+                        it = SkipBracketExpression(it, end);
+
+                        // Copy everything inside without any changes.
+                        ere += wxString(start, it);
+
+                        if ( it == end )
+                        {
+                            // If we reached the end without finding the
+                            // matching ']' there is nothing remaining anyhow.
+                            return ere;
+                        }
+
+                        // Note that default Disposition_Append here is fine,
+                        // we'll append the closing ']' to "ere" below.
+                    }
+                    break;
+            }
+        }
+
+        switch ( disposition )
+        {
+            case Disposition_Skip:
+                break;
+
+            case Disposition_Escape:
+                ere += '\\';
+                wxFALLTHROUGH;
+
+            case Disposition_Append:
+                // Note: don't use "c" here, iterator may have been advanced
+                // inside the loop.
+                ere += *it;
+                break;
+        }
+
+        previous = current;
+    }
+
+    // It's an error if a RE ends with a backslash, but we still need to
+    // preserve this error in the resulting RE.
+    if ( previous.isBackslash )
+        ere += '\\';
+
+    return ere;
+}
+
 bool wxRegExImpl::Compile(const wxString& expr, int flags)
 {
     Reinit();
diff --git a/tests/regex/wxregextest.cpp b/tests/regex/wxregextest.cpp
index eaaefe2a5d..3d9780d12d 100644
--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -170,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
     CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
 }
 
+TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
+{
+    CHECK( wxRegEx::ConvertFromBasic("\\(a\\)b") == "(a)b" );
+    CHECK( wxRegEx::ConvertFromBasic("a\\{0,1\\}b") == "a{0,1}b" );
+    CHECK( wxRegEx::ConvertFromBasic("*") == "\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("**") == "\\**" );
+    CHECK( wxRegEx::ConvertFromBasic("^*") == "^\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("^^") == "^\\^" );
+    CHECK( wxRegEx::ConvertFromBasic("x$y") == "x\\$y" );
+    CHECK( wxRegEx::ConvertFromBasic("$$") == "\\$$" );
+    CHECK( wxRegEx::ConvertFromBasic("\\(x$\\)") == "(x$)" );
+    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
+}
+
 #endif // wxUSE_REGEX