Add wxRegEx::ConvertFromBasic() helper

This will be used in the upcoming commits to implement support for BREs using PCRE, which doesn't support them directly.
2021-06-18 23:56:09 +02:00
parent 020b5f7383
commit f41564a3e2
5 changed files with 305 additions and 0 deletions
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -144,6 +144,9 @@ public:
    static wxString QuoteMeta(const wxString& str);
    // return the extended RE corresponding to the given basic RE, i.e. one
    // written in the deprecated wxRE_BASIC syntax
    static wxString ConvertFromBasic(const wxString& bre);
    // dtor not virtual, don't derive from this class
    ~wxRegEx();
--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -276,5 +276,15 @@ public:
        @since 3.1.3
    */
    static wxString QuoteMeta(const wxString& str);
    /**
        Converts a basic regular expression to the extended regex syntax.
        This function can be used to convert @a bre using deprecated wxRE_BASIC
        syntax to default (extended) syntax.

        @return The equivalent extended regular expression.

        @since 3.1.6
     */
    static wxString ConvertFromBasic(const wxString& bre);
 };
--- a/misc/suppressions/codespell-lines
+++ b/misc/suppressions/codespell-lines
@@ -24,3 +24,6 @@ expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
    // 2019), i.e. SEH translator seems to work just fine without /EHa too, so
 // Purpose:     helpers for the structured exception handling (SEH) under Win32
 * MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
    static wxString ConvertFromBasic(const wxString& bre);
        This function can be used to convert @a bre using deprecated wxRE_BASIC
    static wxString ConvertFromBasic(const wxString& bre);
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -258,6 +258,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
    return szError;
 }
 // Helper function for processing bracket expressions inside a regex.
 //
 // Advance the iterator until the closing bracket matching the opening one the
 // iterator currently points to, i.e.:
 //
 // Precondition: *it == '['
 // Postcondition: *it == ']' or it == end if failed to find matching ']'
 //
 // Inside a bracket expression '[' is an ordinary character unless it starts
 // a character class "[:...:]", a collating element "[. .]" or an equivalence
 // class "[=...=]". Those constructs are skipped as a whole so that the ']'
 // terminating them is not mistaken for the end of the bracket expression.
 static
 wxString::const_iterator
 SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
 {
    wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );

    // Initial ']', possibly after the preceding '^', is different because it
    // stands for a literal ']' and not the end of the bracket expression, so
    // check for it first.
    ++it;
    if ( it != end && *it == '^' )
        ++it;
    if ( it != end && *it == ']' )
        ++it;

    // Any ']' from now on ends the bracket expression.
    for ( ; it != end; ++it )
    {
        const wxUniChar c = *it;
        if ( c == ']' )
            break;

        if ( !(c == '[') )
            continue;

        // Only peek at the character following '[': it must not be consumed
        // here because, if it doesn't introduce a class or collating element,
        // it still has to be examined by the next iteration of this loop (it
        // could well be the closing ']', as in "[a[]").
        wxString::const_iterator next = it;
        if ( ++next == end )
            return end;

        const wxUniChar delim = *next;
        if ( !(delim == ':' || delim == '.' || delim == '=') )
        {
            // Bare '[' on its own is not special, just skip over it.
            continue;
        }

        // Find the terminating sequence, i.e. the same delimiter immediately
        // followed by ']'. A delimiter followed by anything else is part of
        // the contents and the character after it must still be checked, as
        // it may itself start the terminator (e.g. "[...]").
        for ( it = ++next; it != end; ++it )
        {
            if ( !(*it == delim) )
                continue;

            wxString::const_iterator after = it;
            if ( ++after == end )
                return end;

            if ( *after == ']' )
            {
                // Leave the iterator on the ']' closing the inner construct,
                // the outer loop increment will move past it.
                it = after;
                break;
            }
        }

        // Unterminated class or collating element: no matching ']' exists.
        if ( it == end )
            return end;
    }

    return it;
 }
 // Translate a basic regular expression (BRE) into the equivalent extended
 // one (ERE).
 //
 // The input is processed character by character, remembering whether the
 // previous character was an unescaped backslash and whether we are still at
 // the start of the RE (or of a "\(" subexpression). Bracket expressions are
 // copied verbatim. A trailing lone backslash and an unterminated bracket
 // expression are both preserved in the result so that the resulting RE is
 // still invalid.
 /* static */
 wxString wxRegEx::ConvertFromBasic(const wxString& bre)
 {
    /*
        Quoting regex(7):
        Obsolete ("basic") regular expressions differ in several respects.
        '|', '+', and '?' are ordinary characters and there is no equivalent
        for their functionality. The delimiters for bounds are "\{" and "\}",
        with '{' and '}' by themselves ordinary characters. The parentheses
        for nested subexpressions are "\(" and "\)", with '(' and ')' by
        themselves ordinary characters. '^' is an ordinary character except at
        the beginning of the RE or(!) the beginning of a parenthesized
        subexpression, '$' is an ordinary character except at the end of the RE
        or(!) the end of a parenthesized subexpression, and '*' is an ordinary
        character if it appears at the beginning of the RE or the beginning of
        a parenthesized subexpression (after a possible leading '^').
        Finally, there is one new type of atom, a back reference: '\' followed
        by a nonzero decimal digit d matches the same sequence of characters
        matched by the dth parenthesized subexpression [...]
     */
    wxString ere;
    // The result is at least roughly as long as the input, so preallocate.
    ere.reserve(bre.length());
    // Position relative to the start of the RE or of the current "\("
    // subexpression, needed to decide whether '^' and '*' are special.
    enum SinceStart
    {
        SinceStart_None,        // Just at the beginning.
        SinceStart_OnlyCaret,   // Had just "^" since the beginning.
        SinceStart_Some         // Had something else since the beginning.
    };
    // State resulting from processing a single character of the input.
    struct State
    {
        explicit State(SinceStart sinceStart_)
        {
            isBackslash = false;
            sinceStart = sinceStart_;
        }
        // True if the character was an unescaped backslash, i.e. it escapes
        // the character following it.
        bool isBackslash;
        SinceStart sinceStart;
    };
    State previous(SinceStart_None);
    for ( wxString::const_iterator it = bre.begin(),
                                  end = bre.end();
          it != end;
          ++it )
    {
        const wxUniChar c = *it;
        // What should be done with the current character?
        enum Disposition
        {
            Disposition_Skip,   // Nothing.
            Disposition_Append, // Append to output.
            Disposition_Escape  // ... after escaping it with backslash.
        } disposition = Disposition_Append;
        // By default, anything we see moves us away from the start, the
        // cases below override this when necessary.
        State current(SinceStart_Some);
        if ( previous.isBackslash )
        {
            // By default, keep the backslash present in the BRE, it's still
            // needed in the ERE too.
            disposition = Disposition_Escape;
            switch ( c.GetValue() )
            {
                case '(':
                    // It's the start of a new subexpression.
                    current.sinceStart = SinceStart_None;
                    wxFALLTHROUGH;
                case ')':
                case '{':
                case '}':
                    // Do not escape to ensure they remain special in the ERE
                    // as the escaped versions were special in the BRE.
                    disposition = Disposition_Append;
                    break;
            }
        }
        else // This character is not escaped.
        {
            switch ( c.GetValue() )
            {
                case '\\':
                    current.isBackslash = true;
                    // Don't do anything with it yet, we'll deal with it later.
                    disposition = Disposition_Skip;
                    break;
                case '^':
                    // Escape unless it appears at the start of the RE or of
                    // a subexpression, where it is an anchor in BRE too.
                    switch ( previous.sinceStart )
                    {
                        case SinceStart_None:
                            // Don't escape, but do update the state.
                            current.sinceStart = SinceStart_OnlyCaret;
                            break;
                        case SinceStart_OnlyCaret:
                        case SinceStart_Some:
                            disposition = Disposition_Escape;
                            break;
                    }
                    break;
                case '*':
                    // Escape if it appears at the start or right after "^":
                    // it is an ordinary character there in a BRE. Anywhere
                    // else it keeps its usual meaning and is left as is.
                    switch ( previous.sinceStart )
                    {
                        case SinceStart_None:
                        case SinceStart_OnlyCaret:
                            disposition = Disposition_Escape;
                            break;
                        case SinceStart_Some:
                            break;
                    }
                    break;
                case '$':
                    // Escape unless it appears at the end or just before "\)".
                    disposition = Disposition_Escape;
                    {
                        // Look ahead using a copy, the main iterator must not
                        // be advanced here.
                        wxString::const_iterator next = it;
                        ++next;
                        if ( next == end )
                        {
                            // It is at the end, so has special meaning.
                            disposition = Disposition_Append;
                        }
                        else // Not at the end, but maybe at subexpression end?
                        {
                            if ( *next == '\\' )
                            {
                                ++next;
                                if ( next != end && *next == ')' )
                                    disposition = Disposition_Append;
                            }
                        }
                    }
                    break;
                case '|':
                case '+':
                case '?':
                case '(':
                case ')':
                case '{':
                case '}':
                    // Escape these characters which are not special in a BRE,
                    // but would be special in a ERE if left unescaped.
                    disposition = Disposition_Escape;
                    break;
                case '[':
                    // Rules are very different for the characters inside the
                    // bracket expressions and we don't have to change anything
                    // for them as the syntax is the same for BREs and EREs, so
                    // just process the entire expression at once.
                    {
                        const wxString::const_iterator start = it;
                        it = SkipBracketExpression(it, end);
                        // Copy everything inside without any changes.
                        ere += wxString(start, it);
                        if ( it == end )
                        {
                            // If we reached the end without finding the
                            // matching ']' there is nothing remaining anyhow.
                            return ere;
                        }
                        // Note that default Disposition_Append here is fine,
                        // we'll append the closing ']' to "ere" below.
                    }
                    break;
            }
        }
        switch ( disposition )
        {
            case Disposition_Skip:
                break;
            case Disposition_Escape:
                ere += '\\';
                wxFALLTHROUGH;
            case Disposition_Append:
                // Note: don't use "c" here, iterator may have been advanced
                // inside the loop.
                ere += *it;
                break;
        }
        // Remember the state for processing the next character.
        previous = current;
    }
    // It's an error if a RE ends with a backslash, but we still need to
    // preserve this error in the resulting RE.
    if ( previous.isBackslash )
        ere += '\\';
    return ere;
 }
 bool wxRegExImpl::Compile(const wxString& expr, int flags)
 {
    Reinit();
--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -170,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
    CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
 }
 // Check the conversion of basic regular expressions to the extended syntax.
 TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
 {
    // Escaped parentheses and braces are special in BRE and must become the
    // unescaped special characters in ERE.
    CHECK( wxRegEx::ConvertFromBasic("\\(a\\)b") == "(a)b" );
    CHECK( wxRegEx::ConvertFromBasic("a\\{0,1\\}b") == "a{0,1}b" );
    // '*' is an ordinary character at the start or right after the leading
    // '^', but only the first one: the following '*' is special again.
    CHECK( wxRegEx::ConvertFromBasic("*") == "\\*" );
    CHECK( wxRegEx::ConvertFromBasic("**") == "\\**" );
    CHECK( wxRegEx::ConvertFromBasic("^*") == "^\\*" );
    // Only the leading '^' is an anchor, any other one is ordinary.
    CHECK( wxRegEx::ConvertFromBasic("^^") == "^\\^" );
    // '$' is an anchor only at the end of the RE or of a subexpression.
    CHECK( wxRegEx::ConvertFromBasic("x$y") == "x\\$y" );
    CHECK( wxRegEx::ConvertFromBasic("$$") == "\\$$" );
    CHECK( wxRegEx::ConvertFromBasic("\\(x$\\)") == "(x$)" );
    // Contents of bracket expressions must be left unchanged.
    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
 }
 #endif // wxUSE_REGEX