Implement wxRegEx using PCRE

Adjust the tests and document the incompatibilities with the previously used regex syntax. In this commit the use of PCRE is conditional on wxUSE_PCRE which is never defined as 1 yet, so the new code is still disabled.
2021-07-17 17:00:19 +02:00
parent 912f4b76ac
commit fa59d5700a
5 changed files with 809 additions and 42 deletions
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -60,7 +60,10 @@ enum
    wxRE_NOTBOL = 32,
    // '$' doesn't match at the end of line
-    wxRE_NOTEOL = 64
+    wxRE_NOTEOL = 64,
    // don't accept empty string as valid match, try alternatives or fail
    wxRE_NOTEMPTY = 128
 };
 // ----------------------------------------------------------------------------
--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -12,13 +12,31 @@
 */
 enum
 {
-    /** Use extended regex syntax. */
+    /**
        Use extended regex syntax.
        This is the default and doesn't need to be specified.
     */
    wxRE_EXTENDED = 0,
-    /** Use advanced RE syntax (built-in regex only). */
+    /**
        Use advanced regex syntax.
        This flag is synonym for wxRE_EXTENDED and doesn't need to be specified
        as this is the default syntax.
     */
    wxRE_ADVANCED = 1,
-    /** Use basic RE syntax. */
+    /**
        Use basic regex syntax.
        Use basic regular expression syntax, close to its POSIX definition,
        but with some extensions still available.
        The word start/end boundary assertions @c "\<" and @c "\>" are only
        available when using basic syntax, use @c "[[:<:]]" and @c "[[:>:]]" or
        just the more general word boundary assertion @c "\b" when not using it.
     */
    wxRE_BASIC    = 2,
    /** Ignore case in match. */
@@ -51,7 +69,19 @@ enum
    wxRE_NOTBOL = 32,
    /** '$' doesn't match at the end of line. */
-    wxRE_NOTEOL = 64
+    wxRE_NOTEOL = 64,
    /**
        Don't accept empty string as a valid match.
        If the regex matches an empty string, try alternatives, if there are
        any, or fail.
        This flag is not supported if PCRE support is turned off.
        @since 3.1.6
     */
    wxRE_NOTEMPTY = 128
 };
 /**
@@ -60,26 +90,19 @@ enum
    wxRegEx represents a regular expression.  This class provides support
    for regular expressions matching and also replacement.
-    It is built on top of either the system library (if it has support
+    In wxWidgets 3.1.6 or later, it is built on top of PCRE library
-    for POSIX regular expressions - which is the case of the most modern
+    (https://www.pcre.org/). In the previous versions of wxWidgets, this class
-    Unices) or uses the built in Henry Spencer's library.  Henry Spencer
+    used Henry Spencer's library and behaved slightly differently, see below
-    would appreciate being given credit in the documentation of software
+    for the discussion of the changes if you're upgrading from an older
-    which uses his library, but that is not a requirement.
+    version.
-    Regular expressions, as defined by POSIX, come in two flavours: @e extended
+    Note that while C++11 and later provides @c std::regex and related classes,
-    and @e basic.  The builtin library also adds a third flavour
+    this class is still useful as it provides the following important
-    of expression @ref overview_resyntax "advanced", which is not available
+    advantages:
    when using the system library.
-    Unicode is fully supported only when using the builtin library.
+    - Support for richer regular expressions syntax.
-    When using the system library in Unicode mode, the expressions and data
+    - Much better performance in many common cases, by a factor of 10-100.
-    are translated to the default 8-bit encoding before being passed to
+    - Consistent behaviour, including performance, on all platforms.
    the library.
    On platforms where a system library is available, the default is to use
    the builtin library for Unicode builds, and the system library otherwise.
    It is possible to use the other if preferred by selecting it when building
    the wxWidgets.
    @library{wxbase}
    @category{data}
@@ -118,6 +141,57 @@ enum
    std::cout << "text now contains " << count << " hidden addresses" << std::endl;
    std::cout << originalText << std::endl;
    @endcode
    @section regex_pcre_changes Changes in the PCRE-based version
    This section describes the difference in regex syntax in the new PCRE-based
    wxRegEx version compared to the previously used version which implemented
    POSIX regex support.
    The main change is that both extended (::wxRE_EXTENDED) and advanced
    (::wxRE_ADVANCED) regex syntax is now the same as PCRE syntax described at
    https://www.pcre.org/current/doc/html/pcre2syntax.html
    Basic regular expressions (::wxRE_BASIC) are still different, but their
    use is deprecated and PCRE extensions are still accepted in them, please
    avoid using them.
    Other changes are:
    - Negated character classes, i.e. @c [^....], now always match newline
      character, regardless of whether ::wxRE_NEWLINE was used or not. The dot
      metacharacter still has the same meaning, i.e. it matches newline by
      default but not when ::wxRE_NEWLINE is specified.
    - Previously POSIX-specified behaviour of handling unmatched right
      parenthesis @c ')' as a literal character was implemented, but now this
      is a (regex) compilation error.
    - Empty alternation branches were previously ignored, i.e. matching @c a||b
      worked the same as matching just @c a|b, but now actually matches an
      empty string. The new ::wxRE_NOTEMPTY flag can be used to disable empty
      matches.
    - Using @c \U to embed Unicode code points into the pattern is not
      supported any more, use the still supported @c \u, followed by exactly
      four hexadecimal digits, or @c \x, followed by exactly two hexadecimal
      digits, instead.
    - POSIX collating elements inside square brackets, i.e. @c [.XXX.] and
      @c [:XXXX:] are not supported by PCRE and result in regex compilation
      errors.
    - Backslash can be used to escape the character following it even inside
      square brackets now, while it loses its special meaning in POSIX regexes
      when it occurs inside square brackets.
    - For completeness, PCRE syntax which previously resulted in errors, e.g.
      @c "(?:...)" and similar constructs, is now accepted and behaves as
      expected. Other regexes syntactically invalid according to POSIX are
      re-interpreted as sequences of literal characters with PCRE, e.g. @c "{1"
      is just a sequence of two literal characters now, where it previously was
      a compilation error.
 */
 class wxRegEx
 {
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -37,16 +37,234 @@
 #   include <sys/types.h>
 #endif
 #include <regex.h>
 // WXREGEX_USING_BUILTIN    defined when using the built-in regex lib
 // WXREGEX_USING_RE_SEARCH  defined when using re_search in the GNU regex lib
 // WXREGEX_CONVERT_TO_MB    defined when the regex lib is using chars and
 //                          wxChar is wide, so conversion to UTF-8 must be done
 // wxRegChar                the character type used by the regular expression engine
 //
 #if wxUSE_PCRE
    // Use the same code unit width for PCRE as we use for wxString.
 #   if !wxUSE_UNICODE || wxUSE_UNICODE_UTF8
 #       define PCRE2_CODE_UNIT_WIDTH 8
        typedef char wxRegChar;
 #   elif wxUSE_UNICODE_UTF16
 #       define PCRE2_CODE_UNIT_WIDTH 16
        typedef wchar_t wxRegChar;
 #   else
 #       define PCRE2_CODE_UNIT_WIDTH 32
        typedef wchar_t wxRegChar;
 #   endif
    typedef wxRegChar wxRegErrorChar;
    // We currently always use PCRE as a static library under MSW.
 #   ifdef __WINDOWS__
 #       define PCRE2_STATIC
 #   endif
 #   include <pcre2.h>
 #   if wxUSE_UNICODE_UTF8
 #       define WXREGEX_CONVERT_TO_MB
 #   endif
 #   define WX_NO_REGEX_ADVANCED
 // There is an existing pcre2posix library which provides regxxx()
 // implementations, but we don't use it because:
 //
 //  0. The plan is to stop using POSIX API soon anyhow.
 //  1. It's yet another system library to depend on.
 //  2. We can add non-standard "len" parameter to regexec().
 //  3. We want to use PCRE2_ALT_BSUX for compatibility, but we can't
 //     set it using just the POSIX API.
 //
 // So implement these functions ourselves.
 namespace
 {
 // Define POSIX constants and structs ourselves too.
 #define REG_EXTENDED  0         // Unused, for compatibility only.
 #define REG_ICASE     0x0001    // Same as PCRE2_CASELESS.
 #define REG_NEWLINE   0x0002    // Same as PCRE2_MULTILINE.
 #define REG_NOTBOL    0x0004    // Same as PCRE2_NOTBOL.
 #define REG_NOTEOL    0x0008    // Same as PCRE2_NOTEOL.
 #define REG_NOSUB     0x0020    // Don't return matches.
 #define REG_NOTEMPTY  0x0100    // Same as PCRE2_NOTEMPTY.
 // POSIX-like error codes returned by the wx_regxxx() functions below.
 enum
 {
    REG_NOERROR = 0,    // Must be 0.
    REG_NOMATCH,        // Returned from regexec().
    REG_BADPAT,         // Catch-all error returned from regcomp().
    REG_ESPACE          // Catch-all error returned from regexec().
 };
 typedef size_t regoff_t;
 // Private stand-in for the POSIX regex_t, holding the PCRE objects shared by
 // the wx_regxxx() functions in this file.
 struct regex_t
 {
    // This is the only "public" field -- not that it really matters anyhow for
    // this private struct.
    size_t re_nsub;
    // Compiled pattern returned by pcre2_compile() in wx_regcomp().
    pcre2_code* code;
    // Match data block created from the compiled pattern in wx_regcomp() and
    // passed to pcre2_match() in wx_regexec().
    pcre2_match_data* match_data;
    // PCRE (not POSIX) error code, later used by wx_regerror() to retrieve
    // the error message.
    int errorcode;
    // Offset in the pattern filled in by pcre2_compile().
    regoff_t erroroffset;
 };
 // Private stand-in for the POSIX regmatch_t: the pair of offsets describing
 // one (sub)match, as filled in by wx_regexec(). Both fields are set to
 // (regoff_t)-1 for a group which didn't participate in the match.
 struct regmatch_t
 {
    regoff_t rm_so;     // Copied from the first element of the ovector pair.
    regoff_t rm_eo;     // Copied from the second element of the ovector pair.
 };
 // Compile the given NUL-terminated pattern with PCRE, emulating regcomp().
 //
 // cflags is a combination of REG_ICASE and REG_NEWLINE, other bits are
 // ignored here.
 //
 // Returns REG_NOERROR on success, in which case preg->code and
 // preg->match_data are valid and must be released with wx_regfree().
 // Returns REG_BADPAT if the pattern couldn't be compiled or REG_ESPACE if the
 // match data couldn't be allocated: in both cases preg->code and
 // preg->match_data are NULL and preg->errorcode contains the PCRE error code
 // which can be turned into a message by wx_regerror().
 int wx_regcomp(regex_t* preg, const wxRegChar* pattern, int cflags)
 {
    // Start from a well-defined state so that the struct never contains
    // indeterminate pointers, whichever way we return from here.
    preg->code = NULL;
    preg->match_data = NULL;
    // PCRE2_UTF is required in order to handle non-ASCII characters when using
    // 8-bit version of the library.
    //
    // Use PCRE2_ALT_BSUX because we want to handle \uXXXX for compatibility
    // with the previously used regex library and because it's useful.
    int options = PCRE2_UTF | PCRE2_ALT_BSUX;
    if ( cflags & REG_ICASE )
        options |= PCRE2_CASELESS;
    // Default behaviour of the old regex library corresponds to DOTALL i.e.
    // dot matches any character, but wxRE_NEWLINE both enables MULTILINE (so
    // that ^/$ match after/before newline in addition to matching at the
    // start/end of string) and makes dot stop matching "\n", i.e. we must
    // not use DOTALL with it.
    if ( cflags & REG_NEWLINE )
        options |= PCRE2_MULTILINE;
    else
        options |= PCRE2_DOTALL;
    preg->code = pcre2_compile
                 (
                    (PCRE2_SPTR)pattern,
                    PCRE2_ZERO_TERMINATED,
                    options,
                    &preg->errorcode,
                    &preg->erroroffset,
                    NULL                    // use default context
                 );
    if ( !preg->code )
    {
        // Don't bother translating PCRE error to the most appropriate POSIX
        // error code, there is no way to do it losslessly and the main thing
        // that matters is the error message and not the error code anyhow.
        return REG_BADPAT;
    }
    preg->match_data = pcre2_match_data_create_from_pattern(preg->code, NULL);
    if ( !preg->match_data )
    {
        // Without the match data block every later match would fail, so
        // report the allocation failure now instead of pretending that the
        // compilation succeeded.
        pcre2_code_free(preg->code);
        preg->code = NULL;
        preg->errorcode = PCRE2_ERROR_NOMEMORY;
        return REG_ESPACE;
    }
    return REG_NOERROR;
 }
 int
 wx_regexec(const regex_t* preg, const wxRegChar* string, size_t len,
           size_t nmatch, regmatch_t* pmatch, int eflags)
 {
    int options = 0;
    if ( eflags & REG_NOTBOL )
        options |= PCRE2_NOTBOL;
    if ( eflags & REG_NOTEOL )
        options |= PCRE2_NOTEOL;
    if ( eflags & REG_NOTEMPTY )
        options |= PCRE2_NOTEMPTY;
    const int rc = pcre2_match
                   (
                        preg->code,
                        (PCRE2_SPTR)string,
                        len,
                        0,                      // start offset
                        options,
                        preg->match_data,
                        NULL                    // use default context
                   );
    if ( rc == PCRE2_ERROR_NOMATCH )
        return REG_NOMATCH;
    if ( rc < 0 )
        return REG_ESPACE;
    // Successful match, fill in pmatch array if necessary.
    if ( pmatch )
    {
        const PCRE2_SIZE* const
            ovector = pcre2_get_ovector_pointer(preg->match_data);
        const size_t nmatchActual = static_cast<size_t>(rc);
        for ( size_t n = 0; n < nmatch; ++n )
        {
            regmatch_t& m = pmatch[n];
            if ( n < nmatchActual )
            {
                m.rm_so = ovector[n*2] == PCRE2_UNSET ? -1 : ovector[n*2];
                m.rm_eo = ovector[n*2+1] == PCRE2_UNSET ? -1 : ovector[n*2+1];
            }
            else
            {
                m.rm_so =
                m.rm_eo = static_cast<regoff_t>(-1);
            }
        }
    }
    return REG_NOERROR;
 }
 size_t
 wx_regerror(int errcode, const regex_t* preg, wxRegErrorChar* errbuf, size_t errbuf_size)
 {
    // We don't use the passed in POSIX error code other than to check that we
    // do have an error but rely on PCRE error code from regex_t.
    wxRegErrorChar buffer[256];
    int len;
    if ( errcode == REG_NOERROR )
        len = wxSnprintf(buffer, WXSIZEOF(buffer), "no error");
    else
        len = pcre2_get_error_message(preg->errorcode, (PCRE2_UCHAR*)buffer, sizeof(buffer));
    if ( len < 0 )
        len = wxSnprintf(buffer, WXSIZEOF(buffer), "PCRE error %d", preg->errorcode);
    if ( errbuf && errbuf_size )
        wxStrlcpy(errbuf, buffer, errbuf_size);
    return len;
 }
 // Release the PCRE objects created by wx_regcomp(), emulating regfree().
 //
 // The pointers are reset to NULL afterwards so that the struct doesn't keep
 // dangling pointers and calling this function again is harmless (the PCRE
 // free functions do nothing when given NULL).
 void wx_regfree(regex_t* preg)
 {
    pcre2_match_data_free(preg->match_data);
    preg->match_data = NULL;
    pcre2_code_free(preg->code);
    preg->code = NULL;
 }
 } // anonymous namespace
 #else // !wxUSE_PCRE
 #include <regex.h>
 typedef char wxRegErrorChar;
 #ifdef __REG_NOFRONT
 #   define WXREGEX_USING_BUILTIN
    typedef wxChar wxRegChar;
 #else
    typedef char wxRegChar;
 #   ifdef HAVE_RE_SEARCH
 #       define WXREGEX_USING_RE_SEARCH
 #   else
@@ -66,6 +284,8 @@
 #   define wx_regerror regerror
 #endif
 #endif // wxUSE_PCRE/!wxUSE_PCRE
 // ----------------------------------------------------------------------------
 // private classes
 // ----------------------------------------------------------------------------
@@ -133,13 +353,6 @@ private:
 #endif // WXREGEX_USING_RE_SEARCH
 // the character type used by the regular expression engine
 #ifndef WXREGEX_CONVERT_TO_MB
 typedef wxChar wxRegChar;
 #else
 typedef char wxRegChar;
 #endif
 // the real implementation of wxRegEx
 class wxRegExImpl
 {
@@ -152,7 +365,7 @@ public:
    bool IsValid() const { return m_isCompiled; }
    // RE operations
-    bool Compile(const wxString& expr, int flags = 0);
+    bool Compile(wxString expr, int flags = 0);
    bool Matches(const wxRegChar *str, int flags, size_t len) const;
    bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
    size_t GetMatchCount() const;
@@ -227,11 +440,11 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode) const
    int len = wx_regerror(errorcode, &m_RegEx, NULL, 0);
    if ( len > 0 )
    {
-        wxCharBuffer errbuf(len);
+        wxCharTypeBuffer<wxRegErrorChar> errbuf(len);
        (void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length());
-        szError = wxConvLibc.cMB2WX(errbuf);
+        szError = errbuf;
    }
    if ( szError.empty() ) // regerror() returned 0 or conversion failed
@@ -386,6 +599,16 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre)
                    // as the escaped versions were special in the BRE.
                    disposition = Disposition_Append;
                    break;
                case '<':
                case '>':
                    // Map word boundaries extensions to POSIX syntax
                    // understood by PCRE.
                    ere += "[[:";
                    ere += c;
                    ere += ":]]";
                    disposition = Disposition_Skip;
                    break;
            }
        }
        else // This character is not escaped.
@@ -517,11 +740,294 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre)
    return ere;
 }
-bool wxRegExImpl::Compile(const wxString& expr, int flags)
+#if wxUSE_PCRE
 // Return the letters used in PCRE inline option groups for those of the
 // compilation options in opts that we know about: "i" for CASELESS, "m" for
 // MULTILINE, "s" for DOTALL and "x" for EXTENDED, always in this order.
 static wxString PCREOptionsToString(int opts)
 {
    static const struct
    {
        int flag;
        char letter;
    } knownOptions[] =
    {
        { PCRE2_CASELESS,  'i' },
        { PCRE2_MULTILINE, 'm' },
        { PCRE2_DOTALL,    's' },
        { PCRE2_EXTENDED,  'x' },
    };
    wxString letters;
    for ( size_t n = 0; n < WXSIZEOF(knownOptions); ++n )
    {
        if ( opts & knownOptions[n].flag )
            letters += knownOptions[n].letter;
    }
    return letters;
 }
 // Convert metasyntax, i.e. directors and embedded options, to PCRE syntax.
 //
 // See TCL re_syntax man page for more details.
 //
 // Takes the regex to convert in expr and returns the converted one. The
 // flags parameter is both input and output: the wxRE_BASIC/wxRE_ADVANCED
 // bits determine whether embedded options are looked for and are updated
 // according to the syntax selected by the director or the embedded options
 // found in the regex.
 static wxString ConvertMetasyntax(wxString expr, int& flags)
 {
    // First check for directors that must occur only at the beginning.
    const int DIRECTOR_PREFIX_LEN = 3;
    if ( expr.length() > DIRECTOR_PREFIX_LEN && expr.StartsWith("***") )
    {
        switch ( expr[DIRECTOR_PREFIX_LEN].GetValue() )
        {
            // "***:" director indicates that the regex uses ARE syntax.
            case ':':
                flags &= ~wxRE_BASIC;
                flags |= wxRE_ADVANCED;
                expr.erase(0, DIRECTOR_PREFIX_LEN + 1);
                break;
            // "***=" director means that the rest is a literal string.
            case '=':
                // We could use PCRE2_LITERAL, but for now just use the "\Q"
                // escape that should have the same effect -- maybe even less
                // efficiently, but we probably don't really care about
                // performance in this very special case.
                flags &= ~(wxRE_BASIC | wxRE_ADVANCED);
                expr.replace(0, DIRECTOR_PREFIX_LEN + 1, "\\Q");
                break;
            default:
                // This is an invalid director that will result in a compile
                // error anyhow, so don't bother special-casing it and just
                // don't do anything to compile it and get an error later.
                break;
        }
    }
    // Then check for the embedded options that may occur at the beginning of
    // an ARE, but possibly after a director (necessarily the "***:" one).
    if ( (flags & wxRE_ADVANCED) && expr.StartsWith("(?") )
    {
        // String with the options: we use this for the options we don't know
        // about.
        wxString optsString;
        // PCRE options to enable or disable.
        int opts = 0,
            negopts = 0;
        // (Last) syntax selected by the options.
        enum Syntax
        {
            Syntax_None,
            Syntax_Basic,
            Syntax_Extended,
            Syntax_Literal
        } syntax = Syntax_None;
        // Start scanning just after the leading "(?".
        const wxString::iterator end = expr.end();
        const wxString::iterator start = expr.begin() + 2;
        for ( wxString::iterator it = start; it != end; ++it )
        {
            if ( *it == ')' )
            {
                // End of the options group: rewrite it using PCRE syntax,
                // with the options to enable first and the ones to disable
                // after "-". Note that expr is modified below, but we leave
                // the loop immediately after doing it and so never use the
                // iterators again.
                optsString += PCREOptionsToString(opts);
                if ( negopts )
                {
                    optsString += "-";
                    optsString += PCREOptionsToString(negopts);
                }
                size_t posAfterOpts;
                if ( optsString.empty() )
                {
                    // Nothing remains to be passed to PCRE, so remove the
                    // entire group, including the closing parenthesis.
                    expr.erase(expr.begin(), ++it);
                    posAfterOpts = 0;
                }
                else
                {
                    // Replace just the option letters, keeping the
                    // surrounding "(?" and ")".
                    expr.replace(start, it, optsString);
                    posAfterOpts = optsString.length() + 3; // (?opts)
                }
                // Finally deal with the syntax selection.
                flags &= ~wxRE_ADVANCED;
                switch ( syntax )
                {
                    case Syntax_None:
                        flags |= wxRE_ADVANCED;
                        break;
                    case Syntax_Basic:
                        flags |= wxRE_BASIC;
                        break;
                    case Syntax_Extended:
                        flags |= wxRE_EXTENDED;
                        break;
                    case Syntax_Literal:
                        // As above, we could also use the LITERAL option, but
                        // this is simpler.
                        expr.insert(posAfterOpts, "\\Q");
                        break;
                }
                break;
            }
            // Avoid misinterpreting other constructs (non-capturing groups,
            // look ahead assertions etc) as options, which always consist of
            // lower case letters only.
            if ( *it < 'a' || *it > 'z' )
                break;
            switch ( (*it).GetValue() )
            {
                case 'b':
                    syntax = Syntax_Basic;
                    break;
                case 'e':
                    syntax = Syntax_Extended;
                    break;
                case 'q':
                    syntax = Syntax_Literal;
                    break;
                case 'm':
                case 'n':
                    // This option corresponds to MULTILINE PCRE option,
                    // without DOTALL, so enable the former and disable the
                    // latter.
                    negopts &= ~PCRE2_MULTILINE;
                    opts |= PCRE2_MULTILINE;
                    wxFALLTHROUGH;
                case 'p':
                    // This option corresponds to the default PCRE behaviour,
                    // but we use DOTALL by default, so turn it off (this might
                    // be unnecessary if wxRE_NEWLINE is also used, but it does
                    // no harm).
                    negopts |= PCRE2_DOTALL;
                    break;
                case 'w':
                    // This option corresponds to using both MULTILINE and
                    // DOTALL with PCRE.
                    negopts &= ~(PCRE2_MULTILINE | PCRE2_DOTALL);
                    opts |= PCRE2_MULTILINE | PCRE2_DOTALL;
                    break;
                case 'c':
                    // Disable case-insensitive matching.
                    negopts |= PCRE2_CASELESS;
                    break;
                case 't':
                    // Disable extended syntax.
                    negopts |= PCRE2_EXTENDED;
                    break;
                case 's':
                    // This option reverts to the default behaviour in the old
                    // regex library or enables DOTALL in PCRE, which is much
                    // more useful and common, so use it with PCRE meaning.
                    negopts &= ~PCRE2_DOTALL;
                    opts |= PCRE2_DOTALL;
                    break;
                    // These options have the same meaning as in PCRE.
                case 'i':
                    negopts &= ~PCRE2_CASELESS;
                    opts |= PCRE2_CASELESS;
                    break;
                case 'x':
                    negopts &= ~PCRE2_EXTENDED;
                    opts |= PCRE2_EXTENDED;
                    break;
                default:
                    // Keep the rest: could be a valid PCRE option or invalid
                    // option for both libraries, in which case we'll get an
                    // error, which is what we want.
                    optsString += *it;
                    break;
            }
        }
    }
    return expr;
 }
 // Translate the "advanced" word boundary escapes to their PCRE equivalents.
 //
 // The escapes "\m", "\M", "\y" and "\Y" are "TCL extensions" (TCL uses the
 // same regex library as the previous wx versions did) which used to work, so
 // keep supporting them for compatibility by mapping them to "[[:<:]]",
 // "[[:>:]]", "\b" and "\B" respectively. Any other escape sequence is copied
 // to the output unchanged.
 //
 // The GNU extensions "\<" and "\>" are not handled here as they are only
 // valid in BREs and so are dealt with when converting those.
 static wxString ConvertWordBoundaries(const wxString& expr)
 {
    wxString result;
    result.reserve(expr.length());
    const wxString::const_iterator last = expr.end();
    wxString::const_iterator cur = expr.begin();
    while ( cur != last )
    {
        // Anything not starting an escape sequence is copied verbatim.
        if ( *cur != '\\' )
        {
            result.append(*cur);
            ++cur;
            continue;
        }
        // Look at the character following the backslash, if any.
        ++cur;
        if ( cur == last )
        {
            // Trailing backslash: preserve it and stop.
            result.append('\\');
            break;
        }
        const char* pcreEquivalent = NULL;
        switch ( (*cur).GetValue() )
        {
            case 'm':
                pcreEquivalent = "[[:<:]]";
                break;
            case 'M':
                pcreEquivalent = "[[:>:]]";
                break;
            case 'y':
                pcreEquivalent = "\\b";
                break;
            case 'Y':
                pcreEquivalent = "\\B";
                break;
        }
        if ( pcreEquivalent )
        {
            result.append(pcreEquivalent);
        }
        else
        {
            // Not one of ours: keep the escape sequence as is.
            result.append('\\');
            result.append(*cur);
        }
        ++cur;
    }
    return result;
 }
 #endif // wxUSE_PCRE
 bool wxRegExImpl::Compile(wxString expr, int flags)
 {
    Reinit();
-#ifdef WX_NO_REGEX_ADVANCED
+#if wxUSE_PCRE
 #   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
 #elif defined(WX_NO_REGEX_ADVANCED)
 #   define FLAVORS wxRE_BASIC
 #else
 #   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
@@ -531,6 +1037,23 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
    wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
                  wxT("unrecognized flags in wxRegEx::Compile") );
 #if wxUSE_PCRE
    // Deal with the directors and embedded options first (this can modify
    // flags).
    expr = ConvertMetasyntax(expr, flags);
    // PCRE doesn't support BREs, translate them to EREs.
    if ( flags & wxRE_BASIC )
    {
        expr = wxRegEx::ConvertFromBasic(expr);
        flags &= ~wxRE_BASIC;
    }
    else if ( flags & wxRE_ADVANCED )
    {
        expr = ConvertWordBoundaries(expr);
    }
 #endif // wxUSE_PCRE
    // translate our flags to regcomp() ones
    int flagsRE = 0;
    if ( !(flags & wxRE_BASIC) )
@@ -605,7 +1128,11 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
                    // extended syntax. '(?' is used for extensions by perl-
                    // like REs (e.g. advanced), and is not valid for POSIX
                    // extended, so ignore them always.
-                    if ( cptr[1] != wxT('?') )
+                    if ( cptr[1] != wxT('?')
 #if wxUSE_PCRE
                        && cptr[1] != wxT('*')
 #endif
                            )
                        m_nMatches++;
                }
            }
@@ -651,7 +1178,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
    wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
    // translate our flags to regexec() ones
-    wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
+    wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL | wxRE_NOTEMPTY)),
                  wxT("unrecognized flags in wxRegEx::Matches") );
    int flagsRE = 0;
@@ -659,6 +1186,10 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
        flagsRE |= REG_NOTBOL;
    if ( flags & wxRE_NOTEOL )
        flagsRE |= REG_NOTEOL;
 #if wxUSE_PCRE
    if ( flags & wxRE_NOTEMPTY )
        flagsRE |= REG_NOTEMPTY;
 #endif // wxUSE_PCRE
    // allocate matches array if needed
    wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
--- a/tests/regex/regextest.cpp
+++ b/tests/regex/regextest.cpp
@@ -159,12 +159,24 @@ bool RegExTestCase::parseFlags(const wxString& flags)
            // we don't fully support these flags, but they don't stop us
            // checking for success or failure of the match, so treat as noop
-            case 'A': case 'B': case 'E': case 'H':
+            case 'A': case 'B': case 'H':
            case 'I': case 'L': case 'M': case 'N':
            case 'P': case 'Q': case 'R': case 'S':
-            case 'T': case 'U': case '%':
+            case 'T': case '%':
                break;
            // Skip tests checking for backslash inside bracket expressions:
            // this works completely differently in PCRE where backslash is
            // special, even inside [], from POSIX.
            case 'E':
                return false;
            // Also skip the (there is only one) test using POSIX-specified
            // handling of unmatched ')' as a non-special character -- PCRE
            // doesn't support this and it doesn't seem worth implementing
            // support for this ourselves either.
            case 'U':
                return false;
            // match options
            case '^': m_matchFlags |= wxRE_NOTBOL; break;
            case '$': m_matchFlags |= wxRE_NOTEOL; break;
@@ -199,6 +211,122 @@ void RegExTestCase::runTest()
        return;
    }
    // Skip, or accommodate, some test cases from the original test suite that
    // are known not to work with PCRE:
    // Several regexes use syntax which is valid in PCRE and so their
    // compilation doesn't fail as expected:
    if (m_mode == 'e') {
        static const char* validForPCRE[] =
        {
            // Non-capturing group.
            "a(?:b)c",
            // Possessive quantifiers.
            "a++", "a?+","a*+",
            // Quoting from pcre2pattern(1):
            //
            //      An opening curly bracket [...] that does not match the
            //      syntax of a quantifier, is taken as a literal character.
            "a{1,2,3}", "a{1", "a{1n}", "a\\{0,1", "a{0,1\\",
            // From the same page:
            //
            //      The numbers must be less than 65536
            //
            // (rather than 256 limit for POSIX).
            "a{257}", "a{1000}",
            // Also:
            //
            //      If a minus character is required in a class, it must be
            //      escaped with a backslash or appear in a position where it
            //      cannot be interpreted as indicating a range, typically as
            //      the first or last character in the class, or immediately
            //      after a range.
            //
            // (while POSIX wants the last case to be an error).
            "a[a-b-c]",
            // PCRE allows quantifiers after word boundary assertions, so skip
            // the tests checking that using them results in an error.
            "[[:<:]]*", "[[:>:]]*", "\\<*", "\\>*", "\\y*", "\\Y*",
            // PCRE only interprets "\x" and "\u" specially when they're
            // followed by exactly 2 or 4 hexadecimal digits and just lets them
            // match "x" or "u" otherwise, instead of giving an error.
            "a\\xq", "a\\u008x",
            // And "\U" always just matches "U", PCRE doesn't support it as
            // Unicode escape at all (even with PCRE2_EXTRA_ALT_BSUX).
            "a\\U0000008x",
            // "\z" is the "end of string" assertion and not an error in PCRE.
            "a\\z",
            // Recursive backreferences are explicitly allowed in PCRE.
            "a((b)\\1)",
            // Backreferences with index greater than 8 are interpreted as
            // octal escapes, unfortunately.
            "a((((((((((b\\10))))))))))c", "a\\12b",
        };
        for (size_t n = 0; n < WXSIZEOF(validForPCRE); ++n) {
            if (m_pattern == validForPCRE[n])
                return;
        }
    }
    if (m_mode == 'm') {
        // PCRE doesn't support POSIX collating elements, so we have to skip
        // those too.
        if (m_pattern.find("[.") != wxString::npos || m_pattern.find("[:") != wxString::npos)
            return;
        // "\b" is a word boundary assertion in PCRE and so is "\B", so the
        // tests relying on them being escapes for ASCII backspace and
        // backslash respectively must be skipped.
        if (m_pattern.find("\\b") != wxString::npos || m_pattern.find("\\B") != wxString::npos)
            return;
        // As explained above, "\U" is not supported by PCRE, only "\u" is.
        if (m_pattern == "a\\U00000008x")
            m_pattern = "a\\u0008x";
        // And "\x" is supported only when followed by 2 digits, not 4.
        else if (m_pattern == "a\\x0008x")
            m_pattern = "a\\x08x";
        // "\12" can be a backreference or an octal escape in PCRE, but never
        // literal "12" as this test expects it to be.
        if (m_pattern == "a\\12b")
            return;
        // Switching to "extended" mode is supposed to turn off "\W"
        // interpretation, but it doesn't work with PCRE.
        if (m_pattern == "(?e)\\W+")
            return;
        // None of the tests in "tricky cases" section passes with PCRE. It's
        // not really clear if PCRE is wrong or the original test suite was or
        // even if these regexes are ambiguous, but for now explicitly anchor
        // them at the end to force them to pass even with PCRE, as without it
        // they would match less than expected.
        if (m_pattern == "(week|wee)(night|knights)" ||
            m_pattern == "a(bc*).*\\1" ||
            m_pattern == "a(b.[bc]*)+")
            m_pattern += '$';
    }
    // This test uses an empty alternative branch: in POSIX, this is ignored,
    // while with PCRE it matches an empty string and we must set NOTEMPTY flag
    // explicitly to disable this.
    if (m_pattern == "a||b" && m_flags == "NS" ) {
        m_matchFlags |= wxRE_NOTEMPTY;
    }
    // Provide more information about the test case if it fails.
    wxString str;
    wxArrayString::const_iterator it;
@@ -285,6 +413,21 @@ void RegExTestCase::doTest(int flavor)
        // i - check the match returns the offsets given
        else if (m_mode == 'i')
        {
 #if wxUSE_UNICODE_UTF8
            // Values returned by GetMatch() are indices into UTF-8 string, but
            // the values expected by the test are indices in a UTF-16 or -32
            // string, so convert them. Note that the indices are correct, as
            // using substr(start, len) must return the match itself, it's just
            // that they differ when using UTF-8 internally.
            if ( start < INT_MAX )
            {
                if ( start + len > 0 )
                    len = m_data.substr(start, len).wc_str().length();
                start = m_data.substr(0, start).wc_str().length();
            }
 #endif // wxUSE_UNICODE_UTF8
            if (start > INT_MAX)
                result = wxT("-1 -1");
            else if (start + len > 0)
--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -59,7 +59,7 @@ TEST_CASE("wxRegEx::Compile", "[regex][compile]")
    CHECK_FALSE( re.Compile("foo[") );
    CHECK_FALSE( re.Compile("foo[bar") );
    CHECK      ( re.Compile("foo[bar]") );
-    CHECK_FALSE( re.Compile("foo{1") );
+    // Not invalid for PCRE: CHECK_FALSE( re.Compile("foo{1") );
    CHECK      ( re.Compile("foo{1}") );
    CHECK      ( re.Compile("foo{1,2}") );
    CHECK      ( re.Compile("foo*") );
@@ -184,4 +184,20 @@ TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
 }
 #ifdef wxHAS_REGEX_ADVANCED
 // Check that case-insensitive matching works for non-ASCII characters: a
 // regex consisting of U+0410 compiled with wxRE_ICASE must match U+0430.
 TEST_CASE("wxRegEx::Unicode", "[regex][unicode]")
 {
    const wxString cyrillicCapitalA(L"\u0410");
    const wxString cyrillicSmallA(L"\u0430");
    wxRegEx re(cyrillicCapitalA, wxRE_ICASE);
    REQUIRE( re.IsValid() );
    REQUIRE( re.Matches(cyrillicSmallA) );
    // The match must cover the entire (single character) string.
    CHECK( re.GetMatch(cyrillicSmallA) == cyrillicSmallA );
 }
 #endif // wxHAS_REGEX_ADVANCED
 #endif // wxUSE_REGEX