Implement wxRegEx using PCRE

Adjust the tests and document the incompatibilities with the previously used regex syntax. In this commit the use of PCRE is conditional on wxUSE_PCRE which is never defined as 1 yet, so the new code is still disabled.
2021-07-17 17:00:19 +02:00
parent 912f4b76ac
commit fa59d5700a
5 changed files with 809 additions and 42 deletions
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -60,7 +60,10 @@ enum
    wxRE_NOTBOL = 32,

    // '$' doesn't match at the end of line
-    wxRE_NOTEOL = 64
+    wxRE_NOTEOL = 64,
+
+    // don't accept empty string as valid match, try alternatives or fail
+    wxRE_NOTEMPTY = 128
 };

 // ----------------------------------------------------------------------------
--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -12,13 +12,31 @@
 */
 enum
 {
-    /** Use extended regex syntax. */
+    /**
+        Use extended regex syntax.
+
+        This is the default and doesn't need to be specified.
+     */
    wxRE_EXTENDED = 0,

-    /** Use advanced RE syntax (built-in regex only). */
+    /**
+        Use advanced regex syntax.
+
+        This flag is synonym for wxRE_EXTENDED and doesn't need to be specified
+        as this is the default syntax.
+     */
    wxRE_ADVANCED = 1,

-    /** Use basic RE syntax. */
+    /**
+        Use basic regex syntax.
+
+        Use basic regular expression syntax, close to its POSIX definition,
+        but with some extensions still available.
+
+        The word start/end boundary assertions @c "\<" and @c "\>" are only
+        available when using basic syntax, use @c "[[:<:]] and @c "[[:>:]]" or
+        just more general word boundary assertion @c "\b" when not using it.
+     */
    wxRE_BASIC    = 2,

    /** Ignore case in match. */
@@ -51,7 +69,19 @@ enum
    wxRE_NOTBOL = 32,

    /** '$' doesn't match at the end of line. */
-    wxRE_NOTEOL = 64
+    wxRE_NOTEOL = 64,
+
+    /**
+        Don't accept empty string as a valid match.
+
+        If the regex matches an empty string, try alternatives, if there are
+        any, or fail.
+
+        This flag is not supported if PCRE support is turned off.
+
+        @since 3.1.6
+     */
+    wxRE_NOTEMPTY = 128
 };

 /**
@@ -60,26 +90,19 @@ enum
    wxRegEx represents a regular expression.  This class provides support
    for regular expressions matching and also replacement.

-    It is built on top of either the system library (if it has support
-    for POSIX regular expressions - which is the case of the most modern
-    Unices) or uses the built in Henry Spencer's library.  Henry Spencer
-    would appreciate being given credit in the documentation of software
-    which uses his library, but that is not a requirement.
+    In wxWidgets 3.1.6 or later, it is built on top of PCRE library
+    (https://www.pcre.org/). In the previous versions of wxWidgets, this class
+    uses Henry Spencer's library and behaved slightly differently, see below
+    for the discussion of the changes if you're upgrading from an older
+    version.

-    Regular expressions, as defined by POSIX, come in two flavours: @e extended
-    and @e basic.  The builtin library also adds a third flavour
-    of expression @ref overview_resyntax "advanced", which is not available
-    when using the system library.
+    Note that while C++11 and later provides @c std::regex and related classes,
+    this class is still useful as it provides the following important
+    advantages:

-    Unicode is fully supported only when using the builtin library.
-    When using the system library in Unicode mode, the expressions and data
-    are translated to the default 8-bit encoding before being passed to
-    the library.
-
-    On platforms where a system library is available, the default is to use
-    the builtin library for Unicode builds, and the system library otherwise.
-    It is possible to use the other if preferred by selecting it when building
-    the wxWidgets.
+    - Support for richer regular expressions syntax.
+    - Much better performance in many common cases, by a factor of 10-100.
+    - Consistent behaviour, including performance, on all platforms.

    @library{wxbase}
    @category{data}
@@ -118,6 +141,57 @@ enum
    std::cout << "text now contains " << count << " hidden addresses" << std::endl;
    std::cout << originalText << std::endl;
    @endcode
+
+
+    @section regex_pcre_changes Changes in the PCRE-based version
+
+    This section describes the difference in regex syntax in the new PCRE-based
+    wxRegEx version compared to the previously used version which implemented
+    POSIX regex support.
+
+    The main change is that both extended (::wxRE_EXTENDED) and advanced
+    (::wxRE_ADVANCED) regex syntax is now the same as PCRE syntax described at
+    https://www.pcre.org/current/doc/html/pcre2syntax.html
+
+    Basic regular expressions (::wxRE_BASIC) are still different, but their
+    use is deprecated and PCRE extensions are still accepted in them, please
+    avoid using them.
+
+    Other changes are:
+
+    - Negated character classes, i.e. @c [^....], now always match newline
+      character, regardless of whether ::wxRE_NEWLINE was used or not. The dot
+      metacharacter still has the same meaning, i.e. it matches newline by
+      default but not when ::wxRE_NEWLINE is specified.
+
+    - Previously POSIX-specified behaviour of handling unmatched right
+      parenthesis @c ')' as a literal character was implemented, but now this
+      is a (regex) compilation error.
+
+    - Empty alternation branches were previously ignored, i.e. matching @c a||b
+      worked the same as matching just @c a|b, but now actually matches an
+      empty string. The new ::wxRE_NOTEMPTY flag can be used to disable empty
+      matches.
+
+    - Using @c \U to embed Unicode code points into the pattern is not
+      supported any more, use the still supported @c \u, followed by exactly
+      four hexadecimal digits, or @c \x, followed by exactly two hexadecimal
+      digits, instead.
+
+    - POSIX collating elements inside square brackets, i.e. @c [.XXX.] and
+      @c [:XXXX:] are not supported by PCRE and result in regex compilation
+      errors.
+
+    - Backslash can be used to escape the character following it even inside
+      square brackets now, while it loses its special meaning in POSIX regexes
+      when it occurs inside square brackets.
+
+    - For completeness, PCRE syntax which previously resulted in errors, e.g.
+      @c "(?:...)" and similar constructs, are now accepted and behave as
+      expected. Other regexes syntactically invalid according to POSIX are are
+      re-interpreted as sequences of literal characters with PCRE, e.g. @c "{1"
+      is just a sequence of two literal characters now, where it previously was
+      a compilation error.
 */
 class wxRegEx
 {
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -37,16 +37,234 @@
 #   include <sys/types.h>
 #endif

-#include <regex.h>
-
 // WXREGEX_USING_BUILTIN    defined when using the built-in regex lib
 // WXREGEX_USING_RE_SEARCH  defined when using re_search in the GNU regex lib
 // WXREGEX_CONVERT_TO_MB    defined when the regex lib is using chars and
 //                          wxChar is wide, so conversion to UTF-8 must be done
+// wxRegChar                the character type used by the regular expression engine
 //
+
+#if wxUSE_PCRE
+    // Use the same code unit width for PCRE as we use for wxString.
+#   if !wxUSE_UNICODE || wxUSE_UNICODE_UTF8
+#       define PCRE2_CODE_UNIT_WIDTH 8
+        typedef char wxRegChar;
+#   elif wxUSE_UNICODE_UTF16
+#       define PCRE2_CODE_UNIT_WIDTH 16
+        typedef wchar_t wxRegChar;
+#   else
+#       define PCRE2_CODE_UNIT_WIDTH 32
+        typedef wchar_t wxRegChar;
+#   endif
+    typedef wxRegChar wxRegErrorChar;
+
+    // We currently always use PCRE as a static library under MSW.
+#   ifdef __WINDOWS__
+#       define PCRE2_STATIC
+#   endif
+
+#   include <pcre2.h>
+
+#   if wxUSE_UNICODE_UTF8
+#       define WXREGEX_CONVERT_TO_MB
+#   endif
+
+#   define WX_NO_REGEX_ADVANCED
+
+// There is an existing pcre2posix library which provides regxxx()
+// implementations, but we don't use it because:
+//
+//  0. The plan is to stop using POSIX API soon anyhow.
+//  1. It's yet another system library to depend on.
+//  2. We can add non-standard "len" parameter to regexec().
+//  3. We want to use PCRE2_ALT_BSUX for compatibility, but we can't
+//     set it using just the POSIX API.
+//
+// So implement these functions ourselves.
+namespace
+{
+
+// Define POSIX constants and structs ourselves too.
+
+#define REG_EXTENDED  0         // Unused, for compatibility only.
+
+#define REG_ICASE     0x0001    // Same as PCRE2_CASELESS.
+#define REG_NEWLINE   0x0002    // Same as PCRE2_MULTILINE.
+#define REG_NOTBOL    0x0004    // Same as PCRE2_NOTBOL.
+#define REG_NOTEOL    0x0008    // Same as PCRE2_NOTEOL.
+#define REG_NOSUB     0x0020    // Don't return matches.
+#define REG_NOTEMPTY  0x0100    // Same as PCRE2_NOTEMPTY.
+
+enum
+{
+    REG_NOERROR = 0,    // Must be 0.
+    REG_NOMATCH,        // Returned from regexec().
+    REG_BADPAT,         // Catch-all error returned from regcomp().
+    REG_ESPACE          // Catch-all errir returned from regexec().
+};
+
+typedef size_t regoff_t;
+
+struct regex_t
+{
+    // This is the only "public" field -- not that it really matters anyhow for
+    // this private struct.
+    size_t re_nsub;
+
+    pcre2_code* code;
+    pcre2_match_data* match_data;
+
+    int errorcode;
+    regoff_t erroroffset;
+};
+
+struct regmatch_t
+{
+    regoff_t rm_so;
+    regoff_t rm_eo;
+};
+
+int wx_regcomp(regex_t* preg, const wxRegChar* pattern, int cflags)
+{
+    // PCRE2_UTF is required in order to handle non-ASCII characters when using
+    // 8-bit version of the library.
+    //
+    // Use PCRE2_ALT_BSUX because we want to handle \uXXXX for compatibility
+    // with the previously used regex library and because it's useful.
+    int options = PCRE2_UTF | PCRE2_ALT_BSUX;
+
+    if ( cflags & REG_ICASE )
+        options |= PCRE2_CASELESS;
+
+    // Default behaviour of the old regex library corresponds to DOTALL i.e.
+    // dot matches any character, but wxRE_NEWLINE enables both MULTILINE (so
+    // that ^/$ match after/before newline in addition to matching at the
+    // start/end of string) and disables the special handling of "\n", i.e. we
+    // must use DOTALL with it.
+    if ( cflags & REG_NEWLINE )
+        options |= PCRE2_MULTILINE;
+    else
+        options |= PCRE2_DOTALL;
+
+    preg->code = pcre2_compile
+                 (
+                    (PCRE2_SPTR)pattern,
+                    PCRE2_ZERO_TERMINATED,
+                    options,
+                    &preg->errorcode,
+                    &preg->erroroffset,
+                    NULL                    // use default context
+                 );
+
+    if ( !preg->code )
+    {
+        // Don't bother translating PCRE error to the most appropriate POSIX
+        // error code, there is no way to do it losslessly and the main thing
+        // that matters is the error message and not the error code anyhow.
+        return REG_BADPAT;
+    }
+
+    preg->match_data = pcre2_match_data_create_from_pattern(preg->code, NULL);
+
+    return REG_NOERROR;
+}
+
+int
+wx_regexec(const regex_t* preg, const wxRegChar* string, size_t len,
+           size_t nmatch, regmatch_t* pmatch, int eflags)
+{
+    int options = 0;
+
+    if ( eflags & REG_NOTBOL )
+        options |= PCRE2_NOTBOL;
+    if ( eflags & REG_NOTEOL )
+        options |= PCRE2_NOTEOL;
+    if ( eflags & REG_NOTEMPTY )
+        options |= PCRE2_NOTEMPTY;
+
+    const int rc = pcre2_match
+                   (
+                        preg->code,
+                        (PCRE2_SPTR)string,
+                        len,
+                        0,                      // start offset
+                        options,
+                        preg->match_data,
+                        NULL                    // use default context
+                   );
+
+    if ( rc == PCRE2_ERROR_NOMATCH )
+        return REG_NOMATCH;
+
+    if ( rc < 0 )
+        return REG_ESPACE;
+
+    // Successful match, fill in pmatch array if necessary.
+    if ( pmatch )
+    {
+        const PCRE2_SIZE* const
+            ovector = pcre2_get_ovector_pointer(preg->match_data);
+
+        const size_t nmatchActual = static_cast<size_t>(rc);
+        for ( size_t n = 0; n < nmatch; ++n )
+        {
+            regmatch_t& m = pmatch[n];
+
+            if ( n < nmatchActual )
+            {
+                m.rm_so = ovector[n*2] == PCRE2_UNSET ? -1 : ovector[n*2];
+                m.rm_eo = ovector[n*2+1] == PCRE2_UNSET ? -1 : ovector[n*2+1];
+            }
+            else
+            {
+                m.rm_so =
+                m.rm_eo = static_cast<regoff_t>(-1);
+            }
+        }
+    }
+
+    return REG_NOERROR;
+}
+
+size_t
+wx_regerror(int errcode, const regex_t* preg, wxRegErrorChar* errbuf, size_t errbuf_size)
+{
+    // We don't use the passed in POSIX error code other than to check that we
+    // do have an error but rely on PCRE error code from regex_t.
+    wxRegErrorChar buffer[256];
+    int len;
+    if ( errcode == REG_NOERROR )
+        len = wxSnprintf(buffer, WXSIZEOF(buffer), "no error");
+    else
+        len = pcre2_get_error_message(preg->errorcode, (PCRE2_UCHAR*)buffer, sizeof(buffer));
+
+    if ( len < 0 )
+        len = wxSnprintf(buffer, WXSIZEOF(buffer), "PCRE error %d", preg->errorcode);
+
+    if ( errbuf && errbuf_size )
+        wxStrlcpy(errbuf, buffer, errbuf_size);
+
+    return len;
+}
+
+void wx_regfree(regex_t* preg)
+{
+    pcre2_match_data_free(preg->match_data);
+    pcre2_code_free(preg->code);
+}
+
+} // anonymous namespace
+
+#else // !wxUSE_PCRE
+
+#include <regex.h>
+typedef char wxRegErrorChar;
 #ifdef __REG_NOFRONT
 #   define WXREGEX_USING_BUILTIN
+    typedef wxChar wxRegChar;
 #else
+    typedef char wxRegChar;
+
 #   ifdef HAVE_RE_SEARCH
 #       define WXREGEX_USING_RE_SEARCH
 #   else
@@ -66,6 +284,8 @@
 #   define wx_regerror regerror
 #endif

+#endif // wxUSE_PCRE/!wxUSE_PCRE
+
 // ----------------------------------------------------------------------------
 // private classes
 // ----------------------------------------------------------------------------
@@ -133,13 +353,6 @@ private:

 #endif // WXREGEX_USING_RE_SEARCH

-// the character type used by the regular expression engine
-#ifndef WXREGEX_CONVERT_TO_MB
-typedef wxChar wxRegChar;
-#else
-typedef char wxRegChar;
-#endif
-
 // the real implementation of wxRegEx
 class wxRegExImpl
 {
@@ -152,7 +365,7 @@ public:
    bool IsValid() const { return m_isCompiled; }

    // RE operations
-    bool Compile(const wxString& expr, int flags = 0);
+    bool Compile(wxString expr, int flags = 0);
    bool Matches(const wxRegChar *str, int flags, size_t len) const;
    bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
    size_t GetMatchCount() const;
@@ -227,11 +440,11 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode) const
    int len = wx_regerror(errorcode, &m_RegEx, NULL, 0);
    if ( len > 0 )
    {
-        wxCharBuffer errbuf(len);
+        wxCharTypeBuffer<wxRegErrorChar> errbuf(len);

        (void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length());

-        szError = wxConvLibc.cMB2WX(errbuf);
+        szError = errbuf;
    }

    if ( szError.empty() ) // regerror() returned 0 or conversion failed
@@ -386,6 +599,16 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre)
                    // as the escaped versions were special in the BRE.
                    disposition = Disposition_Append;
                    break;
+
+                case '<':
+                case '>':
+                    // Map word boundaries extensions to POSIX syntax
+                    // understood by PCRE.
+                    ere += "[[:";
+                    ere += c;
+                    ere += ":]]";
+                    disposition = Disposition_Skip;
+                    break;
            }
        }
        else // This character is not escaped.
@@ -517,11 +740,294 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre)
    return ere;
 }

-bool wxRegExImpl::Compile(const wxString& expr, int flags)
+#if wxUSE_PCRE
+
+// Small helper for converting selected PCRE compilation options to string.
+static wxString PCREOptionsToString(int opts)
+{
+    wxString s;
+
+    if ( opts & PCRE2_CASELESS )
+        s += 'i';
+    if ( opts & PCRE2_MULTILINE )
+        s += 'm';
+    if ( opts & PCRE2_DOTALL )
+        s += 's';
+    if ( opts & PCRE2_EXTENDED )
+        s += 'x';
+
+    return s;
+}
+
+// Convert metasyntax, i.e. directors and embedded options, to PCRE syntax.
+//
+// See TCL re_syntax man page for more details.
+static wxString ConvertMetasyntax(wxString expr, int& flags)
+{
+    // First check for directors that must occur only at the beginning.
+    const int DIRECTOR_PREFIX_LEN = 3;
+    if ( expr.length() > DIRECTOR_PREFIX_LEN && expr.StartsWith("***") )
+    {
+        switch ( expr[DIRECTOR_PREFIX_LEN].GetValue() )
+        {
+            // "***:" director indicates that the regex uses ARE syntax.
+            case ':':
+                flags &= ~wxRE_BASIC;
+                flags |= wxRE_ADVANCED;
+                expr.erase(0, DIRECTOR_PREFIX_LEN + 1);
+                break;
+
+            // "***=" director means that the rest is a literal string.
+            case '=':
+                // We could use PCRE2_LITERAL, but for now just use the "\Q"
+                // escape that should result in the same way -- maybe even less
+                // efficiently, but we probably don't really care about
+                // performance in this very special case.
+                flags &= ~(wxRE_BASIC | wxRE_ADVANCED);
+                expr.replace(0, DIRECTOR_PREFIX_LEN + 1, "\\Q");
+                break;
+
+            default:
+                // This is an invalid director that will result in a compile
+                // error anyhow, so don't bother special-casing it and just
+                // don't do anything to compile it and get an error later.
+                break;
+        }
+    }
+
+    // Then check for the embedded options that may occur at the beginning of
+    // an ARE, but possibly after a director (necessarily the "***:" one).
+    if ( (flags & wxRE_ADVANCED) && expr.StartsWith("(?") )
+    {
+        // String with the options: we use this for the options we don't know
+        // about.
+        wxString optsString;
+
+        // PCRE options to enable or disable.
+        int opts = 0,
+            negopts = 0;
+
+        // (Last) syntax selected by the options.
+        enum Syntax
+        {
+            Syntax_None,
+            Syntax_Basic,
+            Syntax_Extended,
+            Syntax_Literal
+        } syntax = Syntax_None;
+
+        const wxString::iterator end = expr.end();
+        const wxString::iterator start = expr.begin() + 2;
+
+        for ( wxString::iterator it = start; it != end; ++it )
+        {
+            if ( *it == ')' )
+            {
+                optsString += PCREOptionsToString(opts);
+
+                if ( negopts )
+                {
+                    optsString += "-";
+                    optsString += PCREOptionsToString(negopts);
+                }
+
+                size_t posAfterOpts;
+                if ( optsString.empty() )
+                {
+                    expr.erase(expr.begin(), ++it);
+                    posAfterOpts = 0;
+                }
+                else
+                {
+                    expr.replace(start, it, optsString);
+                    posAfterOpts = optsString.length() + 3; // (?opts)
+                }
+
+                // Finally deal with the syntax selection.
+                flags &= ~wxRE_ADVANCED;
+
+                switch ( syntax )
+                {
+                    case Syntax_None:
+                        flags |= wxRE_ADVANCED;
+                        break;
+
+                    case Syntax_Basic:
+                        flags |= wxRE_BASIC;
+                        break;
+
+                    case Syntax_Extended:
+                        flags |= wxRE_EXTENDED;
+                        break;
+
+                    case Syntax_Literal:
+                        // As above, we could also use the LITERAL option, but
+                        // this is simpler.
+                        expr.insert(posAfterOpts, "\\Q");
+                        break;
+                }
+
+                break;
+            }
+
+            // Avoid misinterpreting other constructs (non-capturing groups,
+            // look ahead assertions etc) as options, which always consist in
+            // alphabetic characters only.
+            if ( *it < 'a' || *it > 'z' )
+                break;
+
+            switch ( (*it).GetValue() )
+            {
+                case 'b':
+                    syntax = Syntax_Basic;
+                    break;
+
+                case 'e':
+                    syntax = Syntax_Extended;
+                    break;
+
+                case 'q':
+                    syntax = Syntax_Literal;
+                    break;
+
+                case 'm':
+                case 'n':
+                    // This option corresponds to MULTILINE PCRE option,
+                    // without DOTALL, so enable the former and disable the
+                    // latter.
+                    negopts &= ~PCRE2_MULTILINE;
+                    opts |= PCRE2_MULTILINE;
+                    wxFALLTHROUGH;
+
+                case 'p':
+                    // This option corresponds to the default PCRE behaviour,
+                    // but we use DOTALL by default, so turn it off (this might
+                    // be unnecessary if wxRE_NEWLINE is also used, but it does
+                    // no harm).
+                    negopts |= PCRE2_DOTALL;
+                    break;
+
+                case 'w':
+                    // This option corresponds to using both MULTILINE and
+                    // DOTALL with PCRE.
+                    negopts &= ~(PCRE2_MULTILINE | PCRE2_DOTALL);
+                    opts |= PCRE2_MULTILINE | PCRE2_DOTALL;
+                    break;
+
+                case 'c':
+                    // Disable case-insensitive matching.
+                    negopts |= PCRE2_CASELESS;
+                    break;
+
+                case 't':
+                    // Disable extended syntax.
+                    negopts |= PCRE2_EXTENDED;
+                    break;
+
+                case 's':
+                    // This option reverts to the default behaviour in the old
+                    // regex library or enables DOTALL in PCRE, which is much
+                    // more useful and common, so use it with PCRE meaning.
+                    negopts &= ~PCRE2_DOTALL;
+                    opts |= PCRE2_DOTALL;
+                    break;
+
+                    // These options have the same meaning as in PCRE.
+                case 'i':
+                    negopts &= ~PCRE2_CASELESS;
+                    opts |= PCRE2_CASELESS;
+                    break;
+
+                case 'x':
+                    negopts &= ~PCRE2_EXTENDED;
+                    opts |= PCRE2_EXTENDED;
+                    break;
+
+                default:
+                    // Keep the rest: could be a valid PCRE option or invalid
+                    // option for both libraries, in which case we'll get an
+                    // error, which is what we want.
+                    optsString += *it;
+                    break;
+            }
+        }
+    }
+
+    return expr;
+}
+
+// Convert "advanced" word boundary assertions to the syntax understood by PCRE.
+//
+// These extensions (known as "TCL extensions" because TCL uses the same regex
+// library previous wx versions used) worked before, so preserve them for
+// compatibility.
+//
+// Note that this does not take into account "\<" and "\>" (GNU extensions) as
+// those are only valid when using BREs and so are taken care of above.
+static wxString ConvertWordBoundaries(const wxString& expr)
+{
+    wxString out;
+    out.reserve(expr.length());
+
+    for ( wxString::const_iterator it = expr.begin(),
+                                  end = expr.end();
+          it != end;
+          ++it )
+    {
+        if ( *it == '\\' )
+        {
+            ++it;
+            if ( it == end )
+            {
+                out.append('\\');
+                break;
+            }
+
+            const char* replacement = NULL;
+            switch ( (*it).GetValue() )
+            {
+                case 'm':
+                    replacement = "[[:<:]]";
+                    break;
+
+                case 'M':
+                    replacement = "[[:>:]]";
+                    break;
+
+                case 'y':
+                    replacement = "\\b";
+                    break;
+
+                case 'Y':
+                    replacement = "\\B";
+                    break;
+            }
+
+            if ( replacement )
+            {
+                out.append(replacement);
+
+                continue;
+            }
+
+            out.append('\\');
+        }
+
+        out.append(*it);
+    }
+
+    return out;
+}
+
+#endif // wxUSE_PCRE
+
+bool wxRegExImpl::Compile(wxString expr, int flags)
 {
    Reinit();

-#ifdef WX_NO_REGEX_ADVANCED
+#if wxUSE_PCRE
+#   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
+#elif defined(WX_NO_REGEX_ADVANCED)
 #   define FLAVORS wxRE_BASIC
 #else
 #   define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
@@ -531,6 +1037,23 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
    wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
                  wxT("unrecognized flags in wxRegEx::Compile") );

+#if wxUSE_PCRE
+    // Deal with the directors and embedded options first (this can modify
+    // flags).
+    expr = ConvertMetasyntax(expr, flags);
+
+    // PCRE doesn't support BREs, translate them to EREs.
+    if ( flags & wxRE_BASIC )
+    {
+        expr = wxRegEx::ConvertFromBasic(expr);
+        flags &= ~wxRE_BASIC;
+    }
+    else if ( flags & wxRE_ADVANCED )
+    {
+        expr = ConvertWordBoundaries(expr);
+    }
+#endif // wxUSE_PCRE
+
    // translate our flags to regcomp() ones
    int flagsRE = 0;
    if ( !(flags & wxRE_BASIC) )
@@ -605,7 +1128,11 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
                    // extended syntax. '(?' is used for extensions by perl-
                    // like REs (e.g. advanced), and is not valid for POSIX
                    // extended, so ignore them always.
-                    if ( cptr[1] != wxT('?') )
+                    if ( cptr[1] != wxT('?')
+#if wxUSE_PCRE
+                        && cptr[1] != wxT('*')
+#endif
+                            )
                        m_nMatches++;
                }
            }
@@ -651,7 +1178,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
    wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );

    // translate our flags to regexec() ones
-    wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
+    wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL | wxRE_NOTEMPTY)),
                  wxT("unrecognized flags in wxRegEx::Matches") );

    int flagsRE = 0;
@@ -659,6 +1186,10 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
        flagsRE |= REG_NOTBOL;
    if ( flags & wxRE_NOTEOL )
        flagsRE |= REG_NOTEOL;
+#if wxUSE_PCRE
+    if ( flags & wxRE_NOTEMPTY )
+        flagsRE |= REG_NOTEMPTY;
+#endif // wxUSE_PCRE

    // allocate matches array if needed
    wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
--- a/tests/regex/regextest.cpp
+++ b/tests/regex/regextest.cpp
@@ -159,12 +159,24 @@ bool RegExTestCase::parseFlags(const wxString& flags)

            // we don't fully support these flags, but they don't stop us
            // checking for success of failure of the match, so treat as noop
-            case 'A': case 'B': case 'E': case 'H':
+            case 'A': case 'B': case 'H':
            case 'I': case 'L': case 'M': case 'N':
            case 'P': case 'Q': case 'R': case 'S':
-            case 'T': case 'U': case '%':
+            case 'T': case '%':
                break;

+            // Skip tests checking for backslash inside bracket expressions:
+            // this works completely differently in PCRE where backslash is
+            // special, even inside [], from POSIX.
+            case 'E':
+                return false;
+            // Also skip the (there is only one) test using POSIX-specified
+            // handling of unmatched ')' as a non-special character -- PCRE
+            // doesn't support this and it doesn't seem worth implementing
+            // support for this ourselves neither.
+            case 'U':
+                return false;
+
            // match options
            case '^': m_matchFlags |= wxRE_NOTBOL; break;
            case '$': m_matchFlags |= wxRE_NOTEOL; break;
@@ -199,6 +211,122 @@ void RegExTestCase::runTest()
        return;
    }

+    // Skip, or accommodate, some test cases from the original test suite that
+    // are known not to work with PCRE:
+
+    // Several regexes use syntax which is valid in PCRE and so their
+    // compilation doesn't fail as expected:
+    if (m_mode == 'e') {
+        static const char* validForPCRE[] =
+        {
+            // Non-capturing group.
+            "a(?:b)c",
+
+            // Possessive quantifiers.
+            "a++", "a?+","a*+",
+
+            // Quoting from pcre2pattern(1):
+            //
+            //      An opening curly bracket [...] that does not match the
+            //      syntax of a quantifier, is taken as a literal character.
+            "a{1,2,3}", "a{1", "a{1n}", "a\\{0,1", "a{0,1\\",
+
+            // From the same page:
+            //
+            //      The numbers must be less than 65536
+            //
+            // (rather than 256 limit for POSIX).
+            "a{257}", "a{1000}",
+
+            // Also:
+            //
+            //      If a minus character is required in a class, it must be
+            //      escaped with a backslash or appear in a position where it
+            //      cannot be interpreted as indicating a range, typically as
+            //      the first or last character in the class, or immediately
+            //      after a range.
+            //
+            // (while POSIX wants the last case to be an error).
+            "a[a-b-c]",
+
+            // PCRE allows quantifiers after word boundary assertions, so skip
+            // the tests checking that using them results in an error.
+            "[[:<:]]*", "[[:>:]]*", "\\<*", "\\>*", "\\y*", "\\Y*",
+
+            // PCRE only interprets "\x" and "\u" specially when they're
+            // followed by exactly 2 or 4 hexadecimal digits and just lets them
+            // match "x" or "u" otherwise, instead of giving an error.
+            "a\\xq", "a\\u008x",
+
+            // And "\U" always just matches "U", PCRE doesn't support it as
+            // Unicode escape at all (even with PCRE2_EXTRA_ALT_BSUX).
+            "a\\U0000008x",
+
+            // "\z" is the "end of string" assertion and not an error in PCRE.
+            "a\\z",
+
+            // Recursive backreferences are explicitly allowed in PCRE.
+            "a((b)\\1)",
+
+            // Backreferences with index greater than 8 are interpreted as
+            // octal escapes, unfortunately.
+            "a((((((((((b\\10))))))))))c", "a\\12b",
+        };
+
+        for (size_t n = 0; n < WXSIZEOF(validForPCRE); ++n) {
+            if (m_pattern == validForPCRE[n])
+                return;
+        }
+    }
+
+    if (m_mode == 'm') {
+        // PCRE doesn't support POSIX collating elements, so we have to skip
+        // those too.
+        if (m_pattern.find("[.") != wxString::npos || m_pattern.find("[:") != wxString::npos)
+            return;
+
+        // "\b" is a word boundary assertion in PCRE and so is "\B", so the
+        // tests relying on them being escapes for ASCII backspace and
+        // backslash respectively must be skipped.
+        if (m_pattern.find("\\b") != wxString::npos || m_pattern.find("\\B") != wxString::npos)
+            return;
+
+        // As explained above, "\U" is not supported by PCRE, only "\u" is.
+        if (m_pattern == "a\\U00000008x")
+            m_pattern = "a\\u0008x";
+        // And "\x" is supported only when followed by 2 digits, not 4.
+        else if (m_pattern == "a\\x0008x")
+            m_pattern = "a\\x08x";
+
+        // "\12" can be a backreferences or an octal escape in PCRE, but never
+        // literal "12" as this test expects it to be.
+        if (m_pattern == "a\\12b")
+            return;
+
+        // Switching to "extended" mode is supposed to turn off "\W"
+        // interpretation, but it doesn't work with PCRE.
+        if (m_pattern == "(?e)\\W+")
+            return;
+
+        // None of the tests in "tricky cases" section passes with PCRE. It's
+        // not really clear if PCRE is wrong or the original test suite was or
+        // even if these regexes are ambiguous, but for now explicitly anchor
+        // them at the end to force them to pass even with PCRE, as without it
+        // they would match less than expected.
+        if (m_pattern == "(week|wee)(night|knights)" ||
+            m_pattern == "a(bc*).*\\1" ||
+            m_pattern == "a(b.[bc]*)+")
+            m_pattern += '$';
+    }
+
+    // This test uses an empty alternative branch: in POSIX, this is ignored,
+    // while with PCRE it matches an empty string and we must set NOTEMPTY flag
+    // explicitly to disable this.
+    if (m_pattern == "a||b" && m_flags == "NS" ) {
+        m_matchFlags |= wxRE_NOTEMPTY;
+    }
+
+
    // Provide more information about the test case if it fails.
    wxString str;
    wxArrayString::const_iterator it;
@@ -285,6 +413,21 @@ void RegExTestCase::doTest(int flavor)
        // i - check the match returns the offsets given
        else if (m_mode == 'i')
        {
+#if wxUSE_UNICODE_UTF8
+            // Values returned by GetMatch() are indices into UTF-8 string, but
+            // the values expected by the test are indices in a UTF-16 or -32
+            // string, so convert them. Note that the indices are correct, as
+            // using substr(start, len) must return the match itself, it's just
+            // that they differ when using UTF-8 internally.
+            if ( start < INT_MAX )
+            {
+                if ( start + len > 0 )
+                    len = m_data.substr(start, len).wc_str().length();
+
+                start = m_data.substr(0, start).wc_str().length();
+            }
+#endif // wxUSE_UNICODE_UTF8
+
            if (start > INT_MAX)
                result = wxT("-1 -1");
            else if (start + len > 0)
--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -59,7 +59,7 @@ TEST_CASE("wxRegEx::Compile", "[regex][compile]")
    CHECK_FALSE( re.Compile("foo[") );
    CHECK_FALSE( re.Compile("foo[bar") );
    CHECK      ( re.Compile("foo[bar]") );
-    CHECK_FALSE( re.Compile("foo{1") );
+    // Not invalid for PCRE: CHECK_FALSE( re.Compile("foo{1") );
    CHECK      ( re.Compile("foo{1}") );
    CHECK      ( re.Compile("foo{1,2}") );
    CHECK      ( re.Compile("foo*") );
@@ -184,4 +184,20 @@ TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
 }

+#ifdef wxHAS_REGEX_ADVANCED
+
+TEST_CASE("wxRegEx::Unicode", "[regex][unicode]")
+{
+    const wxString cyrillicCapitalA(L"\u0410");
+    const wxString cyrillicSmallA(L"\u0430");
+
+    wxRegEx re(cyrillicCapitalA, wxRE_ICASE);
+    REQUIRE( re.IsValid() );
+
+    REQUIRE( re.Matches(cyrillicSmallA) );
+    CHECK( re.GetMatch(cyrillicSmallA) == cyrillicSmallA );
+}
+
+#endif // wxHAS_REGEX_ADVANCED
+
 #endif // wxUSE_REGEX