From fa59d5700a4a2e56eb29651255bb6974e4e4bb2f Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sat, 17 Jul 2021 17:00:19 +0200 Subject: [PATCH] Implement wxRegEx using PCRE Adjust the tests and document the incompatibilities with the previously used regex syntax. In this commit the use of PCRE is conditional on wxUSE_PCRE which is never defined as 1 yet, so the new code is still disabled. --- include/wx/regex.h | 5 +- interface/wx/regex.h | 118 ++++++-- src/common/regex.cpp | 563 +++++++++++++++++++++++++++++++++++- tests/regex/regextest.cpp | 147 +++++++++- tests/regex/wxregextest.cpp | 18 +- 5 files changed, 809 insertions(+), 42 deletions(-) diff --git a/include/wx/regex.h b/include/wx/regex.h index dc3b34c4a5..79676de88f 100644 --- a/include/wx/regex.h +++ b/include/wx/regex.h @@ -60,7 +60,10 @@ enum wxRE_NOTBOL = 32, // '$' doesn't match at the end of line - wxRE_NOTEOL = 64 + wxRE_NOTEOL = 64, + + // don't accept empty string as valid match, try alternatives or fail + wxRE_NOTEMPTY = 128 }; // ---------------------------------------------------------------------------- diff --git a/interface/wx/regex.h b/interface/wx/regex.h index cc3dc92acf..711cb3d673 100644 --- a/interface/wx/regex.h +++ b/interface/wx/regex.h @@ -12,13 +12,31 @@ */ enum { - /** Use extended regex syntax. */ + /** + Use extended regex syntax. + + This is the default and doesn't need to be specified. + */ wxRE_EXTENDED = 0, - /** Use advanced RE syntax (built-in regex only). */ + /** + Use advanced regex syntax. + + This flag is synonym for wxRE_EXTENDED and doesn't need to be specified + as this is the default syntax. + */ wxRE_ADVANCED = 1, - /** Use basic RE syntax. */ + /** + Use basic regex syntax. + + Use basic regular expression syntax, close to its POSIX definition, + but with some extensions still available. + + The word start/end boundary assertions @c "\<" and @c "\>" are only + available when using basic syntax, use @c "[[:<:]] and @c "[[:>:]]" or + just more general word boundary assertion @c "\b" when not using it. + */ wxRE_BASIC = 2, /** Ignore case in match. */ @@ -51,7 +69,19 @@ enum wxRE_NOTBOL = 32, /** '$' doesn't match at the end of line. */ - wxRE_NOTEOL = 64 + wxRE_NOTEOL = 64, + + /** + Don't accept empty string as a valid match. + + If the regex matches an empty string, try alternatives, if there are + any, or fail. + + This flag is not supported if PCRE support is turned off. + + @since 3.1.6 + */ + wxRE_NOTEMPTY = 128 }; /** @@ -60,26 +90,19 @@ enum wxRegEx represents a regular expression. This class provides support for regular expressions matching and also replacement. - It is built on top of either the system library (if it has support - for POSIX regular expressions - which is the case of the most modern - Unices) or uses the built in Henry Spencer's library. Henry Spencer - would appreciate being given credit in the documentation of software - which uses his library, but that is not a requirement. + In wxWidgets 3.1.6 or later, it is built on top of PCRE library + (https://www.pcre.org/). In the previous versions of wxWidgets, this class + uses Henry Spencer's library and behaved slightly differently, see below + for the discussion of the changes if you're upgrading from an older + version. - Regular expressions, as defined by POSIX, come in two flavours: @e extended - and @e basic. The builtin library also adds a third flavour - of expression @ref overview_resyntax "advanced", which is not available - when using the system library. + Note that while C++11 and later provides @c std::regex and related classes, + this class is still useful as it provides the following important + advantages: - Unicode is fully supported only when using the builtin library. - When using the system library in Unicode mode, the expressions and data - are translated to the default 8-bit encoding before being passed to - the library. - - On platforms where a system library is available, the default is to use - the builtin library for Unicode builds, and the system library otherwise. - It is possible to use the other if preferred by selecting it when building - the wxWidgets. + - Support for richer regular expressions syntax. + - Much better performance in many common cases, by a factor of 10-100. + - Consistent behaviour, including performance, on all platforms. @library{wxbase} @category{data} @@ -118,6 +141,57 @@ enum std::cout << "text now contains " << count << " hidden addresses" << std::endl; std::cout << originalText << std::endl; @endcode + + + @section regex_pcre_changes Changes in the PCRE-based version + + This section describes the difference in regex syntax in the new PCRE-based + wxRegEx version compared to the previously used version which implemented + POSIX regex support. + + The main change is that both extended (::wxRE_EXTENDED) and advanced + (::wxRE_ADVANCED) regex syntax is now the same as PCRE syntax described at + https://www.pcre.org/current/doc/html/pcre2syntax.html + + Basic regular expressions (::wxRE_BASIC) are still different, but their + use is deprecated and PCRE extensions are still accepted in them, please + avoid using them. + + Other changes are: + + - Negated character classes, i.e. @c [^....], now always match newline + character, regardless of whether ::wxRE_NEWLINE was used or not. The dot + metacharacter still has the same meaning, i.e. it matches newline by + default but not when ::wxRE_NEWLINE is specified. + + - Previously POSIX-specified behaviour of handling unmatched right + parenthesis @c ')' as a literal character was implemented, but now this + is a (regex) compilation error. + + - Empty alternation branches were previously ignored, i.e. matching @c a||b + worked the same as matching just @c a|b, but now actually matches an + empty string. The new ::wxRE_NOTEMPTY flag can be used to disable empty + matches. + + - Using @c \U to embed Unicode code points into the pattern is not + supported any more, use the still supported @c \u, followed by exactly + four hexadecimal digits, or @c \x, followed by exactly two hexadecimal + digits, instead. + + - POSIX collating elements inside square brackets, i.e. @c [.XXX.] and + @c [:XXXX:] are not supported by PCRE and result in regex compilation + errors. + + - Backslash can be used to escape the character following it even inside + square brackets now, while it loses its special meaning in POSIX regexes + when it occurs inside square brackets. + + - For completeness, PCRE syntax which previously resulted in errors, e.g. + @c "(?:...)" and similar constructs, are now accepted and behave as + expected. Other regexes syntactically invalid according to POSIX are are + re-interpreted as sequences of literal characters with PCRE, e.g. @c "{1" + is just a sequence of two literal characters now, where it previously was + a compilation error. */ class wxRegEx { diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 77b7d746ba..2c5f1816e7 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -37,16 +37,234 @@ # include #endif -#include - // WXREGEX_USING_BUILTIN defined when using the built-in regex lib // WXREGEX_USING_RE_SEARCH defined when using re_search in the GNU regex lib // WXREGEX_CONVERT_TO_MB defined when the regex lib is using chars and // wxChar is wide, so conversion to UTF-8 must be done +// wxRegChar the character type used by the regular expression engine // + +#if wxUSE_PCRE + // Use the same code unit width for PCRE as we use for wxString. +# if !wxUSE_UNICODE || wxUSE_UNICODE_UTF8 +# define PCRE2_CODE_UNIT_WIDTH 8 + typedef char wxRegChar; +# elif wxUSE_UNICODE_UTF16 +# define PCRE2_CODE_UNIT_WIDTH 16 + typedef wchar_t wxRegChar; +# else +# define PCRE2_CODE_UNIT_WIDTH 32 + typedef wchar_t wxRegChar; +# endif + typedef wxRegChar wxRegErrorChar; + + // We currently always use PCRE as a static library under MSW. +# ifdef __WINDOWS__ +# define PCRE2_STATIC +# endif + +# include + +# if wxUSE_UNICODE_UTF8 +# define WXREGEX_CONVERT_TO_MB +# endif + +# define WX_NO_REGEX_ADVANCED + +// There is an existing pcre2posix library which provides regxxx() +// implementations, but we don't use it because: +// +// 0. The plan is to stop using POSIX API soon anyhow. +// 1. It's yet another system library to depend on. +// 2. We can add non-standard "len" parameter to regexec(). +// 3. We want to use PCRE2_ALT_BSUX for compatibility, but we can't +// set it using just the POSIX API. +// +// So implement these functions ourselves. +namespace +{ + +// Define POSIX constants and structs ourselves too. + +#define REG_EXTENDED 0 // Unused, for compatibility only. + +#define REG_ICASE 0x0001 // Same as PCRE2_CASELESS. +#define REG_NEWLINE 0x0002 // Same as PCRE2_MULTILINE. +#define REG_NOTBOL 0x0004 // Same as PCRE2_NOTBOL. +#define REG_NOTEOL 0x0008 // Same as PCRE2_NOTEOL. +#define REG_NOSUB 0x0020 // Don't return matches. +#define REG_NOTEMPTY 0x0100 // Same as PCRE2_NOTEMPTY. + +enum +{ + REG_NOERROR = 0, // Must be 0. + REG_NOMATCH, // Returned from regexec(). + REG_BADPAT, // Catch-all error returned from regcomp(). + REG_ESPACE // Catch-all errir returned from regexec(). +}; + +typedef size_t regoff_t; + +struct regex_t +{ + // This is the only "public" field -- not that it really matters anyhow for + // this private struct. + size_t re_nsub; + + pcre2_code* code; + pcre2_match_data* match_data; + + int errorcode; + regoff_t erroroffset; +}; + +struct regmatch_t +{ + regoff_t rm_so; + regoff_t rm_eo; +}; + +int wx_regcomp(regex_t* preg, const wxRegChar* pattern, int cflags) +{ + // PCRE2_UTF is required in order to handle non-ASCII characters when using + // 8-bit version of the library. + // + // Use PCRE2_ALT_BSUX because we want to handle \uXXXX for compatibility + // with the previously used regex library and because it's useful. + int options = PCRE2_UTF | PCRE2_ALT_BSUX; + + if ( cflags & REG_ICASE ) + options |= PCRE2_CASELESS; + + // Default behaviour of the old regex library corresponds to DOTALL i.e. + // dot matches any character, but wxRE_NEWLINE enables both MULTILINE (so + // that ^/$ match after/before newline in addition to matching at the + // start/end of string) and disables the special handling of "\n", i.e. we + // must use DOTALL with it. + if ( cflags & REG_NEWLINE ) + options |= PCRE2_MULTILINE; + else + options |= PCRE2_DOTALL; + + preg->code = pcre2_compile + ( + (PCRE2_SPTR)pattern, + PCRE2_ZERO_TERMINATED, + options, + &preg->errorcode, + &preg->erroroffset, + NULL // use default context + ); + + if ( !preg->code ) + { + // Don't bother translating PCRE error to the most appropriate POSIX + // error code, there is no way to do it losslessly and the main thing + // that matters is the error message and not the error code anyhow. + return REG_BADPAT; + } + + preg->match_data = pcre2_match_data_create_from_pattern(preg->code, NULL); + + return REG_NOERROR; +} + +int +wx_regexec(const regex_t* preg, const wxRegChar* string, size_t len, + size_t nmatch, regmatch_t* pmatch, int eflags) +{ + int options = 0; + + if ( eflags & REG_NOTBOL ) + options |= PCRE2_NOTBOL; + if ( eflags & REG_NOTEOL ) + options |= PCRE2_NOTEOL; + if ( eflags & REG_NOTEMPTY ) + options |= PCRE2_NOTEMPTY; + + const int rc = pcre2_match + ( + preg->code, + (PCRE2_SPTR)string, + len, + 0, // start offset + options, + preg->match_data, + NULL // use default context + ); + + if ( rc == PCRE2_ERROR_NOMATCH ) + return REG_NOMATCH; + + if ( rc < 0 ) + return REG_ESPACE; + + // Successful match, fill in pmatch array if necessary. + if ( pmatch ) + { + const PCRE2_SIZE* const + ovector = pcre2_get_ovector_pointer(preg->match_data); + + const size_t nmatchActual = static_cast(rc); + for ( size_t n = 0; n < nmatch; ++n ) + { + regmatch_t& m = pmatch[n]; + + if ( n < nmatchActual ) + { + m.rm_so = ovector[n*2] == PCRE2_UNSET ? -1 : ovector[n*2]; + m.rm_eo = ovector[n*2+1] == PCRE2_UNSET ? -1 : ovector[n*2+1]; + } + else + { + m.rm_so = + m.rm_eo = static_cast(-1); + } + } + } + + return REG_NOERROR; +} + +size_t +wx_regerror(int errcode, const regex_t* preg, wxRegErrorChar* errbuf, size_t errbuf_size) +{ + // We don't use the passed in POSIX error code other than to check that we + // do have an error but rely on PCRE error code from regex_t. + wxRegErrorChar buffer[256]; + int len; + if ( errcode == REG_NOERROR ) + len = wxSnprintf(buffer, WXSIZEOF(buffer), "no error"); + else + len = pcre2_get_error_message(preg->errorcode, (PCRE2_UCHAR*)buffer, sizeof(buffer)); + + if ( len < 0 ) + len = wxSnprintf(buffer, WXSIZEOF(buffer), "PCRE error %d", preg->errorcode); + + if ( errbuf && errbuf_size ) + wxStrlcpy(errbuf, buffer, errbuf_size); + + return len; +} + +void wx_regfree(regex_t* preg) +{ + pcre2_match_data_free(preg->match_data); + pcre2_code_free(preg->code); +} + +} // anonymous namespace + +#else // !wxUSE_PCRE + +#include +typedef char wxRegErrorChar; #ifdef __REG_NOFRONT # define WXREGEX_USING_BUILTIN + typedef wxChar wxRegChar; #else + typedef char wxRegChar; + # ifdef HAVE_RE_SEARCH # define WXREGEX_USING_RE_SEARCH # else @@ -66,6 +284,8 @@ # define wx_regerror regerror #endif +#endif // wxUSE_PCRE/!wxUSE_PCRE + // ---------------------------------------------------------------------------- // private classes // ---------------------------------------------------------------------------- @@ -133,13 +353,6 @@ private: #endif // WXREGEX_USING_RE_SEARCH -// the character type used by the regular expression engine -#ifndef WXREGEX_CONVERT_TO_MB -typedef wxChar wxRegChar; -#else -typedef char wxRegChar; -#endif - // the real implementation of wxRegEx class wxRegExImpl { @@ -152,7 +365,7 @@ public: bool IsValid() const { return m_isCompiled; } // RE operations - bool Compile(const wxString& expr, int flags = 0); + bool Compile(wxString expr, int flags = 0); bool Matches(const wxRegChar *str, int flags, size_t len) const; bool GetMatch(size_t *start, size_t *len, size_t index = 0) const; size_t GetMatchCount() const; @@ -227,11 +440,11 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode) const int len = wx_regerror(errorcode, &m_RegEx, NULL, 0); if ( len > 0 ) { - wxCharBuffer errbuf(len); + wxCharTypeBuffer errbuf(len); (void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length()); - szError = wxConvLibc.cMB2WX(errbuf); + szError = errbuf; } if ( szError.empty() ) // regerror() returned 0 or conversion failed @@ -386,6 +599,16 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre) // as the escaped versions were special in the BRE. disposition = Disposition_Append; break; + + case '<': + case '>': + // Map word boundaries extensions to POSIX syntax + // understood by PCRE. + ere += "[[:"; + ere += c; + ere += ":]]"; + disposition = Disposition_Skip; + break; } } else // This character is not escaped. @@ -517,11 +740,294 @@ wxString wxRegEx::ConvertFromBasic(const wxString& bre) return ere; } -bool wxRegExImpl::Compile(const wxString& expr, int flags) +#if wxUSE_PCRE + +// Small helper for converting selected PCRE compilation options to string. +static wxString PCREOptionsToString(int opts) +{ + wxString s; + + if ( opts & PCRE2_CASELESS ) + s += 'i'; + if ( opts & PCRE2_MULTILINE ) + s += 'm'; + if ( opts & PCRE2_DOTALL ) + s += 's'; + if ( opts & PCRE2_EXTENDED ) + s += 'x'; + + return s; +} + +// Convert metasyntax, i.e. directors and embedded options, to PCRE syntax. +// +// See TCL re_syntax man page for more details. +static wxString ConvertMetasyntax(wxString expr, int& flags) +{ + // First check for directors that must occur only at the beginning. + const int DIRECTOR_PREFIX_LEN = 3; + if ( expr.length() > DIRECTOR_PREFIX_LEN && expr.StartsWith("***") ) + { + switch ( expr[DIRECTOR_PREFIX_LEN].GetValue() ) + { + // "***:" director indicates that the regex uses ARE syntax. + case ':': + flags &= ~wxRE_BASIC; + flags |= wxRE_ADVANCED; + expr.erase(0, DIRECTOR_PREFIX_LEN + 1); + break; + + // "***=" director means that the rest is a literal string. + case '=': + // We could use PCRE2_LITERAL, but for now just use the "\Q" + // escape that should result in the same way -- maybe even less + // efficiently, but we probably don't really care about + // performance in this very special case. + flags &= ~(wxRE_BASIC | wxRE_ADVANCED); + expr.replace(0, DIRECTOR_PREFIX_LEN + 1, "\\Q"); + break; + + default: + // This is an invalid director that will result in a compile + // error anyhow, so don't bother special-casing it and just + // don't do anything to compile it and get an error later. + break; + } + } + + // Then check for the embedded options that may occur at the beginning of + // an ARE, but possibly after a director (necessarily the "***:" one). + if ( (flags & wxRE_ADVANCED) && expr.StartsWith("(?") ) + { + // String with the options: we use this for the options we don't know + // about. + wxString optsString; + + // PCRE options to enable or disable. + int opts = 0, + negopts = 0; + + // (Last) syntax selected by the options. + enum Syntax + { + Syntax_None, + Syntax_Basic, + Syntax_Extended, + Syntax_Literal + } syntax = Syntax_None; + + const wxString::iterator end = expr.end(); + const wxString::iterator start = expr.begin() + 2; + + for ( wxString::iterator it = start; it != end; ++it ) + { + if ( *it == ')' ) + { + optsString += PCREOptionsToString(opts); + + if ( negopts ) + { + optsString += "-"; + optsString += PCREOptionsToString(negopts); + } + + size_t posAfterOpts; + if ( optsString.empty() ) + { + expr.erase(expr.begin(), ++it); + posAfterOpts = 0; + } + else + { + expr.replace(start, it, optsString); + posAfterOpts = optsString.length() + 3; // (?opts) + } + + // Finally deal with the syntax selection. + flags &= ~wxRE_ADVANCED; + + switch ( syntax ) + { + case Syntax_None: + flags |= wxRE_ADVANCED; + break; + + case Syntax_Basic: + flags |= wxRE_BASIC; + break; + + case Syntax_Extended: + flags |= wxRE_EXTENDED; + break; + + case Syntax_Literal: + // As above, we could also use the LITERAL option, but + // this is simpler. + expr.insert(posAfterOpts, "\\Q"); + break; + } + + break; + } + + // Avoid misinterpreting other constructs (non-capturing groups, + // look ahead assertions etc) as options, which always consist in + // alphabetic characters only. + if ( *it < 'a' || *it > 'z' ) + break; + + switch ( (*it).GetValue() ) + { + case 'b': + syntax = Syntax_Basic; + break; + + case 'e': + syntax = Syntax_Extended; + break; + + case 'q': + syntax = Syntax_Literal; + break; + + case 'm': + case 'n': + // This option corresponds to MULTILINE PCRE option, + // without DOTALL, so enable the former and disable the + // latter. + negopts &= ~PCRE2_MULTILINE; + opts |= PCRE2_MULTILINE; + wxFALLTHROUGH; + + case 'p': + // This option corresponds to the default PCRE behaviour, + // but we use DOTALL by default, so turn it off (this might + // be unnecessary if wxRE_NEWLINE is also used, but it does + // no harm). + negopts |= PCRE2_DOTALL; + break; + + case 'w': + // This option corresponds to using both MULTILINE and + // DOTALL with PCRE. + negopts &= ~(PCRE2_MULTILINE | PCRE2_DOTALL); + opts |= PCRE2_MULTILINE | PCRE2_DOTALL; + break; + + case 'c': + // Disable case-insensitive matching. + negopts |= PCRE2_CASELESS; + break; + + case 't': + // Disable extended syntax. + negopts |= PCRE2_EXTENDED; + break; + + case 's': + // This option reverts to the default behaviour in the old + // regex library or enables DOTALL in PCRE, which is much + // more useful and common, so use it with PCRE meaning. + negopts &= ~PCRE2_DOTALL; + opts |= PCRE2_DOTALL; + break; + + // These options have the same meaning as in PCRE. + case 'i': + negopts &= ~PCRE2_CASELESS; + opts |= PCRE2_CASELESS; + break; + + case 'x': + negopts &= ~PCRE2_EXTENDED; + opts |= PCRE2_EXTENDED; + break; + + default: + // Keep the rest: could be a valid PCRE option or invalid + // option for both libraries, in which case we'll get an + // error, which is what we want. + optsString += *it; + break; + } + } + } + + return expr; +} + +// Convert "advanced" word boundary assertions to the syntax understood by PCRE. +// +// These extensions (known as "TCL extensions" because TCL uses the same regex +// library previous wx versions used) worked before, so preserve them for +// compatibility. +// +// Note that this does not take into account "\<" and "\>" (GNU extensions) as +// those are only valid when using BREs and so are taken care of above. +static wxString ConvertWordBoundaries(const wxString& expr) +{ + wxString out; + out.reserve(expr.length()); + + for ( wxString::const_iterator it = expr.begin(), + end = expr.end(); + it != end; + ++it ) + { + if ( *it == '\\' ) + { + ++it; + if ( it == end ) + { + out.append('\\'); + break; + } + + const char* replacement = NULL; + switch ( (*it).GetValue() ) + { + case 'm': + replacement = "[[:<:]]"; + break; + + case 'M': + replacement = "[[:>:]]"; + break; + + case 'y': + replacement = "\\b"; + break; + + case 'Y': + replacement = "\\B"; + break; + } + + if ( replacement ) + { + out.append(replacement); + + continue; + } + + out.append('\\'); + } + + out.append(*it); + } + + return out; +} + +#endif // wxUSE_PCRE + +bool wxRegExImpl::Compile(wxString expr, int flags) { Reinit(); -#ifdef WX_NO_REGEX_ADVANCED +#if wxUSE_PCRE +# define FLAVORS (wxRE_ADVANCED | wxRE_BASIC) +#elif defined(WX_NO_REGEX_ADVANCED) # define FLAVORS wxRE_BASIC #else # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC) @@ -531,6 +1037,23 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags) wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)), wxT("unrecognized flags in wxRegEx::Compile") ); +#if wxUSE_PCRE + // Deal with the directors and embedded options first (this can modify + // flags). + expr = ConvertMetasyntax(expr, flags); + + // PCRE doesn't support BREs, translate them to EREs. + if ( flags & wxRE_BASIC ) + { + expr = wxRegEx::ConvertFromBasic(expr); + flags &= ~wxRE_BASIC; + } + else if ( flags & wxRE_ADVANCED ) + { + expr = ConvertWordBoundaries(expr); + } +#endif // wxUSE_PCRE + // translate our flags to regcomp() ones int flagsRE = 0; if ( !(flags & wxRE_BASIC) ) @@ -605,7 +1128,11 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags) // extended syntax. '(?' is used for extensions by perl- // like REs (e.g. advanced), and is not valid for POSIX // extended, so ignore them always. - if ( cptr[1] != wxT('?') ) + if ( cptr[1] != wxT('?') +#if wxUSE_PCRE + && cptr[1] != wxT('*') +#endif + ) m_nMatches++; } } @@ -651,7 +1178,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str, wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); // translate our flags to regexec() ones - wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)), + wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL | wxRE_NOTEMPTY)), wxT("unrecognized flags in wxRegEx::Matches") ); int flagsRE = 0; @@ -659,6 +1186,10 @@ bool wxRegExImpl::Matches(const wxRegChar *str, flagsRE |= REG_NOTBOL; if ( flags & wxRE_NOTEOL ) flagsRE |= REG_NOTEOL; +#if wxUSE_PCRE + if ( flags & wxRE_NOTEMPTY ) + flagsRE |= REG_NOTEMPTY; +#endif // wxUSE_PCRE // allocate matches array if needed wxRegExImpl *self = wxConstCast(this, wxRegExImpl); diff --git a/tests/regex/regextest.cpp b/tests/regex/regextest.cpp index 4902c852d1..1178341c95 100644 --- a/tests/regex/regextest.cpp +++ b/tests/regex/regextest.cpp @@ -159,12 +159,24 @@ bool RegExTestCase::parseFlags(const wxString& flags) // we don't fully support these flags, but they don't stop us // checking for success of failure of the match, so treat as noop - case 'A': case 'B': case 'E': case 'H': + case 'A': case 'B': case 'H': case 'I': case 'L': case 'M': case 'N': case 'P': case 'Q': case 'R': case 'S': - case 'T': case 'U': case '%': + case 'T': case '%': break; + // Skip tests checking for backslash inside bracket expressions: + // this works completely differently in PCRE where backslash is + // special, even inside [], from POSIX. + case 'E': + return false; + // Also skip the (there is only one) test using POSIX-specified + // handling of unmatched ')' as a non-special character -- PCRE + // doesn't support this and it doesn't seem worth implementing + // support for this ourselves neither. + case 'U': + return false; + // match options case '^': m_matchFlags |= wxRE_NOTBOL; break; case '$': m_matchFlags |= wxRE_NOTEOL; break; @@ -199,6 +211,122 @@ void RegExTestCase::runTest() return; } + // Skip, or accommodate, some test cases from the original test suite that + // are known not to work with PCRE: + + // Several regexes use syntax which is valid in PCRE and so their + // compilation doesn't fail as expected: + if (m_mode == 'e') { + static const char* validForPCRE[] = + { + // Non-capturing group. + "a(?:b)c", + + // Possessive quantifiers. + "a++", "a?+","a*+", + + // Quoting from pcre2pattern(1): + // + // An opening curly bracket [...] that does not match the + // syntax of a quantifier, is taken as a literal character. + "a{1,2,3}", "a{1", "a{1n}", "a\\{0,1", "a{0,1\\", + + // From the same page: + // + // The numbers must be less than 65536 + // + // (rather than 256 limit for POSIX). + "a{257}", "a{1000}", + + // Also: + // + // If a minus character is required in a class, it must be + // escaped with a backslash or appear in a position where it + // cannot be interpreted as indicating a range, typically as + // the first or last character in the class, or immediately + // after a range. + // + // (while POSIX wants the last case to be an error). + "a[a-b-c]", + + // PCRE allows quantifiers after word boundary assertions, so skip + // the tests checking that using them results in an error. + "[[:<:]]*", "[[:>:]]*", "\\<*", "\\>*", "\\y*", "\\Y*", + + // PCRE only interprets "\x" and "\u" specially when they're + // followed by exactly 2 or 4 hexadecimal digits and just lets them + // match "x" or "u" otherwise, instead of giving an error. + "a\\xq", "a\\u008x", + + // And "\U" always just matches "U", PCRE doesn't support it as + // Unicode escape at all (even with PCRE2_EXTRA_ALT_BSUX). + "a\\U0000008x", + + // "\z" is the "end of string" assertion and not an error in PCRE. + "a\\z", + + // Recursive backreferences are explicitly allowed in PCRE. + "a((b)\\1)", + + // Backreferences with index greater than 8 are interpreted as + // octal escapes, unfortunately. + "a((((((((((b\\10))))))))))c", "a\\12b", + }; + + for (size_t n = 0; n < WXSIZEOF(validForPCRE); ++n) { + if (m_pattern == validForPCRE[n]) + return; + } + } + + if (m_mode == 'm') { + // PCRE doesn't support POSIX collating elements, so we have to skip + // those too. + if (m_pattern.find("[.") != wxString::npos || m_pattern.find("[:") != wxString::npos) + return; + + // "\b" is a word boundary assertion in PCRE and so is "\B", so the + // tests relying on them being escapes for ASCII backspace and + // backslash respectively must be skipped. + if (m_pattern.find("\\b") != wxString::npos || m_pattern.find("\\B") != wxString::npos) + return; + + // As explained above, "\U" is not supported by PCRE, only "\u" is. + if (m_pattern == "a\\U00000008x") + m_pattern = "a\\u0008x"; + // And "\x" is supported only when followed by 2 digits, not 4. + else if (m_pattern == "a\\x0008x") + m_pattern = "a\\x08x"; + + // "\12" can be a backreferences or an octal escape in PCRE, but never + // literal "12" as this test expects it to be. + if (m_pattern == "a\\12b") + return; + + // Switching to "extended" mode is supposed to turn off "\W" + // interpretation, but it doesn't work with PCRE. + if (m_pattern == "(?e)\\W+") + return; + + // None of the tests in "tricky cases" section passes with PCRE. It's + // not really clear if PCRE is wrong or the original test suite was or + // even if these regexes are ambiguous, but for now explicitly anchor + // them at the end to force them to pass even with PCRE, as without it + // they would match less than expected. + if (m_pattern == "(week|wee)(night|knights)" || + m_pattern == "a(bc*).*\\1" || + m_pattern == "a(b.[bc]*)+") + m_pattern += '$'; + } + + // This test uses an empty alternative branch: in POSIX, this is ignored, + // while with PCRE it matches an empty string and we must set NOTEMPTY flag + // explicitly to disable this. + if (m_pattern == "a||b" && m_flags == "NS" ) { + m_matchFlags |= wxRE_NOTEMPTY; + } + + // Provide more information about the test case if it fails. wxString str; wxArrayString::const_iterator it; @@ -285,6 +413,21 @@ void RegExTestCase::doTest(int flavor) // i - check the match returns the offsets given else if (m_mode == 'i') { +#if wxUSE_UNICODE_UTF8 + // Values returned by GetMatch() are indices into UTF-8 string, but + // the values expected by the test are indices in a UTF-16 or -32 + // string, so convert them. Note that the indices are correct, as + // using substr(start, len) must return the match itself, it's just + // that they differ when using UTF-8 internally. + if ( start < INT_MAX ) + { + if ( start + len > 0 ) + len = m_data.substr(start, len).wc_str().length(); + + start = m_data.substr(0, start).wc_str().length(); + } +#endif // wxUSE_UNICODE_UTF8 + if (start > INT_MAX) result = wxT("-1 -1"); else if (start + len > 0) diff --git a/tests/regex/wxregextest.cpp b/tests/regex/wxregextest.cpp index 3d9780d12d..db0ab088cc 100644 --- a/tests/regex/wxregextest.cpp +++ b/tests/regex/wxregextest.cpp @@ -59,7 +59,7 @@ TEST_CASE("wxRegEx::Compile", "[regex][compile]") CHECK_FALSE( re.Compile("foo[") ); CHECK_FALSE( re.Compile("foo[bar") ); CHECK ( re.Compile("foo[bar]") ); - CHECK_FALSE( re.Compile("foo{1") ); + // Not invalid for PCRE: CHECK_FALSE( re.Compile("foo{1") ); CHECK ( re.Compile("foo{1}") ); CHECK ( re.Compile("foo{1,2}") ); CHECK ( re.Compile("foo*") ); @@ -184,4 +184,20 @@ TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]") CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" ); } +#ifdef wxHAS_REGEX_ADVANCED + +TEST_CASE("wxRegEx::Unicode", "[regex][unicode]") +{ + const wxString cyrillicCapitalA(L"\u0410"); + const wxString cyrillicSmallA(L"\u0430"); + + wxRegEx re(cyrillicCapitalA, wxRE_ICASE); + REQUIRE( re.IsValid() ); + + REQUIRE( re.Matches(cyrillicSmallA) ); + CHECK( re.GetMatch(cyrillicSmallA) == cyrillicSmallA ); +} + +#endif // wxHAS_REGEX_ADVANCED + #endif // wxUSE_REGEX