Merge branch 'regex-prepare-pcre'

Cleanup and simplify wxRegEx code. See https://github.com/wxWidgets/wxWidgets/pull/2437
2021-07-17 17:13:38 +02:00
parent fa1f7e74ce 4dd77dabe8
commit 304d5e6c59
13 changed files with 490 additions and 107 deletions
--- a/include/wx/regex.h
+++ b/include/wx/regex.h
@@ -144,6 +144,9 @@ public:

    static wxString QuoteMeta(const wxString& str);

+    // return the extended RE corresponding to the given basic RE
+    static wxString ConvertFromBasic(const wxString& bre);
+
    // dtor not virtual, don't derive from this class
    ~wxRegEx();

--- a/interface/wx/regex.h
+++ b/interface/wx/regex.h
@@ -276,5 +276,15 @@ public:
        @since 3.1.3
    */
    static wxString QuoteMeta(const wxString& str);
+
+    /**
+        Converts a basic regular expression to an extended regex syntax.
+
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+        syntax to default (extended) syntax.
+
+        @since 3.1.6
+     */
+    static wxString ConvertFromBasic(const wxString& bre);
 };

--- a/misc/suppressions/codespell-lines
+++ b/misc/suppressions/codespell-lines
@@ -24,3 +24,6 @@ expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
    // 2019), i.e. SEH translator seems to work just fine without /EHa too, so
 // Purpose:     helpers for the structured exception handling (SEH) under Win32
 * MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
+    static wxString ConvertFromBasic(const wxString& bre);
+        This function can be used to convert @a bre using deprecated wxRE_BASIC
+    static wxString ConvertFromBasic(const wxString& bre);
--- a/src/common/regex.cpp
+++ b/src/common/regex.cpp
@@ -41,31 +41,27 @@

 // WXREGEX_USING_BUILTIN    defined when using the built-in regex lib
 // WXREGEX_USING_RE_SEARCH  defined when using re_search in the GNU regex lib
-// WXREGEX_IF_NEED_LEN()    wrap the len parameter only used with the built-in
-//                          or GNU regex
 // WXREGEX_CONVERT_TO_MB    defined when the regex lib is using chars and
-//                          wxChar is wide, so conversion must be done
-// WXREGEX_CHAR(x)          Convert wxChar to wxRegChar
+//                          wxChar is wide, so conversion to UTF-8 must be done
 //
 #ifdef __REG_NOFRONT
 #   define WXREGEX_USING_BUILTIN
-#   define WXREGEX_IF_NEED_LEN(x) ,x
-#   if wxUSE_UNICODE
-#       define WXREGEX_CHAR(x) (x).wc_str()
-#   else
-#       define WXREGEX_CHAR(x) (x).mb_str()
-#   endif
 #else
 #   ifdef HAVE_RE_SEARCH
-#       define WXREGEX_IF_NEED_LEN(x) ,x
 #       define WXREGEX_USING_RE_SEARCH
 #   else
-#       define WXREGEX_IF_NEED_LEN(x)
+        // We can't use length, so just drop it in this wrapper.
+        inline int
+        wx_regexec(const regex_t* preg, const char* string, size_t,
+                   size_t nmatch, regmatch_t* pmatch, int eflags)
+        {
+            return regexec(preg, string, nmatch, pmatch, eflags);
+        }
 #   endif
 #   if wxUSE_UNICODE
 #       define WXREGEX_CONVERT_TO_MB
 #   endif
-#   define WXREGEX_CHAR(x) (x).mb_str()
+#   define wx_regcomp regcomp
 #   define wx_regfree regfree
 #   define wx_regerror regerror
 #endif
@@ -157,8 +153,7 @@ public:

    // RE operations
    bool Compile(const wxString& expr, int flags = 0);
-    bool Matches(const wxRegChar *str, int flags
-                 WXREGEX_IF_NEED_LEN(size_t len)) const;
+    bool Matches(const wxRegChar *str, int flags, size_t len) const;
    bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
    size_t GetMatchCount() const;
    int Replace(wxString *pattern, const wxString& replacement,
@@ -166,7 +161,7 @@ public:

 private:
    // return the string containing the error message for the given err code
-    wxString GetErrorMsg(int errorcode, bool badconv) const;
+    wxString GetErrorMsg(int errorcode) const;

    // init the members
    void Init()
@@ -224,33 +219,22 @@ wxRegExImpl::~wxRegExImpl()
    Free();
 }

-wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
+wxString wxRegExImpl::GetErrorMsg(int errorcode) const
 {
-#ifdef WXREGEX_CONVERT_TO_MB
-    // currently only needed when using system library in Unicode mode
-    if ( badconv )
-    {
-        return _("conversion to 8-bit encoding failed");
-    }
-#else
-    // 'use' badconv to avoid a compiler warning
-    (void)badconv;
-#endif
-
    wxString szError;

    // first get the string length needed
    int len = wx_regerror(errorcode, &m_RegEx, NULL, 0);
    if ( len > 0 )
    {
-        char* szcmbError = new char[++len];
+        wxCharBuffer errbuf(len);

-        (void)wx_regerror(errorcode, &m_RegEx, szcmbError, len);
+        (void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length());

-        szError = wxConvLibc.cMB2WX(szcmbError);
-        delete [] szcmbError;
+        szError = wxConvLibc.cMB2WX(errbuf);
    }
-    else // regerror() returned 0
+
+    if ( szError.empty() ) // regerror() returned 0 or conversion failed
    {
        szError = _("unknown error");
    }
@@ -258,6 +242,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
    return szError;
 }

+// Helper function for processing bracket expressions inside a regex.
+//
+// Advance the iterator until the closing bracket matching the opening one the
+// iterator currently points to, i.e.:
+//
+// Precondition: *it == '['
+// Postcondition: *it == ']' or it == end if failed to find matching ']'
+static
+wxString::const_iterator
+SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
+{
+    wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );
+
+    // Initial ']', possibly after the preceding '^', is different because it
+    // stands for a literal ']' and not the end of the bracket expression, so
+    // check for it first.
+    ++it;
+    if ( it != end && *it == '^' )
+        ++it;
+    if ( it != end && *it == ']' )
+        ++it;
+
+    // Any ']' from now on ends the bracket expression.
+    for ( ; it != end; ++it )
+    {
+        const wxUniChar c = *it;
+
+        if ( c == ']' )
+            break;
+
+        if ( c == '[' )
+        {
+            // Bare '[' on its own is not special, but collating elements and
+            // character classes are, so check for them and advance past them
+            // if necessary to avoid misinterpreting the matching closing ']'.
+            if ( ++it == end )
+                break;
+
+            const wxUniChar c = *it;
+            if ( c == ':' || c == '.' || c == '=' )
+            {
+                for ( ++it; it != end; ++it )
+                {
+                    if ( *it == c )
+                    {
+                        if ( ++it == end )
+                            break;
+
+                        if ( *it == ']' )
+                            break;
+                    }
+                }
+
+                if ( it == end )
+                    break;
+            }
+        }
+    }
+
+    return it;
+}
+
+/* static */
+wxString wxRegEx::ConvertFromBasic(const wxString& bre)
+{
+    /*
+        Quoting regex(7):
+
+        Obsolete ("basic") regular expressions differ in several respects.
+        '|', '+', and '?' are ordinary characters and there is no equivalent
+        for their functionality. The delimiters for bounds are "\{" and "\}",
+        with '{' and '}' by themselves ordinary characters. The parentheses
+        for nested subexpressions are "\(" and "\)", with '(' and ')' by
+        themselves ordinary characters. '^' is an ordinary character except at
+        the beginning of the RE or(!) the beginning of a parenthesized
+        subexpression, '$' is an ordinary character except at the end of the RE
+        or(!) the end of a parenthesized subexpression, and '*' is an ordinary
+        character if it appears at the beginning of the RE or the beginning of
+        a parenthesized subexpression (after a possible leading '^').
+
+        Finally, there is one new type of atom, a back reference: '\' followed
+        by a nonzero decimal digit d matches the same sequence of characters
+        matched by the dth parenthesized subexpression [...]
+     */
+    wxString ere;
+    ere.reserve(bre.length());
+
+    enum SinceStart
+    {
+        SinceStart_None,        // Just at the beginning.
+        SinceStart_OnlyCaret,   // Had just "^" since the beginning.
+        SinceStart_Some         // Had something else since the beginning.
+    };
+
+    struct State
+    {
+        explicit State(SinceStart sinceStart_)
+        {
+            isBackslash = false;
+            sinceStart = sinceStart_;
+        }
+
+        bool isBackslash;
+        SinceStart sinceStart;
+    };
+
+    State previous(SinceStart_None);
+    for ( wxString::const_iterator it = bre.begin(),
+                                  end = bre.end();
+          it != end;
+          ++it )
+    {
+        const wxUniChar c = *it;
+
+        // What should be done with the current character?
+        enum Disposition
+        {
+            Disposition_Skip,   // Nothing.
+            Disposition_Append, // Append to output.
+            Disposition_Escape  // ... after escaping it with backslash.
+        } disposition = Disposition_Append;
+
+        State current(SinceStart_Some);
+
+        if ( previous.isBackslash )
+        {
+            // By default, keep the backslash present in the BRE, it's still
+            // needed in the ERE too.
+            disposition = Disposition_Escape;
+
+            switch ( c.GetValue() )
+            {
+                case '(':
+                    // It's the start of a new subexpression.
+                    current.sinceStart = SinceStart_None;
+                    wxFALLTHROUGH;
+
+                case ')':
+                case '{':
+                case '}':
+                    // Do not escape to ensure they remain special in the ERE
+                    // as the escaped versions were special in the BRE.
+                    disposition = Disposition_Append;
+                    break;
+            }
+        }
+        else // This character is not escaped.
+        {
+            switch ( c.GetValue() )
+            {
+                case '\\':
+                    current.isBackslash = true;
+
+                    // Don't do anything with it yet, we'll deal with it later.
+                    disposition = Disposition_Skip;
+                    break;
+
+                case '^':
+                    // Escape unless it appears at the start.
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                            // Don't escape, but do update the state.
+                            current.sinceStart = SinceStart_OnlyCaret;
+                            break;
+
+                        case SinceStart_OnlyCaret:
+                        case SinceStart_Some:
+                            disposition = Disposition_Escape;
+                            break;
+                    }
+                    break;
+
+                case '*':
+                    // Escape unless it appears at the start or right after "^".
+                    switch ( previous.sinceStart )
+                    {
+                        case SinceStart_None:
+                        case SinceStart_OnlyCaret:
+                            disposition = Disposition_Escape;
+                            break;
+
+                        case SinceStart_Some:
+                            break;
+                    }
+                    break;
+
+                case '$':
+                    // Escape unless it appears at the end or just before "\)".
+                    disposition = Disposition_Escape;
+                    {
+                        wxString::const_iterator next = it;
+                        ++next;
+                        if ( next == end )
+                        {
+                            // It is at the end, so has special meaning.
+                            disposition = Disposition_Append;
+                        }
+                        else // Not at the end, but maybe at subexpression end?
+                        {
+                            if ( *next == '\\' )
+                            {
+                                ++next;
+                                if ( next != end && *next == ')' )
+                                    disposition = Disposition_Append;
+                            }
+                        }
+                    }
+                    break;
+
+                case '|':
+                case '+':
+                case '?':
+                case '(':
+                case ')':
+                case '{':
+                case '}':
+                    // Escape these characters which are not special in a BRE,
+                    // but would be special in a ERE if left unescaped.
+                    disposition = Disposition_Escape;
+                    break;
+
+                case '[':
+                    // Rules are very different for the characters inside the
+                    // bracket expressions and we don't have to change anything
+                    // for them as the syntax is the same for BREs and EREs, so
+                    // just process the entire expression at once.
+                    {
+                        const wxString::const_iterator start = it;
+                        it = SkipBracketExpression(it, end);
+
+                        // Copy everything inside without any changes.
+                        ere += wxString(start, it);
+
+                        if ( it == end )
+                        {
+                            // If we reached the end without finding the
+                            // matching ']' there is nothing remaining anyhow.
+                            return ere;
+                        }
+
+                        // Note that default Disposition_Append here is fine,
+                        // we'll append the closing ']' to "ere" below.
+                    }
+                    break;
+            }
+        }
+
+        switch ( disposition )
+        {
+            case Disposition_Skip:
+                break;
+
+            case Disposition_Escape:
+                ere += '\\';
+                wxFALLTHROUGH;
+
+            case Disposition_Append:
+                // Note: don't use "c" here, iterator may have been advanced
+                // inside the loop.
+                ere += *it;
+                break;
+        }
+
+        previous = current;
+    }
+
+    // It's an error if a RE ends with a backslash, but we still need to
+    // preserve this error in the resulting RE.
+    if ( previous.isBackslash )
+        ere += '\\';
+
+    return ere;
+}
+
 bool wxRegExImpl::Compile(const wxString& expr, int flags)
 {
    Reinit();
@@ -290,22 +549,24 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
    if ( flags & wxRE_NEWLINE )
        flagsRE |= REG_NEWLINE;

+#ifndef WXREGEX_CONVERT_TO_MB
+    const wxChar *exprstr = expr.wx_str();
+#else
+    const wxScopedCharBuffer exprbuf = expr.utf8_str();
+    const char* const exprstr = exprbuf.data();
+#endif
+
    // compile it
 #ifdef WXREGEX_USING_BUILTIN
-    bool conv = true;
-    // FIXME-UTF8: use wc_str() after removing ANSI build
-    int errorcode = wx_re_comp(&m_RegEx, expr.c_str(), expr.length(), flagsRE);
+    int errorcode = wx_re_comp(&m_RegEx, exprstr, expr.length(), flagsRE);
 #else
-    // FIXME-UTF8: this is potentially broken, we shouldn't even try it
-    //             and should always use builtin regex library (or PCRE?)
-    const wxWX2MBbuf conv = expr.mbc_str();
-    int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
+    int errorcode = wx_regcomp(&m_RegEx, exprstr, flagsRE);
 #endif

    if ( errorcode )
    {
        wxLogError(_("Invalid regular expression '%s': %s"),
-                   expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
+                   expr, GetErrorMsg(errorcode));

        m_isCompiled = false;
    }
@@ -384,8 +645,8 @@ static int ReSearch(const regex_t *preg,
 #endif // WXREGEX_USING_RE_SEARCH

 bool wxRegExImpl::Matches(const wxRegChar *str,
-                          int flags
-                          WXREGEX_IF_NEED_LEN(size_t len)) const
+                          int flags,
+                          size_t len) const
 {
    wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );

@@ -412,9 +673,9 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
 #if defined WXREGEX_USING_BUILTIN
    int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, matches, flagsRE);
 #elif defined WXREGEX_USING_RE_SEARCH
-    int rc = str ? ReSearch(&self->m_RegEx, str, len, matches, flagsRE) : REG_BADPAT;
+    int rc = ReSearch(&self->m_RegEx, str, len, matches, flagsRE);
 #else
-    int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT;
+    int rc = wx_regexec(&self->m_RegEx, str, len, m_nMatches, matches, flagsRE);
 #endif

    switch ( rc )
@@ -426,7 +687,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
        default:
            // an error occurred
            wxLogError(_("Failed to find match for regular expression: %s"),
-                       GetErrorMsg(rc, !str).c_str());
+                       GetErrorMsg(rc));
            wxFALLTHROUGH;

        case REG_NOMATCH:
@@ -470,15 +731,9 @@ int wxRegExImpl::Replace(wxString *text,
    const wxChar *textstr = text->c_str();
    size_t textlen = text->length();
 #else
-    const wxWX2MBbuf textstr = WXREGEX_CHAR(*text);
-    if (!textstr)
-    {
-        wxLogError(_("Failed to find match for regular expression: %s"),
-                   GetErrorMsg(0, true).c_str());
-        return 0;
-    }
-    size_t textlen = strlen(textstr);
-    text->clear();
+    const wxScopedCharBuffer textbuf = text->utf8_str();
+    const char* const textstr = textbuf.data();
+    size_t textlen = textbuf.length();
 #endif

    // the replacement text
@@ -508,14 +763,9 @@ int wxRegExImpl::Replace(wxString *text,
    // note that "^" shouldn't match after the first call to Matches() so we
    // use wxRE_NOTBOL to prevent it from happening
    while ( (!maxMatches || countRepl < maxMatches) &&
-             Matches(
-#ifndef WXREGEX_CONVERT_TO_MB
-                    textstr + matchStart,
-#else
-                    textstr.data() + matchStart,
-#endif
-                    countRepl ? wxRE_NOTBOL : 0
-                    WXREGEX_IF_NEED_LEN(textlen - matchStart)) )
+             Matches(textstr + matchStart,
+                     countRepl ? wxRE_NOTBOL : 0,
+                     textlen - matchStart) )
    {
        // the string possibly contains back references: we need to calculate
        // the replacement text anew after each match
@@ -559,14 +809,8 @@ int wxRegExImpl::Replace(wxString *text,
                    }
                    else
                    {
-                        textNew += wxString(
-#ifndef WXREGEX_CONVERT_TO_MB
-                                textstr
-#else
-                                textstr.data()
-#endif
-                                + matchStart + start,
-                                *wxConvCurrent, len);
+                        textNew += wxString(textstr + matchStart + start,
+                                            wxConvUTF8, len);

                        mayHaveBackrefs = true;
                    }
@@ -592,11 +836,7 @@ int wxRegExImpl::Replace(wxString *text,
        if (result.capacity() < result.length() + start + textNew.length())
            result.reserve(2 * result.length());

-#ifndef WXREGEX_CONVERT_TO_MB
-        result.append(*text, matchStart, start);
-#else
-        result.append(wxString(textstr.data() + matchStart, *wxConvCurrent, start));
-#endif
+        result.append(wxString(textstr + matchStart, wxConvUTF8, start));
        matchStart += start;
        result.append(textNew);

@@ -605,11 +845,7 @@ int wxRegExImpl::Replace(wxString *text,
        matchStart += len;
    }

-#ifndef WXREGEX_CONVERT_TO_MB
-    result.append(*text, matchStart, wxString::npos);
-#else
-    result.append(wxString(textstr.data() + matchStart, *wxConvCurrent));
-#endif
+    result.append(wxString(textstr + matchStart, wxConvUTF8));
    *text = result;

    return countRepl;
@@ -651,8 +887,15 @@ bool wxRegEx::Matches(const wxString& str, int flags) const
 {
    wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );

-    return m_impl->Matches(WXREGEX_CHAR(str), flags
-                            WXREGEX_IF_NEED_LEN(str.length()));
+#ifndef WXREGEX_CONVERT_TO_MB
+    const wxChar* const textstr = str.c_str();
+    const size_t textlen = str.length();
+#else
+    const wxScopedCharBuffer textstr = str.utf8_str();
+    const size_t textlen = textstr.length();
+#endif
+
+    return m_impl->Matches(textstr, flags, textlen);
 }

 bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
@@ -668,7 +911,11 @@ wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
    if ( !GetMatch(&start, &len, index) )
        return wxEmptyString;

+#ifndef WXREGEX_CONVERT_TO_MB
    return text.Mid(start, len);
+#else
+    return wxString::FromUTF8(text.utf8_str().data() + start, len);
+#endif
 }

 size_t wxRegEx::GetMatchCount() const
--- a/tests/benchmarks/Makefile.in
+++ b/tests/benchmarks/Makefile.in
@@ -57,6 +57,7 @@ BENCH_OBJECTS =  \
 	bench_ipcclient.o \
 	bench_log.o \
 	bench_mbconv.o \
+	bench_regex.o \
 	bench_strings.o \
 	bench_tls.o \
 	bench_printfbench.o
@@ -299,6 +300,9 @@ bench_log.o: $(srcdir)/log.cpp
 bench_mbconv.o: $(srcdir)/mbconv.cpp
 	$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/mbconv.cpp

+bench_regex.o: $(srcdir)/regex.cpp
+	$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/regex.cpp
+
 bench_strings.o: $(srcdir)/strings.cpp
 	$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/strings.cpp

--- a/tests/benchmarks/bench.bkl
+++ b/tests/benchmarks/bench.bkl
@@ -16,6 +16,7 @@
            ipcclient.cpp
            log.cpp
            mbconv.cpp
+            regex.cpp
            strings.cpp
            tls.cpp
            printfbench.cpp
--- a/tests/benchmarks/bench_vc8_bench.vcproj
+++ b/tests/benchmarks/bench_vc8_bench.vcproj
@@ -838,6 +838,10 @@
 				RelativePath=".\printfbench.cpp"
 				>
 			</File>
+			<File
+				RelativePath=".\regex.cpp"
+				>
+			</File>
 			<File
 				RelativePath=".\strings.cpp"
 				>
--- a/tests/benchmarks/bench_vc9_bench.vcproj
+++ b/tests/benchmarks/bench_vc9_bench.vcproj
@@ -810,6 +810,10 @@
 				RelativePath=".\printfbench.cpp"
 				>
 			</File>
+			<File
+				RelativePath=".\regex.cpp"
+				>
+			</File>
 			<File
 				RelativePath=".\strings.cpp"
 				>
--- a/tests/benchmarks/makefile.gcc
+++ b/tests/benchmarks/makefile.gcc
@@ -36,6 +36,7 @@ BENCH_OBJECTS =  \
 	$(OBJS)\bench_ipcclient.o \
 	$(OBJS)\bench_log.o \
 	$(OBJS)\bench_mbconv.o \
+	$(OBJS)\bench_regex.o \
 	$(OBJS)\bench_strings.o \
 	$(OBJS)\bench_tls.o \
 	$(OBJS)\bench_printfbench.o
@@ -310,6 +311,9 @@ $(OBJS)\bench_log.o: ./log.cpp
 $(OBJS)\bench_mbconv.o: ./mbconv.cpp
 	$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<

+$(OBJS)\bench_regex.o: ./regex.cpp
+	$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<
+
 $(OBJS)\bench_strings.o: ./strings.cpp
 	$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<

--- a/tests/benchmarks/makefile.vc
+++ b/tests/benchmarks/makefile.vc
@@ -37,6 +37,7 @@ BENCH_OBJECTS =  \
 	$(OBJS)\bench_ipcclient.obj \
 	$(OBJS)\bench_log.obj \
 	$(OBJS)\bench_mbconv.obj \
+	$(OBJS)\bench_regex.obj \
 	$(OBJS)\bench_strings.obj \
 	$(OBJS)\bench_tls.obj \
 	$(OBJS)\bench_printfbench.obj
@@ -698,6 +699,9 @@ $(OBJS)\bench_log.obj: .\log.cpp
 $(OBJS)\bench_mbconv.obj: .\mbconv.cpp
 	$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\mbconv.cpp

+$(OBJS)\bench_regex.obj: .\regex.cpp
+	$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\regex.cpp
+
 $(OBJS)\bench_strings.obj: .\strings.cpp
 	$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\strings.cpp

--- a/tests/benchmarks/regex.cpp
+++ b/tests/benchmarks/regex.cpp
@@ -0,0 +1,74 @@
+/////////////////////////////////////////////////////////////////////////////
+// Name:        tests/benchmarks/regex.cpp
+// Purpose:     wxRegEx benchmarks
+// Author:      Vadim Zeitlin
+// Created:     2018-11-15
+// Copyright:   (c) 2018 Vadim Zeitlin <vadim@wxwidgets.org>
+// Licence:     wxWindows licence
+/////////////////////////////////////////////////////////////////////////////
+
+#include "wx/ffile.h"
+#include "wx/regex.h"
+
+#include "bench.h"
+
+// ----------------------------------------------------------------------------
+// Benchmark relative costs of compiling and matching for a simple regex
+// ----------------------------------------------------------------------------
+
+static const char* const RE_SIMPLE = ".";
+
+BENCHMARK_FUNC(RECompile)
+{
+    return wxRegEx(RE_SIMPLE).IsValid();
+}
+
+BENCHMARK_FUNC(REMatch)
+{
+    static wxRegEx re(RE_SIMPLE);
+    return re.Matches("foo");
+}
+
+BENCHMARK_FUNC(RECompileAndMatch)
+{
+    return wxRegEx(RE_SIMPLE).Matches("foo");
+}
+
+// ----------------------------------------------------------------------------
+// Benchmark the cost of using a more complicated regex
+// ----------------------------------------------------------------------------
+
+namespace
+{
+
+// Use the contents of an already existing test file.
+const wxString& GetTestText()
+{
+    static wxString text;
+    if ( text.empty() )
+    {
+        wxFFile("htmltest.html").ReadAll(&text);
+    }
+
+    return text;
+}
+
+} // anonymous namespace
+
+BENCHMARK_FUNC(REFindTD)
+{
+    // This is too simplistic, but good enough for benchmarking.
+    static wxRegEx re("<td>[^<]*</td>", wxRE_ICASE | wxRE_NEWLINE);
+
+    int matches = 0;
+    for ( const wxChar* p = GetTestText().c_str(); re.Matches(p); ++matches )
+    {
+        size_t start, len;
+        if ( !re.GetMatch(&start, &len) )
+            return false;
+
+        p += start + len;
+    }
+
+    return matches == 21; // result of "grep -c"
+}
--- a/tests/regex/regextest.cpp
+++ b/tests/regex/regextest.cpp
@@ -234,12 +234,18 @@ void RegExTestCase::doTest(int flavor)
    // 'e' - test that the pattern fails to compile
    if (m_mode == 'e') {
        CHECK( !re.IsValid() );
-    } else {
-        CHECK( re.IsValid() );
-    }

-    if (!re.IsValid())
+        // Never continue with this kind of test.
        return;
+    } else {
+        // Note: we don't use REQUIRE here because this would abort the entire
+        // test case on error instead of skipping just the rest of this regex
+        // test.
+        CHECK( re.IsValid() );
+
+        if (!re.IsValid())
+            return;
+    }

    bool matches = re.Matches(m_data, m_matchFlags);

--- a/tests/regex/wxregextest.cpp
+++ b/tests/regex/wxregextest.cpp
@@ -79,28 +79,33 @@ CheckMatch(const char* pattern,
    INFO( "Pattern: " << pattern << FlagStr(flags) << ", match: " << text );

    wxRegEx re(pattern, compileFlags);
-    REQUIRE( re.IsValid() );
-
-    bool ok = re.Matches(text, matchFlags);
-
-    if (expected) {
-        REQUIRE( ok );
-
-        wxStringTokenizer tkz(wxString(expected, *wxConvCurrent),
-                              wxT("\t"), wxTOKEN_RET_EMPTY);
-        size_t i;
-
-        for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) {
-            INFO( "Match #" << i );
-            CHECK( re.GetMatch(text, i) == tkz.GetNextToken() );
-        }
-
-        if ((flags & wxRE_NOSUB) == 0)
-            CHECK(re.GetMatchCount() == i);
+    if ( !re.IsValid() )
+    {
+        FAIL("Regex compilation failed");
+        return;
    }
-    else {
-        CHECK( !ok );
+
+    if ( !re.Matches(text, matchFlags) )
+    {
+        CHECK( !expected );
+        return;
    }
+
+    CHECK( expected );
+    if ( !expected )
+        return;
+
+    wxStringTokenizer tkz(wxString(expected, *wxConvCurrent),
+                          wxT("\t"), wxTOKEN_RET_EMPTY);
+    size_t i;
+
+    for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) {
+        INFO( "Match #" << i );
+        CHECK( re.GetMatch(text, i) == tkz.GetNextToken() );
+    }
+
+    if ((flags & wxRE_NOSUB) == 0)
+        CHECK(re.GetMatchCount() == i);
 }

 TEST_CASE("wxRegEx::Match", "[regex][match]")
@@ -165,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
    CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
 }

+TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
+{
+    CHECK( wxRegEx::ConvertFromBasic("\\(a\\)b") == "(a)b" );
+    CHECK( wxRegEx::ConvertFromBasic("a\\{0,1\\}b") == "a{0,1}b" );
+    CHECK( wxRegEx::ConvertFromBasic("*") == "\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("**") == "\\**" );
+    CHECK( wxRegEx::ConvertFromBasic("^*") == "^\\*" );
+    CHECK( wxRegEx::ConvertFromBasic("^^") == "^\\^" );
+    CHECK( wxRegEx::ConvertFromBasic("x$y") == "x\\$y" );
+    CHECK( wxRegEx::ConvertFromBasic("$$") == "\\$$" );
+    CHECK( wxRegEx::ConvertFromBasic("\\(x$\\)") == "(x$)" );
+    CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" );
+}
+
 #endif // wxUSE_REGEX