From 862c051e9ea9ed8c4f4369f008072ab29bbadcd1 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 20 Jun 2021 21:57:22 +0200 Subject: [PATCH 01/13] Remove unnecessary calls to c_str() for wxLogError() arguments Just pass strings to wxLogError() directly. No real changes. --- src/common/regex.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index c0cf525976..38089f9835 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -305,7 +305,7 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags) if ( errorcode ) { wxLogError(_("Invalid regular expression '%s': %s"), - expr.c_str(), GetErrorMsg(errorcode, !conv).c_str()); + expr, GetErrorMsg(errorcode, !conv)); m_isCompiled = false; } @@ -426,7 +426,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str, default: // an error occurred wxLogError(_("Failed to find match for regular expression: %s"), - GetErrorMsg(rc, !str).c_str()); + GetErrorMsg(rc, !str)); wxFALLTHROUGH; case REG_NOMATCH: @@ -474,7 +474,7 @@ int wxRegExImpl::Replace(wxString *text, if (!textstr) { wxLogError(_("Failed to find match for regular expression: %s"), - GetErrorMsg(0, true).c_str()); + GetErrorMsg(0, true)); return 0; } size_t textlen = strlen(textstr); From 381cd322ec3e351ecfd75e8112f6d0a2a8bec1d9 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sat, 19 Jun 2021 14:29:31 +0200 Subject: [PATCH 02/13] Always skip the rest of regex tests checking for compile failure It's useless to continue with testing the given regex if compiling it has unexpectedly succeeded, all the rest of the tests will fail anyhow. Also explain that REQUIRE() can't be used here, which is why we need to use CHECK() and then check the same condition again before returning to avoid the temptation to "simplify" things. --- tests/regex/regextest.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/regex/regextest.cpp b/tests/regex/regextest.cpp index 321fd29c85..4902c852d1 100644 --- a/tests/regex/regextest.cpp +++ b/tests/regex/regextest.cpp @@ -234,12 +234,18 @@ void RegExTestCase::doTest(int flavor) // 'e' - test that the pattern fails to compile if (m_mode == 'e') { CHECK( !re.IsValid() ); - } else { - CHECK( re.IsValid() ); - } - if (!re.IsValid()) + // Never continue with this kind of test. return; + } else { + // Note: we don't use REQUIRE here because this would abort the entire + // test case on error instead of skipping just the rest of this regex + // test. + CHECK( re.IsValid() ); + + if (!re.IsValid()) + return; + } bool matches = re.Matches(m_data, m_matchFlags); From 8be998861dc8861354af5762054f60c5492e69a3 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sun, 20 Jun 2021 16:19:38 +0200 Subject: [PATCH 03/13] Use CHECK rather than REQUIRE in individual regex tests Don't prevent the remaining CheckMatch() tests from running if one of them fails by using CHECK() rather than REQUIRE(). This is a bit awkward and we probably ought to use sections here instead, but this change is more minimal. This commit is best viewed ignoring whitespace-only changes. --- tests/regex/wxregextest.cpp | 45 ++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tests/regex/wxregextest.cpp b/tests/regex/wxregextest.cpp index 50a58c7d14..eaaefe2a5d 100644 --- a/tests/regex/wxregextest.cpp +++ b/tests/regex/wxregextest.cpp @@ -79,28 +79,33 @@ CheckMatch(const char* pattern, INFO( "Pattern: " << pattern << FlagStr(flags) << ", match: " << text ); wxRegEx re(pattern, compileFlags); - REQUIRE( re.IsValid() ); - - bool ok = re.Matches(text, matchFlags); - - if (expected) { - REQUIRE( ok ); - - wxStringTokenizer tkz(wxString(expected, *wxConvCurrent), - wxT("\t"), wxTOKEN_RET_EMPTY); - size_t i; - - for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) { - INFO( "Match #" << i ); - CHECK( re.GetMatch(text, i) == tkz.GetNextToken() ); - } - - if ((flags & wxRE_NOSUB) == 0) - CHECK(re.GetMatchCount() == i); + if ( !re.IsValid() ) + { + FAIL("Regex compilation failed"); + return; } - else { - CHECK( !ok ); + + if ( !re.Matches(text, matchFlags) ) + { + CHECK( !expected ); + return; } + + CHECK( expected ); + if ( !expected ) + return; + + wxStringTokenizer tkz(wxString(expected, *wxConvCurrent), + wxT("\t"), wxTOKEN_RET_EMPTY); + size_t i; + + for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) { + INFO( "Match #" << i ); + CHECK( re.GetMatch(text, i) == tkz.GetNextToken() ); + } + + if ((flags & wxRE_NOSUB) == 0) + CHECK(re.GetMatchCount() == i); } TEST_CASE("wxRegEx::Match", "[regex][match]") From 020b5f738329c2ae2bb0986fffc04aa90703eaa7 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Sat, 19 Jun 2021 23:59:06 +0200 Subject: [PATCH 04/13] Add simple wxRegEx benchmark Allow testing speed of some simple regex operations to compare the speed of the currently used regex library with some other alternatives. --- tests/benchmarks/Makefile.in | 4 ++ tests/benchmarks/bench.bkl | 1 + tests/benchmarks/bench_vc8_bench.vcproj | 4 ++ tests/benchmarks/bench_vc9_bench.vcproj | 4 ++ tests/benchmarks/makefile.gcc | 4 ++ tests/benchmarks/makefile.vc | 4 ++ tests/benchmarks/regex.cpp | 74 +++++++++++++++++++++++++ 7 files changed, 95 insertions(+) create mode 100644 tests/benchmarks/regex.cpp diff --git a/tests/benchmarks/Makefile.in b/tests/benchmarks/Makefile.in index 1945ccfe18..e7842cc239 100644 --- a/tests/benchmarks/Makefile.in +++ b/tests/benchmarks/Makefile.in @@ -57,6 +57,7 @@ BENCH_OBJECTS = \ bench_ipcclient.o \ bench_log.o \ bench_mbconv.o \ + bench_regex.o \ bench_strings.o \ bench_tls.o \ bench_printfbench.o @@ -299,6 +300,9 @@ bench_log.o: $(srcdir)/log.cpp bench_mbconv.o: $(srcdir)/mbconv.cpp $(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/mbconv.cpp +bench_regex.o: $(srcdir)/regex.cpp + $(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/regex.cpp + bench_strings.o: $(srcdir)/strings.cpp $(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/strings.cpp diff --git a/tests/benchmarks/bench.bkl b/tests/benchmarks/bench.bkl index f3efa94295..ed17be45fc 100644 --- a/tests/benchmarks/bench.bkl +++ b/tests/benchmarks/bench.bkl @@ -16,6 +16,7 @@ ipcclient.cpp log.cpp mbconv.cpp + regex.cpp strings.cpp tls.cpp printfbench.cpp diff --git a/tests/benchmarks/bench_vc8_bench.vcproj b/tests/benchmarks/bench_vc8_bench.vcproj index 59ea520090..80b87940c3 100644 --- a/tests/benchmarks/bench_vc8_bench.vcproj +++ b/tests/benchmarks/bench_vc8_bench.vcproj @@ -838,6 +838,10 @@ RelativePath=".\printfbench.cpp" > + + diff --git a/tests/benchmarks/bench_vc9_bench.vcproj b/tests/benchmarks/bench_vc9_bench.vcproj index 04a316aada..049dd37fb9 100644 --- a/tests/benchmarks/bench_vc9_bench.vcproj +++ b/tests/benchmarks/bench_vc9_bench.vcproj @@ -810,6 +810,10 @@ RelativePath=".\printfbench.cpp" > + + diff --git a/tests/benchmarks/makefile.gcc b/tests/benchmarks/makefile.gcc index 3bfb0a06cf..df40d939ce 100644 --- a/tests/benchmarks/makefile.gcc +++ b/tests/benchmarks/makefile.gcc @@ -36,6 +36,7 @@ BENCH_OBJECTS = \ $(OBJS)\bench_ipcclient.o \ $(OBJS)\bench_log.o \ $(OBJS)\bench_mbconv.o \ + $(OBJS)\bench_regex.o \ $(OBJS)\bench_strings.o \ $(OBJS)\bench_tls.o \ $(OBJS)\bench_printfbench.o @@ -310,6 +311,9 @@ $(OBJS)\bench_log.o: ./log.cpp $(OBJS)\bench_mbconv.o: ./mbconv.cpp $(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $< +$(OBJS)\bench_regex.o: ./regex.cpp + $(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $< + $(OBJS)\bench_strings.o: ./strings.cpp $(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $< diff --git a/tests/benchmarks/makefile.vc b/tests/benchmarks/makefile.vc index 948b2813d8..9b0a130818 100644 --- a/tests/benchmarks/makefile.vc +++ b/tests/benchmarks/makefile.vc @@ -37,6 +37,7 @@ BENCH_OBJECTS = \ $(OBJS)\bench_ipcclient.obj \ $(OBJS)\bench_log.obj \ $(OBJS)\bench_mbconv.obj \ + $(OBJS)\bench_regex.obj \ $(OBJS)\bench_strings.obj \ $(OBJS)\bench_tls.obj \ $(OBJS)\bench_printfbench.obj @@ -698,6 +699,9 @@ $(OBJS)\bench_log.obj: .\log.cpp $(OBJS)\bench_mbconv.obj: .\mbconv.cpp $(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\mbconv.cpp +$(OBJS)\bench_regex.obj: .\regex.cpp + $(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\regex.cpp + $(OBJS)\bench_strings.obj: .\strings.cpp $(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\strings.cpp diff --git a/tests/benchmarks/regex.cpp b/tests/benchmarks/regex.cpp new file mode 100644 index 0000000000..c2ca70fa98 --- /dev/null +++ b/tests/benchmarks/regex.cpp @@ -0,0 +1,74 @@ +///////////////////////////////////////////////////////////////////////////// +// Name: tests/benchmarks/regex.cpp +// Purpose: wxRegEx benchmarks +// Author: Vadim Zeitlin +// Created: 2018-11-15 +// Copyright: (c) 2018 Vadim Zeitlin +// Licence: wxWindows licence +///////////////////////////////////////////////////////////////////////////// + +#include "wx/ffile.h" +#include "wx/regex.h" + +#include "bench.h" + +// ---------------------------------------------------------------------------- +// Benchmark relative costs of compiling and matching for a simple regex +// ---------------------------------------------------------------------------- + +static const char* const RE_SIMPLE = "."; + +BENCHMARK_FUNC(RECompile) +{ + return wxRegEx(RE_SIMPLE).IsValid(); +} + +BENCHMARK_FUNC(REMatch) +{ + static wxRegEx re(RE_SIMPLE); + return re.Matches("foo"); +} + +BENCHMARK_FUNC(RECompileAndMatch) +{ + return wxRegEx(RE_SIMPLE).Matches("foo"); +} + +// ---------------------------------------------------------------------------- +// Benchmark the cost of using a more complicated regex +// ---------------------------------------------------------------------------- + +namespace +{ + +// Use the contents of an already existing test file. +const wxString& GetTestText() +{ + static wxString text; + if ( text.empty() ) + { + wxFFile("htmltest.html").ReadAll(&text); + } + + return text; +} + +} // anonymous namespace + +BENCHMARK_FUNC(REFindTD) +{ + // This is too simplistic, but good enough for benchmarking. + static wxRegEx re("[^<]*", wxRE_ICASE | wxRE_NEWLINE); + + int matches = 0; + for ( const wxChar* p = GetTestText().c_str(); re.Matches(p); ++matches ) + { + size_t start, len; + if ( !re.GetMatch(&start, &len) ) + return false; + + p += start + len; + } + + return matches == 21; // result of "grep -c" +} From f41564a3e2ecdf88e6ea4ca2d309c1288173a3b7 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 18 Jun 2021 23:56:09 +0200 Subject: [PATCH 05/13] Add wxRegEx::ConvertFromBasic() helper This will be used to implement support for BREs using PCRE which doesn't support them directly in the upcoming commits. --- include/wx/regex.h | 3 + interface/wx/regex.h | 10 ++ misc/suppressions/codespell-lines | 3 + src/common/regex.cpp | 275 ++++++++++++++++++++++++++++++ tests/regex/wxregextest.cpp | 14 ++ 5 files changed, 305 insertions(+) diff --git a/include/wx/regex.h b/include/wx/regex.h index ac70ff3907..dc3b34c4a5 100644 --- a/include/wx/regex.h +++ b/include/wx/regex.h @@ -144,6 +144,9 @@ public: static wxString QuoteMeta(const wxString& str); + // return the extended RE corresponding to the given basic RE + static wxString ConvertFromBasic(const wxString& bre); + // dtor not virtual, don't derive from this class ~wxRegEx(); diff --git a/interface/wx/regex.h b/interface/wx/regex.h index 0a6f24558c..cc3dc92acf 100644 --- a/interface/wx/regex.h +++ b/interface/wx/regex.h @@ -276,5 +276,15 @@ public: @since 3.1.3 */ static wxString QuoteMeta(const wxString& str); + + /** + Converts a basic regular expression to an extended regex syntax. + + This function can be used to convert @a bre using deprecated wxRE_BASIC + syntax to default (extended) syntax. + + @since 3.1.6 + */ + static wxString ConvertFromBasic(const wxString& bre); }; diff --git a/misc/suppressions/codespell-lines b/misc/suppressions/codespell-lines index 44401db6a6..3e1c2e8461 100644 --- a/misc/suppressions/codespell-lines +++ b/misc/suppressions/codespell-lines @@ -24,3 +24,6 @@ expressions (BRE). EREs are roughly those of the traditional @e egrep, // 2019), i.e. SEH translator seems to work just fine without /EHa too, so // Purpose: helpers for the structured exception handling (SEH) under Win32 * MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads). + static wxString ConvertFromBasic(const wxString& bre); + This function can be used to convert @a bre using deprecated wxRE_BASIC + static wxString ConvertFromBasic(const wxString& bre); diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 38089f9835..ed86b31c0c 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -258,6 +258,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const return szError; } +// Helper function for processing bracket expressions inside a regex. +// +// Advance the iterator until the closing bracket matching the opening one the +// iterator currently points to, i.e.: +// +// Precondition: *it == '[' +// Postcondition: *it == ']' or it == end if failed to find matching ']' +static +wxString::const_iterator +SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end) +{ + wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" ); + + // Initial ']', possibly after the preceding '^', is different because it + // stands for a literal ']' and not the end of the bracket expression, so + // check for it first. + ++it; + if ( it != end && *it == '^' ) + ++it; + if ( it != end && *it == ']' ) + ++it; + + // Any ']' from now on ends the bracket expression. + for ( ; it != end; ++it ) + { + const wxUniChar c = *it; + + if ( c == ']' ) + break; + + if ( c == '[' ) + { + // Bare '[' on its own is not special, but collating elements and + // character classes are, so check for them and advance past them + // if necessary to avoid misinterpreting the matching closing ']'. + if ( ++it == end ) + break; + + const wxUniChar c = *it; + if ( c == ':' || c == '.' || c == '=' ) + { + for ( ++it; it != end; ++it ) + { + if ( *it == c ) + { + if ( ++it == end ) + break; + + if ( *it == ']' ) + break; + } + } + + if ( it == end ) + break; + } + } + } + + return it; +} + +/* static */ +wxString wxRegEx::ConvertFromBasic(const wxString& bre) +{ + /* + Quoting regex(7): + + Obsolete ("basic") regular expressions differ in several respects. + '|', '+', and '?' are ordinary characters and there is no equivalent + for their functionality. The delimiters for bounds are "\{" and "\}", + with '{' and '}' by themselves ordinary characters. The parentheses + for nested subexpressions are "\(" and "\)", with '(' and ')' by + themselves ordinary characters. '^' is an ordinary character except at + the beginning of the RE or(!) the beginning of a parenthesized + subexpression, '$' is an ordinary character except at the end of the RE + or(!) the end of a parenthesized subexpression, and '*' is an ordinary + character if it appears at the beginning of the RE or the beginning of + a parenthesized subexpression (after a possible leading '^'). + + Finally, there is one new type of atom, a back reference: '\' followed + by a nonzero decimal digit d matches the same sequence of characters + matched by the dth parenthesized subexpression [...] + */ + wxString ere; + ere.reserve(bre.length()); + + enum SinceStart + { + SinceStart_None, // Just at the beginning. + SinceStart_OnlyCaret, // Had just "^" since the beginning. + SinceStart_Some // Had something else since the beginning. + }; + + struct State + { + explicit State(SinceStart sinceStart_) + { + isBackslash = false; + sinceStart = sinceStart_; + } + + bool isBackslash; + SinceStart sinceStart; + }; + + State previous(SinceStart_None); + for ( wxString::const_iterator it = bre.begin(), + end = bre.end(); + it != end; + ++it ) + { + const wxUniChar c = *it; + + // What should be done with the current character? + enum Disposition + { + Disposition_Skip, // Nothing. + Disposition_Append, // Append to output. + Disposition_Escape // ... after escaping it with backslash. + } disposition = Disposition_Append; + + State current(SinceStart_Some); + + if ( previous.isBackslash ) + { + // By default, keep the backslash present in the BRE, it's still + // needed in the ERE too. + disposition = Disposition_Escape; + + switch ( c.GetValue() ) + { + case '(': + // It's the start of a new subexpression. + current.sinceStart = SinceStart_None; + wxFALLTHROUGH; + + case ')': + case '{': + case '}': + // Do not escape to ensure they remain special in the ERE + // as the escaped versions were special in the BRE. + disposition = Disposition_Append; + break; + } + } + else // This character is not escaped. + { + switch ( c.GetValue() ) + { + case '\\': + current.isBackslash = true; + + // Don't do anything with it yet, we'll deal with it later. + disposition = Disposition_Skip; + break; + + case '^': + // Escape unless it appears at the start. + switch ( previous.sinceStart ) + { + case SinceStart_None: + // Don't escape, but do update the state. + current.sinceStart = SinceStart_OnlyCaret; + break; + + case SinceStart_OnlyCaret: + case SinceStart_Some: + disposition = Disposition_Escape; + break; + } + break; + + case '*': + // Escape unless it appears at the start or right after "^". + switch ( previous.sinceStart ) + { + case SinceStart_None: + case SinceStart_OnlyCaret: + disposition = Disposition_Escape; + break; + + case SinceStart_Some: + break; + } + break; + + case '$': + // Escape unless it appears at the end or just before "\)". + disposition = Disposition_Escape; + { + wxString::const_iterator next = it; + ++next; + if ( next == end ) + { + // It is at the end, so has special meaning. + disposition = Disposition_Append; + } + else // Not at the end, but maybe at subexpression end? + { + if ( *next == '\\' ) + { + ++next; + if ( next != end && *next == ')' ) + disposition = Disposition_Append; + } + } + } + break; + + case '|': + case '+': + case '?': + case '(': + case ')': + case '{': + case '}': + // Escape these characters which are not special in a BRE, + // but would be special in a ERE if left unescaped. + disposition = Disposition_Escape; + break; + + case '[': + // Rules are very different for the characters inside the + // bracket expressions and we don't have to change anything + // for them as the syntax is the same for BREs and EREs, so + // just process the entire expression at once. + { + const wxString::const_iterator start = it; + it = SkipBracketExpression(it, end); + + // Copy everything inside without any changes. + ere += wxString(start, it); + + if ( it == end ) + { + // If we reached the end without finding the + // matching ']' there is nothing remaining anyhow. + return ere; + } + + // Note that default Disposition_Append here is fine, + // we'll append the closing ']' to "ere" below. + } + break; + } + } + + switch ( disposition ) + { + case Disposition_Skip: + break; + + case Disposition_Escape: + ere += '\\'; + wxFALLTHROUGH; + + case Disposition_Append: + // Note: don't use "c" here, iterator may have been advanced + // inside the loop. + ere += *it; + break; + } + + previous = current; + } + + // It's an error if a RE ends with a backslash, but we still need to + // preserve this error in the resulting RE. + if ( previous.isBackslash ) + ere += '\\'; + + return ere; +} + bool wxRegExImpl::Compile(const wxString& expr, int flags) { Reinit(); diff --git a/tests/regex/wxregextest.cpp b/tests/regex/wxregextest.cpp index eaaefe2a5d..3d9780d12d 100644 --- a/tests/regex/wxregextest.cpp +++ b/tests/regex/wxregextest.cpp @@ -170,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]") CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" ); } +TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]") +{ + CHECK( wxRegEx::ConvertFromBasic("\\(a\\)b") == "(a)b" ); + CHECK( wxRegEx::ConvertFromBasic("a\\{0,1\\}b") == "a{0,1}b" ); + CHECK( wxRegEx::ConvertFromBasic("*") == "\\*" ); + CHECK( wxRegEx::ConvertFromBasic("**") == "\\**" ); + CHECK( wxRegEx::ConvertFromBasic("^*") == "^\\*" ); + CHECK( wxRegEx::ConvertFromBasic("^^") == "^\\^" ); + CHECK( wxRegEx::ConvertFromBasic("x$y") == "x\\$y" ); + CHECK( wxRegEx::ConvertFromBasic("$$") == "\\$$" ); + CHECK( wxRegEx::ConvertFromBasic("\\(x$\\)") == "(x$)" ); + CHECK( wxRegEx::ConvertFromBasic("[^$\\)]") == "[^$\\)]" ); +} + #endif // wxUSE_REGEX From 8b812a92aecb2081adc1e42f85d6ce090197854d Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 15 Jul 2021 22:13:21 +0200 Subject: [PATCH 06/13] Always convert to UTF-8 if conversion is necessary When using system regex functions in Unicode build, convert to UTF-8 and not the current locale encoding, as this should work just as well if the conversion doesn't fail and even better if it would fail when converting to the current locale encoding because conversion to UTF-8 never does. --- src/common/regex.cpp | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index ed86b31c0c..55112667ba 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -44,17 +44,11 @@ // WXREGEX_IF_NEED_LEN() wrap the len parameter only used with the built-in // or GNU regex // WXREGEX_CONVERT_TO_MB defined when the regex lib is using chars and -// wxChar is wide, so conversion must be done -// WXREGEX_CHAR(x) Convert wxChar to wxRegChar +// wxChar is wide, so conversion to UTF-8 must be done // #ifdef __REG_NOFRONT # define WXREGEX_USING_BUILTIN # define WXREGEX_IF_NEED_LEN(x) ,x -# if wxUSE_UNICODE -# define WXREGEX_CHAR(x) (x).wc_str() -# else -# define WXREGEX_CHAR(x) (x).mb_str() -# endif #else # ifdef HAVE_RE_SEARCH # define WXREGEX_IF_NEED_LEN(x) ,x @@ -745,14 +739,8 @@ int wxRegExImpl::Replace(wxString *text, const wxChar *textstr = text->c_str(); size_t textlen = text->length(); #else - const wxWX2MBbuf textstr = WXREGEX_CHAR(*text); - if (!textstr) - { - wxLogError(_("Failed to find match for regular expression: %s"), - GetErrorMsg(0, true)); - return 0; - } - size_t textlen = strlen(textstr); + const wxScopedCharBuffer textstr = text->utf8_str(); + size_t textlen = textstr.length(); text->clear(); #endif @@ -841,7 +829,7 @@ int wxRegExImpl::Replace(wxString *text, textstr.data() #endif + matchStart + start, - *wxConvCurrent, len); + wxConvUTF8, len); mayHaveBackrefs = true; } @@ -870,7 +858,7 @@ int wxRegExImpl::Replace(wxString *text, #ifndef WXREGEX_CONVERT_TO_MB result.append(*text, matchStart, start); #else - result.append(wxString(textstr.data() + matchStart, *wxConvCurrent, start)); + result.append(wxString(textstr.data() + matchStart, wxConvUTF8, start)); #endif matchStart += start; result.append(textNew); @@ -883,7 +871,7 @@ int wxRegExImpl::Replace(wxString *text, #ifndef WXREGEX_CONVERT_TO_MB result.append(*text, matchStart, wxString::npos); #else - result.append(wxString(textstr.data() + matchStart, *wxConvCurrent)); + result.append(wxString(textstr.data() + matchStart, wxConvUTF8)); #endif *text = result; @@ -926,8 +914,16 @@ bool wxRegEx::Matches(const wxString& str, int flags) const { wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); - return m_impl->Matches(WXREGEX_CHAR(str), flags - WXREGEX_IF_NEED_LEN(str.length())); +#ifndef WXREGEX_CONVERT_TO_MB + const wxChar* const textstr = str.c_str(); + const size_t textlen = str.length(); +#else + const wxScopedCharBuffer textstr = str.utf8_str(); + const size_t textlen = textstr.length(); +#endif + + return m_impl->Matches(textstr, flags + WXREGEX_IF_NEED_LEN(textlen)); } bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const From 78546007af0c2b83dfa219f48d093ed0f667665a Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 15 Jul 2021 21:54:36 +0200 Subject: [PATCH 07/13] Get rid of WXREGEX_IF_NEED_LEN() in wxRegEx code Define wx_regexec() wrapper so that we can just drop the length if it's not supported in one place, instead of having to use this ugly macro in several places. No real changes. --- src/common/regex.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 55112667ba..d3e888d233 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -41,25 +41,27 @@ // WXREGEX_USING_BUILTIN defined when using the built-in regex lib // WXREGEX_USING_RE_SEARCH defined when using re_search in the GNU regex lib -// WXREGEX_IF_NEED_LEN() wrap the len parameter only used with the built-in -// or GNU regex // WXREGEX_CONVERT_TO_MB defined when the regex lib is using chars and // wxChar is wide, so conversion to UTF-8 must be done // #ifdef __REG_NOFRONT # define WXREGEX_USING_BUILTIN -# define WXREGEX_IF_NEED_LEN(x) ,x #else # ifdef HAVE_RE_SEARCH -# define WXREGEX_IF_NEED_LEN(x) ,x # define WXREGEX_USING_RE_SEARCH # else -# define WXREGEX_IF_NEED_LEN(x) + // We can't use length, so just drop it in this wrapper. + inline int + wx_regexec(const regex_t* preg, const char* string, size_t, + size_t nmatch, regmatch_t* pmatch, int eflags) + { + return regexec(preg, string, nmatch, pmatch, eflags); + } # endif # if wxUSE_UNICODE # define WXREGEX_CONVERT_TO_MB # endif -# define WXREGEX_CHAR(x) (x).mb_str() +# define wx_regcomp regcomp # define wx_regfree regfree # define wx_regerror regerror #endif @@ -151,8 +153,7 @@ public: // RE operations bool Compile(const wxString& expr, int flags = 0); - bool Matches(const wxRegChar *str, int flags - WXREGEX_IF_NEED_LEN(size_t len)) const; + bool Matches(const wxRegChar *str, int flags, size_t len) const; bool GetMatch(size_t *start, size_t *len, size_t index = 0) const; size_t GetMatchCount() const; int Replace(wxString *pattern, const wxString& replacement, @@ -653,8 +654,8 @@ static int ReSearch(const regex_t *preg, #endif // WXREGEX_USING_RE_SEARCH bool wxRegExImpl::Matches(const wxRegChar *str, - int flags - WXREGEX_IF_NEED_LEN(size_t len)) const + int flags, + size_t len) const { wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") ); @@ -683,7 +684,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str, #elif defined WXREGEX_USING_RE_SEARCH int rc = str ? ReSearch(&self->m_RegEx, str, len, matches, flagsRE) : REG_BADPAT; #else - int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT; + int rc = str ? wx_regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT; #endif switch ( rc ) @@ -777,8 +778,8 @@ int wxRegExImpl::Replace(wxString *text, #else textstr.data() + matchStart, #endif - countRepl ? wxRE_NOTBOL : 0 - WXREGEX_IF_NEED_LEN(textlen - matchStart)) ) + countRepl ? wxRE_NOTBOL : 0, + textlen - matchStart) ) { // the string possibly contains back references: we need to calculate // the replacement text anew after each match @@ -922,8 +923,7 @@ bool wxRegEx::Matches(const wxString& str, int flags) const const size_t textlen = textstr.length(); #endif - return m_impl->Matches(textstr, flags - WXREGEX_IF_NEED_LEN(textlen)); + return m_impl->Matches(textstr, flags, textlen); } bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const From 10dce93921e97f497959ca2cce9ac3b021ddeee6 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 15 Jul 2021 22:21:15 +0200 Subject: [PATCH 08/13] Remove handling of conversion errors that can't happen any more Since we always convert to UTF-8, there can be no conversion errors and we don't need to worry about them. --- src/common/regex.cpp | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index d3e888d233..3a081ae075 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -161,7 +161,7 @@ public: private: // return the string containing the error message for the given err code - wxString GetErrorMsg(int errorcode, bool badconv) const; + wxString GetErrorMsg(int errorcode) const; // init the members void Init() @@ -219,19 +219,8 @@ wxRegExImpl::~wxRegExImpl() Free(); } -wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const +wxString wxRegExImpl::GetErrorMsg(int errorcode) const { -#ifdef WXREGEX_CONVERT_TO_MB - // currently only needed when using system library in Unicode mode - if ( badconv ) - { - return _("conversion to 8-bit encoding failed"); - } -#else - // 'use' badconv to avoid a compiler warning - (void)badconv; -#endif - wxString szError; // first get the string length needed @@ -562,20 +551,16 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags) // compile it #ifdef WXREGEX_USING_BUILTIN - bool conv = true; // FIXME-UTF8: use wc_str() after removing ANSI build int errorcode = wx_re_comp(&m_RegEx, expr.c_str(), expr.length(), flagsRE); #else - // FIXME-UTF8: this is potentially broken, we shouldn't even try it - // and should always use builtin regex library (or PCRE?) - const wxWX2MBbuf conv = expr.mbc_str(); - int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT; + int errorcode = wx_regcomp(&m_RegEx, expr.utf8_str(), flagsRE); #endif if ( errorcode ) { wxLogError(_("Invalid regular expression '%s': %s"), - expr, GetErrorMsg(errorcode, !conv)); + expr, GetErrorMsg(errorcode)); m_isCompiled = false; } @@ -682,9 +667,9 @@ bool wxRegExImpl::Matches(const wxRegChar *str, #if defined WXREGEX_USING_BUILTIN int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, matches, flagsRE); #elif defined WXREGEX_USING_RE_SEARCH - int rc = str ? ReSearch(&self->m_RegEx, str, len, matches, flagsRE) : REG_BADPAT; + int rc = ReSearch(&self->m_RegEx, str, len, matches, flagsRE); #else - int rc = str ? wx_regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT; + int rc = wx_regexec(&self->m_RegEx, str, len, m_nMatches, matches, flagsRE); #endif switch ( rc ) @@ -696,7 +681,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str, default: // an error occurred wxLogError(_("Failed to find match for regular expression: %s"), - GetErrorMsg(rc, !str)); + GetErrorMsg(rc)); wxFALLTHROUGH; case REG_NOMATCH: From 3a52523f96e62c9485984c663eab7f10d9623229 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 15 Jul 2021 22:37:42 +0200 Subject: [PATCH 09/13] Simplify code in wxRegExImpl::Replace() Check for WXREGEX_CONVERT_TO_MB once and define textstr variable pointing to the string data instead of using it several times. No real changes. --- src/common/regex.cpp | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 3a081ae075..73ac264dac 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -725,8 +725,9 @@ int wxRegExImpl::Replace(wxString *text, const wxChar *textstr = text->c_str(); size_t textlen = text->length(); #else - const wxScopedCharBuffer textstr = text->utf8_str(); - size_t textlen = textstr.length(); + const wxScopedCharBuffer textbuf = text->utf8_str(); + const char* const textstr = textbuf.data(); + size_t textlen = textbuf.length(); text->clear(); #endif @@ -757,14 +758,9 @@ int wxRegExImpl::Replace(wxString *text, // note that "^" shouldn't match after the first call to Matches() so we // use wxRE_NOTBOL to prevent it from happening while ( (!maxMatches || countRepl < maxMatches) && - Matches( -#ifndef WXREGEX_CONVERT_TO_MB - textstr + matchStart, -#else - textstr.data() + matchStart, -#endif - countRepl ? wxRE_NOTBOL : 0, - textlen - matchStart) ) + Matches(textstr + matchStart, + countRepl ? wxRE_NOTBOL : 0, + textlen - matchStart) ) { // the string possibly contains back references: we need to calculate // the replacement text anew after each match @@ -808,14 +804,8 @@ int wxRegExImpl::Replace(wxString *text, } else { - textNew += wxString( -#ifndef WXREGEX_CONVERT_TO_MB - textstr -#else - textstr.data() -#endif - + matchStart + start, - wxConvUTF8, len); + textNew += wxString(textstr + matchStart + start, + wxConvUTF8, len); mayHaveBackrefs = true; } @@ -841,11 +831,7 @@ int wxRegExImpl::Replace(wxString *text, if (result.capacity() < result.length() + start + textNew.length()) result.reserve(2 * result.length()); -#ifndef WXREGEX_CONVERT_TO_MB - result.append(*text, matchStart, start); -#else - result.append(wxString(textstr.data() + matchStart, wxConvUTF8, start)); -#endif + result.append(wxString(textstr + matchStart, wxConvUTF8, start)); matchStart += start; result.append(textNew); @@ -854,11 +840,7 @@ int wxRegExImpl::Replace(wxString *text, matchStart += len; } -#ifndef WXREGEX_CONVERT_TO_MB - result.append(*text, matchStart, wxString::npos); -#else - result.append(wxString(textstr.data() + matchStart, wxConvUTF8)); -#endif + result.append(wxString(textstr + matchStart, wxConvUTF8)); *text = result; return countRepl; From 838b693b46fbe5f49a11e0d2626736d134f26aaf Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 15 Jul 2021 22:26:58 +0200 Subject: [PATCH 10/13] Fix wxRegEx::GetMatch() to work in WXREGEX_CONVERT_TO_MB case Using the offset into the wide string doesn't work for non-ASCII characters, it must be converted to UTF-8 first. --- src/common/regex.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 73ac264dac..471a983fcc 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -906,7 +906,11 @@ wxString wxRegEx::GetMatch(const wxString& text, size_t index) const if ( !GetMatch(&start, &len, index) ) return wxEmptyString; +#ifndef WXREGEX_CONVERT_TO_MB return text.Mid(start, len); +#else + return wxString::FromUTF8(text.utf8_str().data() + start, len); +#endif } size_t wxRegEx::GetMatchCount() const From d83368664ca606310607e94851dc00a27232388d Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 16 Jul 2021 11:02:48 +0200 Subject: [PATCH 11/13] Remove unnecessary call to clear() in Replace() There doesn't seem to be any need to clear the string in WXREGEX_CONVERT_TO_MB case. --- src/common/regex.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 471a983fcc..118bf0fcb3 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -728,7 +728,6 @@ int wxRegExImpl::Replace(wxString *text, const wxScopedCharBuffer textbuf = text->utf8_str(); const char* const textstr = textbuf.data(); size_t textlen = textbuf.length(); - text->clear(); #endif // the replacement text From fb21f556a70abcf5ecaad1e492dc13f2e83df7d2 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 16 Jul 2021 11:11:11 +0200 Subject: [PATCH 12/13] Use wxCharBuffer instead of manual allocations in GetErrorMsg() Replace new[]/delete[] with a wxCharBuffer object. Also check for conversion failure when converting the error message to wide char string. --- src/common/regex.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 118bf0fcb3..96daab4dab 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -227,14 +227,14 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode) const int len = wx_regerror(errorcode, &m_RegEx, NULL, 0); if ( len > 0 ) { - char* szcmbError = new char[++len]; + wxCharBuffer errbuf(len); - (void)wx_regerror(errorcode, &m_RegEx, szcmbError, len); + (void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length()); - szError = wxConvLibc.cMB2WX(szcmbError); - delete [] szcmbError; + szError = wxConvLibc.cMB2WX(errbuf); } - else // regerror() returned 0 + + if ( szError.empty() ) // regerror() returned 0 or conversion failed { szError = _("unknown error"); } From 4dd77dabe8fe7e968458da2d6d23c6c257150f42 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Fri, 16 Jul 2021 11:21:41 +0200 Subject: [PATCH 13/13] Check for WXREGEX_CONVERT_TO_MB when calling regcomp() too Make code more consistent by using the same approach as in the other places where conversion is necessary in some builds and define temporary variables for clarity. Also use wx_str() when the conversion is not necessary rather than c_str() as this is more efficient and allows to address an existing FIXME comment. --- src/common/regex.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/common/regex.cpp b/src/common/regex.cpp index 96daab4dab..2b784a7314 100644 --- a/src/common/regex.cpp +++ b/src/common/regex.cpp @@ -549,12 +549,18 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags) if ( flags & wxRE_NEWLINE ) flagsRE |= REG_NEWLINE; +#ifndef WXREGEX_CONVERT_TO_MB + const wxChar *exprstr = expr.wx_str(); +#else + const wxScopedCharBuffer exprbuf = expr.utf8_str(); + const char* const exprstr = exprbuf.data(); +#endif + // compile it #ifdef WXREGEX_USING_BUILTIN - // FIXME-UTF8: use wc_str() after removing ANSI build - int errorcode = wx_re_comp(&m_RegEx, expr.c_str(), expr.length(), flagsRE); + int errorcode = wx_re_comp(&m_RegEx, exprstr, expr.length(), flagsRE); #else - int errorcode = wx_regcomp(&m_RegEx, expr.utf8_str(), flagsRE); + int errorcode = wx_regcomp(&m_RegEx, exprstr, flagsRE); #endif if ( errorcode )