Merge branch 'regex-prepare-pcre'
Cleanup and simplify wxRegEx code. See https://github.com/wxWidgets/wxWidgets/pull/2437
This commit is contained in:
@@ -144,6 +144,9 @@ public:
|
||||
|
||||
static wxString QuoteMeta(const wxString& str);
|
||||
|
||||
// return the extended RE corresponding to the given basic RE
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
|
||||
// dtor not virtual, don't derive from this class
|
||||
~wxRegEx();
|
||||
|
||||
|
@@ -276,5 +276,15 @@ public:
|
||||
@since 3.1.3
|
||||
*/
|
||||
static wxString QuoteMeta(const wxString& str);
|
||||
|
||||
/**
|
||||
Converts a basic regular expression to the extended regex syntax.
|
||||
|
||||
This function can be used to convert @a bre using the deprecated wxRE_BASIC
syntax to the default (extended) syntax.
|
||||
|
||||
@since 3.1.6
|
||||
*/
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
};
|
||||
|
||||
|
@@ -24,3 +24,6 @@ expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
|
||||
// 2019), i.e. SEH translator seems to work just fine without /EHa too, so
|
||||
// Purpose: helpers for the structured exception handling (SEH) under Win32
|
||||
* MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
This function can be used to convert @a bre using deprecated wxRE_BASIC
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
|
@@ -41,31 +41,27 @@
|
||||
|
||||
// WXREGEX_USING_BUILTIN defined when using the built-in regex lib
|
||||
// WXREGEX_USING_RE_SEARCH defined when using re_search in the GNU regex lib
|
||||
// WXREGEX_IF_NEED_LEN() wrap the len parameter only used with the built-in
|
||||
// or GNU regex
|
||||
// WXREGEX_CONVERT_TO_MB defined when the regex lib is using chars and
|
||||
// wxChar is wide, so conversion must be done
|
||||
// WXREGEX_CHAR(x) Convert wxChar to wxRegChar
|
||||
// wxChar is wide, so conversion to UTF-8 must be done
|
||||
//
|
||||
#ifdef __REG_NOFRONT
|
||||
# define WXREGEX_USING_BUILTIN
|
||||
# define WXREGEX_IF_NEED_LEN(x) ,x
|
||||
# if wxUSE_UNICODE
|
||||
# define WXREGEX_CHAR(x) (x).wc_str()
|
||||
# else
|
||||
# define WXREGEX_CHAR(x) (x).mb_str()
|
||||
# endif
|
||||
#else
|
||||
# ifdef HAVE_RE_SEARCH
|
||||
# define WXREGEX_IF_NEED_LEN(x) ,x
|
||||
# define WXREGEX_USING_RE_SEARCH
|
||||
# else
|
||||
# define WXREGEX_IF_NEED_LEN(x)
|
||||
// Adapter giving plain POSIX regexec() the same parameter list as the
// length-aware matching functions used elsewhere: the length argument is
// accepted only for signature compatibility and is not forwarded, because
// regexec() has no way to receive it.
inline int
wx_regexec(const regex_t* preg, const char* string, size_t /* unused length */,
           size_t nmatch, regmatch_t* pmatch, int eflags)
{
    const int status = regexec(preg, string, nmatch, pmatch, eflags);
    return status;
}
|
||||
# endif
|
||||
# if wxUSE_UNICODE
|
||||
# define WXREGEX_CONVERT_TO_MB
|
||||
# endif
|
||||
# define WXREGEX_CHAR(x) (x).mb_str()
|
||||
# define wx_regcomp regcomp
|
||||
# define wx_regfree regfree
|
||||
# define wx_regerror regerror
|
||||
#endif
|
||||
@@ -157,8 +153,7 @@ public:
|
||||
|
||||
// RE operations
|
||||
bool Compile(const wxString& expr, int flags = 0);
|
||||
bool Matches(const wxRegChar *str, int flags
|
||||
WXREGEX_IF_NEED_LEN(size_t len)) const;
|
||||
bool Matches(const wxRegChar *str, int flags, size_t len) const;
|
||||
bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
|
||||
size_t GetMatchCount() const;
|
||||
int Replace(wxString *pattern, const wxString& replacement,
|
||||
@@ -166,7 +161,7 @@ public:
|
||||
|
||||
private:
|
||||
// return the string containing the error message for the given err code
|
||||
wxString GetErrorMsg(int errorcode, bool badconv) const;
|
||||
wxString GetErrorMsg(int errorcode) const;
|
||||
|
||||
// init the members
|
||||
void Init()
|
||||
@@ -224,33 +219,22 @@ wxRegExImpl::~wxRegExImpl()
|
||||
Free();
|
||||
}
|
||||
|
||||
wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
|
||||
wxString wxRegExImpl::GetErrorMsg(int errorcode) const
|
||||
{
|
||||
#ifdef WXREGEX_CONVERT_TO_MB
|
||||
// currently only needed when using system library in Unicode mode
|
||||
if ( badconv )
|
||||
{
|
||||
return _("conversion to 8-bit encoding failed");
|
||||
}
|
||||
#else
|
||||
// 'use' badconv to avoid a compiler warning
|
||||
(void)badconv;
|
||||
#endif
|
||||
|
||||
wxString szError;
|
||||
|
||||
// first get the string length needed
|
||||
int len = wx_regerror(errorcode, &m_RegEx, NULL, 0);
|
||||
if ( len > 0 )
|
||||
{
|
||||
char* szcmbError = new char[++len];
|
||||
wxCharBuffer errbuf(len);
|
||||
|
||||
(void)wx_regerror(errorcode, &m_RegEx, szcmbError, len);
|
||||
(void)wx_regerror(errorcode, &m_RegEx, errbuf.data(), errbuf.length());
|
||||
|
||||
szError = wxConvLibc.cMB2WX(szcmbError);
|
||||
delete [] szcmbError;
|
||||
szError = wxConvLibc.cMB2WX(errbuf);
|
||||
}
|
||||
else // regerror() returned 0
|
||||
|
||||
if ( szError.empty() ) // regerror() returned 0 or conversion failed
|
||||
{
|
||||
szError = _("unknown error");
|
||||
}
|
||||
@@ -258,6 +242,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
|
||||
return szError;
|
||||
}
|
||||
|
||||
// Helper function for processing bracket expressions inside a regex.
|
||||
//
|
||||
// Advance the iterator until the closing bracket matching the opening one the
|
||||
// iterator currently points to, i.e.:
|
||||
//
|
||||
// Precondition: *it == '['
|
||||
// Postcondition: *it == ']' or it == end if failed to find matching ']'
|
||||
static
|
||||
wxString::const_iterator
|
||||
SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
|
||||
{
|
||||
wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );
|
||||
|
||||
// Initial ']', possibly after the preceding '^', is different because it
|
||||
// stands for a literal ']' and not the end of the bracket expression, so
|
||||
// check for it first.
|
||||
++it;
|
||||
if ( it != end && *it == '^' )
|
||||
++it;
|
||||
if ( it != end && *it == ']' )
|
||||
++it;
|
||||
|
||||
// Any ']' from now on ends the bracket expression.
|
||||
for ( ; it != end; ++it )
|
||||
{
|
||||
const wxUniChar c = *it;
|
||||
|
||||
if ( c == ']' )
|
||||
break;
|
||||
|
||||
if ( c == '[' )
|
||||
{
|
||||
// Bare '[' on its own is not special, but collating elements and
|
||||
// character classes are, so check for them and advance past them
|
||||
// if necessary to avoid misinterpreting the matching closing ']'.
|
||||
if ( ++it == end )
|
||||
break;
|
||||
|
||||
const wxUniChar c = *it;
|
||||
if ( c == ':' || c == '.' || c == '=' )
|
||||
{
|
||||
for ( ++it; it != end; ++it )
|
||||
{
|
||||
if ( *it == c )
|
||||
{
|
||||
if ( ++it == end )
|
||||
break;
|
||||
|
||||
if ( *it == ']' )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( it == end )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return it;
|
||||
}
|
||||
|
||||
/* static */
|
||||
wxString wxRegEx::ConvertFromBasic(const wxString& bre)
|
||||
{
|
||||
/*
|
||||
Quoting regex(7):
|
||||
|
||||
Obsolete ("basic") regular expressions differ in several respects.
|
||||
'|', '+', and '?' are ordinary characters and there is no equivalent
|
||||
for their functionality. The delimiters for bounds are "\{" and "\}",
|
||||
with '{' and '}' by themselves ordinary characters. The parentheses
|
||||
for nested subexpressions are "\(" and "\)", with '(' and ')' by
|
||||
themselves ordinary characters. '^' is an ordinary character except at
|
||||
the beginning of the RE or(!) the beginning of a parenthesized
|
||||
subexpression, '$' is an ordinary character except at the end of the RE
|
||||
or(!) the end of a parenthesized subexpression, and '*' is an ordinary
|
||||
character if it appears at the beginning of the RE or the beginning of
|
||||
a parenthesized subexpression (after a possible leading '^').
|
||||
|
||||
Finally, there is one new type of atom, a back reference: '\' followed
|
||||
by a nonzero decimal digit d matches the same sequence of characters
|
||||
matched by the dth parenthesized subexpression [...]
|
||||
*/
|
||||
wxString ere;
|
||||
ere.reserve(bre.length());
|
||||
|
||||
enum SinceStart
|
||||
{
|
||||
SinceStart_None, // Just at the beginning.
|
||||
SinceStart_OnlyCaret, // Had just "^" since the beginning.
|
||||
SinceStart_Some // Had something else since the beginning.
|
||||
};
|
||||
|
||||
struct State
|
||||
{
|
||||
explicit State(SinceStart sinceStart_)
|
||||
{
|
||||
isBackslash = false;
|
||||
sinceStart = sinceStart_;
|
||||
}
|
||||
|
||||
bool isBackslash;
|
||||
SinceStart sinceStart;
|
||||
};
|
||||
|
||||
State previous(SinceStart_None);
|
||||
for ( wxString::const_iterator it = bre.begin(),
|
||||
end = bre.end();
|
||||
it != end;
|
||||
++it )
|
||||
{
|
||||
const wxUniChar c = *it;
|
||||
|
||||
// What should be done with the current character?
|
||||
enum Disposition
|
||||
{
|
||||
Disposition_Skip, // Nothing.
|
||||
Disposition_Append, // Append to output.
|
||||
Disposition_Escape // ... after escaping it with backslash.
|
||||
} disposition = Disposition_Append;
|
||||
|
||||
State current(SinceStart_Some);
|
||||
|
||||
if ( previous.isBackslash )
|
||||
{
|
||||
// By default, keep the backslash present in the BRE, it's still
|
||||
// needed in the ERE too.
|
||||
disposition = Disposition_Escape;
|
||||
|
||||
switch ( c.GetValue() )
|
||||
{
|
||||
case '(':
|
||||
// It's the start of a new subexpression.
|
||||
current.sinceStart = SinceStart_None;
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case ')':
|
||||
case '{':
|
||||
case '}':
|
||||
// Do not escape to ensure they remain special in the ERE
|
||||
// as the escaped versions were special in the BRE.
|
||||
disposition = Disposition_Append;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else // This character is not escaped.
|
||||
{
|
||||
switch ( c.GetValue() )
|
||||
{
|
||||
case '\\':
|
||||
current.isBackslash = true;
|
||||
|
||||
// Don't do anything with it yet, we'll deal with it later.
|
||||
disposition = Disposition_Skip;
|
||||
break;
|
||||
|
||||
case '^':
|
||||
// Escape unless it appears at the start.
|
||||
switch ( previous.sinceStart )
|
||||
{
|
||||
case SinceStart_None:
|
||||
// Don't escape, but do update the state.
|
||||
current.sinceStart = SinceStart_OnlyCaret;
|
||||
break;
|
||||
|
||||
case SinceStart_OnlyCaret:
|
||||
case SinceStart_Some:
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '*':
|
||||
// Escape unless it appears at the start or right after "^".
|
||||
switch ( previous.sinceStart )
|
||||
{
|
||||
case SinceStart_None:
|
||||
case SinceStart_OnlyCaret:
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
|
||||
case SinceStart_Some:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '$':
|
||||
// Escape unless it appears at the end or just before "\)".
|
||||
disposition = Disposition_Escape;
|
||||
{
|
||||
wxString::const_iterator next = it;
|
||||
++next;
|
||||
if ( next == end )
|
||||
{
|
||||
// It is at the end, so has special meaning.
|
||||
disposition = Disposition_Append;
|
||||
}
|
||||
else // Not at the end, but maybe at subexpression end?
|
||||
{
|
||||
if ( *next == '\\' )
|
||||
{
|
||||
++next;
|
||||
if ( next != end && *next == ')' )
|
||||
disposition = Disposition_Append;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case '|':
|
||||
case '+':
|
||||
case '?':
|
||||
case '(':
|
||||
case ')':
|
||||
case '{':
|
||||
case '}':
|
||||
// Escape these characters which are not special in a BRE,
|
||||
// but would be special in a ERE if left unescaped.
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
|
||||
case '[':
|
||||
// Rules are very different for the characters inside the
|
||||
// bracket expressions and we don't have to change anything
|
||||
// for them as the syntax is the same for BREs and EREs, so
|
||||
// just process the entire expression at once.
|
||||
{
|
||||
const wxString::const_iterator start = it;
|
||||
it = SkipBracketExpression(it, end);
|
||||
|
||||
// Copy everything inside without any changes.
|
||||
ere += wxString(start, it);
|
||||
|
||||
if ( it == end )
|
||||
{
|
||||
// If we reached the end without finding the
|
||||
// matching ']' there is nothing remaining anyhow.
|
||||
return ere;
|
||||
}
|
||||
|
||||
// Note that default Disposition_Append here is fine,
|
||||
// we'll append the closing ']' to "ere" below.
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch ( disposition )
|
||||
{
|
||||
case Disposition_Skip:
|
||||
break;
|
||||
|
||||
case Disposition_Escape:
|
||||
ere += '\\';
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case Disposition_Append:
|
||||
// Note: don't use "c" here, iterator may have been advanced
|
||||
// inside the loop.
|
||||
ere += *it;
|
||||
break;
|
||||
}
|
||||
|
||||
previous = current;
|
||||
}
|
||||
|
||||
// It's an error if a RE ends with a backslash, but we still need to
|
||||
// preserve this error in the resulting RE.
|
||||
if ( previous.isBackslash )
|
||||
ere += '\\';
|
||||
|
||||
return ere;
|
||||
}
|
||||
|
||||
bool wxRegExImpl::Compile(const wxString& expr, int flags)
|
||||
{
|
||||
Reinit();
|
||||
@@ -290,22 +549,24 @@ bool wxRegExImpl::Compile(const wxString& expr, int flags)
|
||||
if ( flags & wxRE_NEWLINE )
|
||||
flagsRE |= REG_NEWLINE;
|
||||
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
const wxChar *exprstr = expr.wx_str();
|
||||
#else
|
||||
const wxScopedCharBuffer exprbuf = expr.utf8_str();
|
||||
const char* const exprstr = exprbuf.data();
|
||||
#endif
|
||||
|
||||
// compile it
|
||||
#ifdef WXREGEX_USING_BUILTIN
|
||||
bool conv = true;
|
||||
// FIXME-UTF8: use wc_str() after removing ANSI build
|
||||
int errorcode = wx_re_comp(&m_RegEx, expr.c_str(), expr.length(), flagsRE);
|
||||
int errorcode = wx_re_comp(&m_RegEx, exprstr, expr.length(), flagsRE);
|
||||
#else
|
||||
// FIXME-UTF8: this is potentially broken, we shouldn't even try it
|
||||
// and should always use builtin regex library (or PCRE?)
|
||||
const wxWX2MBbuf conv = expr.mbc_str();
|
||||
int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
|
||||
int errorcode = wx_regcomp(&m_RegEx, exprstr, flagsRE);
|
||||
#endif
|
||||
|
||||
if ( errorcode )
|
||||
{
|
||||
wxLogError(_("Invalid regular expression '%s': %s"),
|
||||
expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
|
||||
expr, GetErrorMsg(errorcode));
|
||||
|
||||
m_isCompiled = false;
|
||||
}
|
||||
@@ -384,8 +645,8 @@ static int ReSearch(const regex_t *preg,
|
||||
#endif // WXREGEX_USING_RE_SEARCH
|
||||
|
||||
bool wxRegExImpl::Matches(const wxRegChar *str,
|
||||
int flags
|
||||
WXREGEX_IF_NEED_LEN(size_t len)) const
|
||||
int flags,
|
||||
size_t len) const
|
||||
{
|
||||
wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
|
||||
|
||||
@@ -412,9 +673,9 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
|
||||
#if defined WXREGEX_USING_BUILTIN
|
||||
int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, matches, flagsRE);
|
||||
#elif defined WXREGEX_USING_RE_SEARCH
|
||||
int rc = str ? ReSearch(&self->m_RegEx, str, len, matches, flagsRE) : REG_BADPAT;
|
||||
int rc = ReSearch(&self->m_RegEx, str, len, matches, flagsRE);
|
||||
#else
|
||||
int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT;
|
||||
int rc = wx_regexec(&self->m_RegEx, str, len, m_nMatches, matches, flagsRE);
|
||||
#endif
|
||||
|
||||
switch ( rc )
|
||||
@@ -426,7 +687,7 @@ bool wxRegExImpl::Matches(const wxRegChar *str,
|
||||
default:
|
||||
// an error occurred
|
||||
wxLogError(_("Failed to find match for regular expression: %s"),
|
||||
GetErrorMsg(rc, !str).c_str());
|
||||
GetErrorMsg(rc));
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case REG_NOMATCH:
|
||||
@@ -470,15 +731,9 @@ int wxRegExImpl::Replace(wxString *text,
|
||||
const wxChar *textstr = text->c_str();
|
||||
size_t textlen = text->length();
|
||||
#else
|
||||
const wxWX2MBbuf textstr = WXREGEX_CHAR(*text);
|
||||
if (!textstr)
|
||||
{
|
||||
wxLogError(_("Failed to find match for regular expression: %s"),
|
||||
GetErrorMsg(0, true).c_str());
|
||||
return 0;
|
||||
}
|
||||
size_t textlen = strlen(textstr);
|
||||
text->clear();
|
||||
const wxScopedCharBuffer textbuf = text->utf8_str();
|
||||
const char* const textstr = textbuf.data();
|
||||
size_t textlen = textbuf.length();
|
||||
#endif
|
||||
|
||||
// the replacement text
|
||||
@@ -508,14 +763,9 @@ int wxRegExImpl::Replace(wxString *text,
|
||||
// note that "^" shouldn't match after the first call to Matches() so we
|
||||
// use wxRE_NOTBOL to prevent it from happening
|
||||
while ( (!maxMatches || countRepl < maxMatches) &&
|
||||
Matches(
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
textstr + matchStart,
|
||||
#else
|
||||
textstr.data() + matchStart,
|
||||
#endif
|
||||
countRepl ? wxRE_NOTBOL : 0
|
||||
WXREGEX_IF_NEED_LEN(textlen - matchStart)) )
|
||||
Matches(textstr + matchStart,
|
||||
countRepl ? wxRE_NOTBOL : 0,
|
||||
textlen - matchStart) )
|
||||
{
|
||||
// the string possibly contains back references: we need to calculate
|
||||
// the replacement text anew after each match
|
||||
@@ -559,14 +809,8 @@ int wxRegExImpl::Replace(wxString *text,
|
||||
}
|
||||
else
|
||||
{
|
||||
textNew += wxString(
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
textstr
|
||||
#else
|
||||
textstr.data()
|
||||
#endif
|
||||
+ matchStart + start,
|
||||
*wxConvCurrent, len);
|
||||
textNew += wxString(textstr + matchStart + start,
|
||||
wxConvUTF8, len);
|
||||
|
||||
mayHaveBackrefs = true;
|
||||
}
|
||||
@@ -592,11 +836,7 @@ int wxRegExImpl::Replace(wxString *text,
|
||||
if (result.capacity() < result.length() + start + textNew.length())
|
||||
result.reserve(2 * result.length());
|
||||
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
result.append(*text, matchStart, start);
|
||||
#else
|
||||
result.append(wxString(textstr.data() + matchStart, *wxConvCurrent, start));
|
||||
#endif
|
||||
result.append(wxString(textstr + matchStart, wxConvUTF8, start));
|
||||
matchStart += start;
|
||||
result.append(textNew);
|
||||
|
||||
@@ -605,11 +845,7 @@ int wxRegExImpl::Replace(wxString *text,
|
||||
matchStart += len;
|
||||
}
|
||||
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
result.append(*text, matchStart, wxString::npos);
|
||||
#else
|
||||
result.append(wxString(textstr.data() + matchStart, *wxConvCurrent));
|
||||
#endif
|
||||
result.append(wxString(textstr + matchStart, wxConvUTF8));
|
||||
*text = result;
|
||||
|
||||
return countRepl;
|
||||
@@ -651,8 +887,15 @@ bool wxRegEx::Matches(const wxString& str, int flags) const
|
||||
{
|
||||
wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
|
||||
|
||||
return m_impl->Matches(WXREGEX_CHAR(str), flags
|
||||
WXREGEX_IF_NEED_LEN(str.length()));
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
const wxChar* const textstr = str.c_str();
|
||||
const size_t textlen = str.length();
|
||||
#else
|
||||
const wxScopedCharBuffer textstr = str.utf8_str();
|
||||
const size_t textlen = textstr.length();
|
||||
#endif
|
||||
|
||||
return m_impl->Matches(textstr, flags, textlen);
|
||||
}
|
||||
|
||||
bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
|
||||
@@ -668,7 +911,11 @@ wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
|
||||
if ( !GetMatch(&start, &len, index) )
|
||||
return wxEmptyString;
|
||||
|
||||
#ifndef WXREGEX_CONVERT_TO_MB
|
||||
return text.Mid(start, len);
|
||||
#else
|
||||
return wxString::FromUTF8(text.utf8_str().data() + start, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t wxRegEx::GetMatchCount() const
|
||||
|
@@ -57,6 +57,7 @@ BENCH_OBJECTS = \
|
||||
bench_ipcclient.o \
|
||||
bench_log.o \
|
||||
bench_mbconv.o \
|
||||
bench_regex.o \
|
||||
bench_strings.o \
|
||||
bench_tls.o \
|
||||
bench_printfbench.o
|
||||
@@ -299,6 +300,9 @@ bench_log.o: $(srcdir)/log.cpp
|
||||
bench_mbconv.o: $(srcdir)/mbconv.cpp
|
||||
$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/mbconv.cpp
|
||||
|
||||
bench_regex.o: $(srcdir)/regex.cpp
|
||||
$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/regex.cpp
|
||||
|
||||
bench_strings.o: $(srcdir)/strings.cpp
|
||||
$(CXXC) -c -o $@ $(BENCH_CXXFLAGS) $(srcdir)/strings.cpp
|
||||
|
||||
|
@@ -16,6 +16,7 @@
|
||||
ipcclient.cpp
|
||||
log.cpp
|
||||
mbconv.cpp
|
||||
regex.cpp
|
||||
strings.cpp
|
||||
tls.cpp
|
||||
printfbench.cpp
|
||||
|
@@ -838,6 +838,10 @@
|
||||
RelativePath=".\printfbench.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regex.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\strings.cpp"
|
||||
>
|
||||
|
@@ -810,6 +810,10 @@
|
||||
RelativePath=".\printfbench.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regex.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\strings.cpp"
|
||||
>
|
||||
|
@@ -36,6 +36,7 @@ BENCH_OBJECTS = \
|
||||
$(OBJS)\bench_ipcclient.o \
|
||||
$(OBJS)\bench_log.o \
|
||||
$(OBJS)\bench_mbconv.o \
|
||||
$(OBJS)\bench_regex.o \
|
||||
$(OBJS)\bench_strings.o \
|
||||
$(OBJS)\bench_tls.o \
|
||||
$(OBJS)\bench_printfbench.o
|
||||
@@ -310,6 +311,9 @@ $(OBJS)\bench_log.o: ./log.cpp
|
||||
$(OBJS)\bench_mbconv.o: ./mbconv.cpp
|
||||
$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<
|
||||
|
||||
$(OBJS)\bench_regex.o: ./regex.cpp
|
||||
$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<
|
||||
|
||||
$(OBJS)\bench_strings.o: ./strings.cpp
|
||||
$(CXX) -c -o $@ $(BENCH_CXXFLAGS) $(CPPDEPS) $<
|
||||
|
||||
|
@@ -37,6 +37,7 @@ BENCH_OBJECTS = \
|
||||
$(OBJS)\bench_ipcclient.obj \
|
||||
$(OBJS)\bench_log.obj \
|
||||
$(OBJS)\bench_mbconv.obj \
|
||||
$(OBJS)\bench_regex.obj \
|
||||
$(OBJS)\bench_strings.obj \
|
||||
$(OBJS)\bench_tls.obj \
|
||||
$(OBJS)\bench_printfbench.obj
|
||||
@@ -698,6 +699,9 @@ $(OBJS)\bench_log.obj: .\log.cpp
|
||||
$(OBJS)\bench_mbconv.obj: .\mbconv.cpp
|
||||
$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\mbconv.cpp
|
||||
|
||||
$(OBJS)\bench_regex.obj: .\regex.cpp
|
||||
$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\regex.cpp
|
||||
|
||||
$(OBJS)\bench_strings.obj: .\strings.cpp
|
||||
$(CXX) /c /nologo /TP /Fo$@ $(BENCH_CXXFLAGS) .\strings.cpp
|
||||
|
||||
|
74
tests/benchmarks/regex.cpp
Normal file
74
tests/benchmarks/regex.cpp
Normal file
@@ -0,0 +1,74 @@
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Name: tests/benchmarks/regex.cpp
|
||||
// Purpose: wxRegEx benchmarks
|
||||
// Author: Vadim Zeitlin
|
||||
// Created: 2018-11-15
|
||||
// Copyright: (c) 2018 Vadim Zeitlin <vadim@wxwidgets.org>
|
||||
// Licence: wxWindows licence
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "wx/ffile.h"
|
||||
#include "wx/regex.h"
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Benchmark relative costs of compiling and matching for a simple regex
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
static const char* const RE_SIMPLE = ".";
|
||||
|
||||
// Benchmarks constructing a wxRegEx from RE_SIMPLE; succeeds if the resulting
// object is valid.
BENCHMARK_FUNC(RECompile)
{
    const wxRegEx compiled(RE_SIMPLE);

    return compiled.IsValid();
}
|
||||
|
||||
// Benchmarks matching only: the regex is a function-level static constructed
// once and reused by all subsequent calls.
BENCHMARK_FUNC(REMatch)
{
    static wxRegEx s_compiled(RE_SIMPLE);

    const bool matched = s_compiled.Matches("foo");
    return matched;
}
|
||||
|
||||
// Benchmarks constructing the regex from RE_SIMPLE and matching with it in
// the same call.
BENCHMARK_FUNC(RECompileAndMatch)
{
    const wxRegEx compiled(RE_SIMPLE);

    return compiled.Matches("foo");
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Benchmark the cost of using a more complicated regex
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Use the contents of an already existing test file.
|
||||
const wxString& GetTestText()
|
||||
{
|
||||
static wxString text;
|
||||
if ( text.empty() )
|
||||
{
|
||||
wxFFile("htmltest.html").ReadAll(&text);
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// Benchmarks repeatedly searching the test text for table cells, restarting
// each search just after the end of the previous match, and checks that the
// expected number of matches was found.
BENCHMARK_FUNC(REFindTD)
{
    // This is too simplistic, but good enough for benchmarking.
    static wxRegEx s_cell("<td>[^<]*</td>", wxRE_ICASE | wxRE_NEWLINE);

    const wxChar* pos = GetTestText().c_str();

    int found = 0;
    while ( s_cell.Matches(pos) )
    {
        size_t start, len;
        if ( !s_cell.GetMatch(&start, &len) )
            return false;

        // Continue searching right after the end of this match.
        pos += start + len;

        ++found;
    }

    return found == 21; // result of "grep -c"
}
|
@@ -234,12 +234,18 @@ void RegExTestCase::doTest(int flavor)
|
||||
// 'e' - test that the pattern fails to compile
|
||||
if (m_mode == 'e') {
|
||||
CHECK( !re.IsValid() );
|
||||
} else {
|
||||
CHECK( re.IsValid() );
|
||||
}
|
||||
|
||||
if (!re.IsValid())
|
||||
// Never continue with this kind of test.
|
||||
return;
|
||||
} else {
|
||||
// Note: we don't use REQUIRE here because this would abort the entire
|
||||
// test case on error instead of skipping just the rest of this regex
|
||||
// test.
|
||||
CHECK( re.IsValid() );
|
||||
|
||||
if (!re.IsValid())
|
||||
return;
|
||||
}
|
||||
|
||||
bool matches = re.Matches(m_data, m_matchFlags);
|
||||
|
||||
|
@@ -79,28 +79,33 @@ CheckMatch(const char* pattern,
|
||||
INFO( "Pattern: " << pattern << FlagStr(flags) << ", match: " << text );
|
||||
|
||||
wxRegEx re(pattern, compileFlags);
|
||||
REQUIRE( re.IsValid() );
|
||||
|
||||
bool ok = re.Matches(text, matchFlags);
|
||||
|
||||
if (expected) {
|
||||
REQUIRE( ok );
|
||||
|
||||
wxStringTokenizer tkz(wxString(expected, *wxConvCurrent),
|
||||
wxT("\t"), wxTOKEN_RET_EMPTY);
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) {
|
||||
INFO( "Match #" << i );
|
||||
CHECK( re.GetMatch(text, i) == tkz.GetNextToken() );
|
||||
}
|
||||
|
||||
if ((flags & wxRE_NOSUB) == 0)
|
||||
CHECK(re.GetMatchCount() == i);
|
||||
if ( !re.IsValid() )
|
||||
{
|
||||
FAIL("Regex compilation failed");
|
||||
return;
|
||||
}
|
||||
else {
|
||||
CHECK( !ok );
|
||||
|
||||
if ( !re.Matches(text, matchFlags) )
|
||||
{
|
||||
CHECK( !expected );
|
||||
return;
|
||||
}
|
||||
|
||||
CHECK( expected );
|
||||
if ( !expected )
|
||||
return;
|
||||
|
||||
wxStringTokenizer tkz(wxString(expected, *wxConvCurrent),
|
||||
wxT("\t"), wxTOKEN_RET_EMPTY);
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < re.GetMatchCount() && tkz.HasMoreTokens(); i++) {
|
||||
INFO( "Match #" << i );
|
||||
CHECK( re.GetMatch(text, i) == tkz.GetNextToken() );
|
||||
}
|
||||
|
||||
if ((flags & wxRE_NOSUB) == 0)
|
||||
CHECK(re.GetMatchCount() == i);
|
||||
}
|
||||
|
||||
TEST_CASE("wxRegEx::Match", "[regex][match]")
|
||||
@@ -165,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
|
||||
CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
|
||||
}
|
||||
|
||||
// Checks wxRegEx::ConvertFromBasic() against a table pairing each basic RE
// with the extended RE it is expected to be converted to.
TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
{
    static const struct
    {
        const char* basic;
        const char* extended;
    } conversions[] =
    {
        { "\\(a\\)b",       "(a)b"      },
        { "a\\{0,1\\}b",    "a{0,1}b"   },
        { "*",              "\\*"       },
        { "**",             "\\**"      },
        { "^*",             "^\\*"      },
        { "^^",             "^\\^"      },
        { "x$y",            "x\\$y"     },
        { "$$",             "\\$$"      },
        { "\\(x$\\)",       "(x$)"      },
        { "[^$\\)]",        "[^$\\)]"   },
    };

    for ( size_t n = 0; n < sizeof(conversions)/sizeof(conversions[0]); ++n )
    {
        CHECK( wxRegEx::ConvertFromBasic(conversions[n].basic) ==
                conversions[n].extended );
    }
}
|
||||
|
||||
#endif // wxUSE_REGEX
|
||||
|
Reference in New Issue
Block a user