Add wxRegEx::ConvertFromBasic() helper
This will be used in upcoming commits to implement support for BREs on top of PCRE, which doesn't support them directly.
This commit is contained in:
@@ -144,6 +144,9 @@ public:
|
||||
|
||||
static wxString QuoteMeta(const wxString& str);
|
||||
|
||||
// return the extended RE (ERE) equivalent to the given basic RE (BRE), i.e.
// the same pattern rewritten to use the default (extended) syntax
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
|
||||
// dtor not virtual, don't derive from this class
|
||||
~wxRegEx();
|
||||
|
||||
|
@@ -276,5 +276,15 @@ public:
|
||||
@since 3.1.3
|
||||
*/
|
||||
static wxString QuoteMeta(const wxString& str);
|
||||
|
||||
/**
    Converts a basic regular expression to the equivalent extended one.

    This function can be used to convert @a bre, written using the
    deprecated wxRE_BASIC syntax, to the default (extended) syntax.

    The conversion removes the backslash from the BRE-specific operators
    "\(", "\)", "\{" and "\}" and adds one in front of the characters
    '|', '+', '?', '(', ')', '{' and '}', as well as in front of any '^',
    '*' or '$' appearing in a position where a BRE treats it as an ordinary
    character. Bracket expressions are copied unchanged.

    @param bre
        The regular expression in the basic syntax.
    @return
        The regular expression in the extended syntax.

    @since 3.1.6
 */
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
};
|
||||
|
||||
|
@@ -24,3 +24,6 @@ expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
|
||||
// 2019), i.e. SEH translator seems to work just fine without /EHa too, so
|
||||
// Purpose: helpers for the structured exception handling (SEH) under Win32
|
||||
* MinGW-w64 versions 7.3 and 8.1 (32-bit binaries use SJLJ exceptions, 64-bit ones use SEH, and all binaries use Win32 threads).
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
This function can be used to convert @a bre using deprecated wxRE_BASIC
|
||||
static wxString ConvertFromBasic(const wxString& bre);
|
||||
|
@@ -258,6 +258,281 @@ wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
|
||||
return szError;
|
||||
}
|
||||
|
||||
// Helper function for processing bracket expressions inside a regex.
|
||||
//
|
||||
// Advance the iterator until the closing bracket matching the opening one the
|
||||
// iterator currently points to, i.e.:
|
||||
//
|
||||
// Precondition: *it == '['
|
||||
// Postcondition: *it == ']' or it == end if failed to find matching ']'
|
||||
static
|
||||
wxString::const_iterator
|
||||
SkipBracketExpression(wxString::const_iterator it, wxString::const_iterator end)
|
||||
{
|
||||
wxASSERT_MSG( *it == '[', "must be at the start of bracket expression" );
|
||||
|
||||
// Initial ']', possibly after the preceding '^', is different because it
|
||||
// stands for a literal ']' and not the end of the bracket expression, so
|
||||
// check for it first.
|
||||
++it;
|
||||
if ( it != end && *it == '^' )
|
||||
++it;
|
||||
if ( it != end && *it == ']' )
|
||||
++it;
|
||||
|
||||
// Any ']' from now on ends the bracket expression.
|
||||
for ( ; it != end; ++it )
|
||||
{
|
||||
const wxUniChar c = *it;
|
||||
|
||||
if ( c == ']' )
|
||||
break;
|
||||
|
||||
if ( c == '[' )
|
||||
{
|
||||
// Bare '[' on its own is not special, but collating elements and
|
||||
// character classes are, so check for them and advance past them
|
||||
// if necessary to avoid misinterpreting the matching closing ']'.
|
||||
if ( ++it == end )
|
||||
break;
|
||||
|
||||
const wxUniChar c = *it;
|
||||
if ( c == ':' || c == '.' || c == '=' )
|
||||
{
|
||||
for ( ++it; it != end; ++it )
|
||||
{
|
||||
if ( *it == c )
|
||||
{
|
||||
if ( ++it == end )
|
||||
break;
|
||||
|
||||
if ( *it == ']' )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( it == end )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return it;
|
||||
}
|
||||
|
||||
/* static */
|
||||
wxString wxRegEx::ConvertFromBasic(const wxString& bre)
|
||||
{
|
||||
/*
|
||||
Quoting regex(7):
|
||||
|
||||
Obsolete ("basic") regular expressions differ in several respects.
|
||||
'|', '+', and '?' are ordinary characters and there is no equivalent
|
||||
for their functionality. The delimiters for bounds are "\{" and "\}",
|
||||
with '{' and '}' by themselves ordinary characters. The parentheses
|
||||
for nested subexpressions are "\(" and "\)", with '(' and ')' by
|
||||
themselves ordinary characters. '^' is an ordinary character except at
|
||||
the beginning of the RE or(!) the beginning of a parenthesized
|
||||
subexpression, '$' is an ordinary character except at the end of the RE
|
||||
or(!) the end of a parenthesized subexpression, and '*' is an ordinary
|
||||
character if it appears at the beginning of the RE or the beginning of
|
||||
a parenthesized subexpression (after a possible leading '^').
|
||||
|
||||
Finally, there is one new type of atom, a back reference: '\' followed
|
||||
by a nonzero decimal digit d matches the same sequence of characters
|
||||
matched by the dth parenthesized subexpression [...]
|
||||
*/
|
||||
wxString ere;
|
||||
ere.reserve(bre.length());
|
||||
|
||||
enum SinceStart
|
||||
{
|
||||
SinceStart_None, // Just at the beginning.
|
||||
SinceStart_OnlyCaret, // Had just "^" since the beginning.
|
||||
SinceStart_Some // Had something else since the beginning.
|
||||
};
|
||||
|
||||
struct State
|
||||
{
|
||||
explicit State(SinceStart sinceStart_)
|
||||
{
|
||||
isBackslash = false;
|
||||
sinceStart = sinceStart_;
|
||||
}
|
||||
|
||||
bool isBackslash;
|
||||
SinceStart sinceStart;
|
||||
};
|
||||
|
||||
State previous(SinceStart_None);
|
||||
for ( wxString::const_iterator it = bre.begin(),
|
||||
end = bre.end();
|
||||
it != end;
|
||||
++it )
|
||||
{
|
||||
const wxUniChar c = *it;
|
||||
|
||||
// What should be done with the current character?
|
||||
enum Disposition
|
||||
{
|
||||
Disposition_Skip, // Nothing.
|
||||
Disposition_Append, // Append to output.
|
||||
Disposition_Escape // ... after escaping it with backslash.
|
||||
} disposition = Disposition_Append;
|
||||
|
||||
State current(SinceStart_Some);
|
||||
|
||||
if ( previous.isBackslash )
|
||||
{
|
||||
// By default, keep the backslash present in the BRE, it's still
|
||||
// needed in the ERE too.
|
||||
disposition = Disposition_Escape;
|
||||
|
||||
switch ( c.GetValue() )
|
||||
{
|
||||
case '(':
|
||||
// It's the start of a new subexpression.
|
||||
current.sinceStart = SinceStart_None;
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case ')':
|
||||
case '{':
|
||||
case '}':
|
||||
// Do not escape to ensure they remain special in the ERE
|
||||
// as the escaped versions were special in the BRE.
|
||||
disposition = Disposition_Append;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else // This character is not escaped.
|
||||
{
|
||||
switch ( c.GetValue() )
|
||||
{
|
||||
case '\\':
|
||||
current.isBackslash = true;
|
||||
|
||||
// Don't do anything with it yet, we'll deal with it later.
|
||||
disposition = Disposition_Skip;
|
||||
break;
|
||||
|
||||
case '^':
|
||||
// Escape unless it appears at the start.
|
||||
switch ( previous.sinceStart )
|
||||
{
|
||||
case SinceStart_None:
|
||||
// Don't escape, but do update the state.
|
||||
current.sinceStart = SinceStart_OnlyCaret;
|
||||
break;
|
||||
|
||||
case SinceStart_OnlyCaret:
|
||||
case SinceStart_Some:
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '*':
|
||||
// Escape unless it appears at the start or right after "^".
|
||||
switch ( previous.sinceStart )
|
||||
{
|
||||
case SinceStart_None:
|
||||
case SinceStart_OnlyCaret:
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
|
||||
case SinceStart_Some:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '$':
|
||||
// Escape unless it appears at the end or just before "\)".
|
||||
disposition = Disposition_Escape;
|
||||
{
|
||||
wxString::const_iterator next = it;
|
||||
++next;
|
||||
if ( next == end )
|
||||
{
|
||||
// It is at the end, so has special meaning.
|
||||
disposition = Disposition_Append;
|
||||
}
|
||||
else // Not at the end, but maybe at subexpression end?
|
||||
{
|
||||
if ( *next == '\\' )
|
||||
{
|
||||
++next;
|
||||
if ( next != end && *next == ')' )
|
||||
disposition = Disposition_Append;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case '|':
|
||||
case '+':
|
||||
case '?':
|
||||
case '(':
|
||||
case ')':
|
||||
case '{':
|
||||
case '}':
|
||||
// Escape these characters which are not special in a BRE,
|
||||
// but would be special in a ERE if left unescaped.
|
||||
disposition = Disposition_Escape;
|
||||
break;
|
||||
|
||||
case '[':
|
||||
// Rules are very different for the characters inside the
|
||||
// bracket expressions and we don't have to change anything
|
||||
// for them as the syntax is the same for BREs and EREs, so
|
||||
// just process the entire expression at once.
|
||||
{
|
||||
const wxString::const_iterator start = it;
|
||||
it = SkipBracketExpression(it, end);
|
||||
|
||||
// Copy everything inside without any changes.
|
||||
ere += wxString(start, it);
|
||||
|
||||
if ( it == end )
|
||||
{
|
||||
// If we reached the end without finding the
|
||||
// matching ']' there is nothing remaining anyhow.
|
||||
return ere;
|
||||
}
|
||||
|
||||
// Note that default Disposition_Append here is fine,
|
||||
// we'll append the closing ']' to "ere" below.
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch ( disposition )
|
||||
{
|
||||
case Disposition_Skip:
|
||||
break;
|
||||
|
||||
case Disposition_Escape:
|
||||
ere += '\\';
|
||||
wxFALLTHROUGH;
|
||||
|
||||
case Disposition_Append:
|
||||
// Note: don't use "c" here, iterator may have been advanced
|
||||
// inside the loop.
|
||||
ere += *it;
|
||||
break;
|
||||
}
|
||||
|
||||
previous = current;
|
||||
}
|
||||
|
||||
// It's an error if a RE ends with a backslash, but we still need to
|
||||
// preserve this error in the resulting RE.
|
||||
if ( previous.isBackslash )
|
||||
ere += '\\';
|
||||
|
||||
return ere;
|
||||
}
|
||||
|
||||
bool wxRegExImpl::Compile(const wxString& expr, int flags)
|
||||
{
|
||||
Reinit();
|
||||
|
@@ -170,4 +170,18 @@ TEST_CASE("wxRegEx::QuoteMeta", "[regex][meta]")
|
||||
CHECK( wxRegEx::QuoteMeta(":foo.*bar") == ":foo\\.\\*bar" );
|
||||
}
|
||||
|
||||
TEST_CASE("wxRegEx::ConvertFromBasic", "[regex][basic]")
{
    // Each entry associates a basic RE with the extended RE which must be
    // returned when converting it.
    static const struct
    {
        const char* bre;
        const char* ere;
    } conversions[] =
    {
        // Escaped parentheses and braces lose their backslashes.
        { "\\(a\\)b",       "(a)b"      },
        { "a\\{0,1\\}b",    "a{0,1}b"   },

        // Leading '*', possibly after '^', is literal, the following ones
        // are not.
        { "*",              "\\*"       },
        { "**",             "\\**"      },
        { "^*",             "^\\*"      },

        // Only the first '^' is an anchor.
        { "^^",             "^\\^"      },

        // Only '$' at the end of the RE or of a subexpression is an anchor.
        { "x$y",            "x\\$y"     },
        { "$$",             "\\$$"      },
        { "\\(x$\\)",       "(x$)"      },

        // Nothing is changed inside a bracket expression.
        { "[^$\\)]",        "[^$\\)]"   },
    };

    const size_t count = sizeof(conversions) / sizeof(conversions[0]);
    for ( size_t n = 0; n < count; ++n )
    {
        CHECK( wxRegEx::ConvertFromBasic(conversions[n].bre) == conversions[n].ere );
    }
}
|
||||
|
||||
#endif // wxUSE_REGEX
|
||||
|
Reference in New Issue
Block a user