Tcl regex lib
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@3274 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
970
src/regex/re_syntax.n
Normal file
970
src/regex/re_syntax.n
Normal file
@@ -0,0 +1,970 @@
|
||||
'\"
|
||||
'\" Copyright (c) 1998 Sun Microsystems, Inc.
|
||||
'\" Copyright (c) 1999 Scriptics Corporation
|
||||
'\"
|
||||
'\" This software is copyrighted by the Regents of the University of
|
||||
'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
|
||||
'\" Corporation and other parties. The following terms apply to all files
|
||||
'\" associated with the software unless explicitly disclaimed in
|
||||
'\" individual files.
|
||||
'\"
|
||||
'\" The authors hereby grant permission to use, copy, modify, distribute,
|
||||
'\" and license this software and its documentation for any purpose, provided
|
||||
'\" that existing copyright notices are retained in all copies and that this
|
||||
'\" notice is included verbatim in any distributions. No written agreement,
|
||||
'\" license, or royalty fee is required for any of the authorized uses.
|
||||
'\" Modifications to this software may be copyrighted by their authors
|
||||
'\" and need not follow the licensing terms described here, provided that
|
||||
'\" the new terms are clearly indicated on the first page of each file where
|
||||
'\" they apply.
|
||||
'\"
|
||||
'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
|
||||
'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||
'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
|
||||
'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
|
||||
'\" POSSIBILITY OF SUCH DAMAGE.
|
||||
'\"
|
||||
'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
||||
'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
|
||||
'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
|
||||
'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
||||
'\" MODIFICATIONS.
|
||||
'\"
|
||||
'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
|
||||
'\" U.S. government, the Government shall have only "Restricted Rights"
|
||||
'\" in the software and related documentation as defined in the Federal
|
||||
'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
|
||||
'\" are acquiring the software on behalf of the Department of Defense, the
|
||||
'\" software shall be classified as "Commercial Computer Software" and the
|
||||
'\" Government shall have only "Restricted Rights" as defined in Clause
|
||||
'\" 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
|
||||
'\" authors grant the U.S. Government and others acting in its behalf
|
||||
'\" permission to use and distribute the software in accordance with the
|
||||
'\" terms specified in this license.
|
||||
'\"
|
||||
'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp
|
||||
'\"
|
||||
.so man.macros
|
||||
.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
|
||||
.BS
|
||||
.SH NAME
|
||||
re_syntax \- Syntax of Tcl regular expressions.
|
||||
.BE
|
||||
|
||||
.SH DESCRIPTION
|
||||
.PP
|
||||
A \fIregular expression\fR describes strings of characters.
|
||||
It's a pattern that matches certain strings and doesn't match others.
|
||||
|
||||
.SH "DIFFERENT FLAVORS OF REs"
|
||||
Regular expressions (``RE''s), as defined by POSIX, come in two
|
||||
flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
|
||||
EREs are roughly those of the traditional \fIegrep\fR, while BREs are
|
||||
roughly those of the traditional \fIed\fR. This implementation adds
|
||||
a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
|
||||
some significant extensions.
|
||||
.PP
|
||||
This manual page primarily describes AREs. BREs mostly exist for
|
||||
backward compatibility in some old programs; they will be discussed at
|
||||
the end. POSIX EREs are almost an exact subset of AREs. Features of
|
||||
AREs that are not present in EREs will be indicated.
|
||||
|
||||
.SH "REGULAR EXPRESSION SYNTAX"
|
||||
.PP
|
||||
Tcl regular expressions are implemented using the package written by
|
||||
Henry Spencer, based on the 1003.2 spec and some (not quite all) of
|
||||
the Perl5 extensions (thanks, Henry!). Much of the description of
|
||||
regular expressions below is copied verbatim from his manual entry.
|
||||
.PP
|
||||
An ARE is one or more \fIbranches\fR,
|
||||
separated by `\fB|\fR',
|
||||
matching anything that matches any of the branches.
|
||||
.PP
|
||||
A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
|
||||
concatenated.
|
||||
It matches a match for the first, followed by a match for the second, etc;
|
||||
an empty branch matches the empty string.
|
||||
.PP
|
||||
A quantified atom is an \fIatom\fR possibly followed
|
||||
by a single \fIquantifier\fR.
|
||||
Without a quantifier, it matches a match for the atom.
|
||||
The quantifiers,
|
||||
and what a so-quantified atom matches, are:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB*\fR
|
||||
a sequence of 0 or more matches of the atom
|
||||
.TP
|
||||
\fB+\fR
|
||||
a sequence of 1 or more matches of the atom
|
||||
.TP
|
||||
\fB?\fR
|
||||
a sequence of 0 or 1 matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB}\fR
|
||||
a sequence of exactly \fIm\fR matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB,}\fR
|
||||
a sequence of \fIm\fR or more matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB,\fIn\fB}\fR
|
||||
a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
|
||||
\fIm\fR may not exceed \fIn\fR
|
||||
.TP
|
||||
\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR
|
||||
\fInon-greedy\fR quantifiers,
|
||||
which match the same possibilities,
|
||||
but prefer the smallest number rather than the largest number
|
||||
of matches (see MATCHING)
|
||||
.RE
|
||||
.PP
|
||||
The forms using
|
||||
\fB{\fR and \fB}\fR
|
||||
are known as \fIbound\fRs.
|
||||
The numbers
|
||||
\fIm\fR and \fIn\fR are unsigned decimal integers
|
||||
with permissible values from 0 to 255 inclusive.
|
||||
.PP
|
||||
An atom is one of:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB(\fIre\fB)\fR
|
||||
(where \fIre\fR is any regular expression)
|
||||
matches a match for
|
||||
\fIre\fR, with the match noted for possible reporting
|
||||
.TP
|
||||
\fB(?:\fIre\fB)\fR
|
||||
as previous,
|
||||
but does no reporting
|
||||
(a ``non-capturing'' set of parentheses)
|
||||
.TP
|
||||
\fB()\fR
|
||||
matches an empty string,
|
||||
noted for possible reporting
|
||||
.TP
|
||||
\fB(?:)\fR
|
||||
matches an empty string,
|
||||
without reporting
|
||||
.TP
|
||||
\fB[\fIchars\fB]\fR
|
||||
a \fIbracket expression\fR,
|
||||
matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
|
||||
.TP
|
||||
\fB.\fR
|
||||
matches any single character
|
||||
.TP
|
||||
\fB\e\fIk\fR
|
||||
(where \fIk\fR is a non-alphanumeric character)
|
||||
matches that character taken as an ordinary character,
|
||||
e.g. \e\e matches a backslash character
|
||||
.TP
|
||||
\fB\e\fIc\fR
|
||||
where \fIc\fR is alphanumeric
|
||||
(possibly followed by other characters),
|
||||
an \fIescape\fR (AREs only),
|
||||
see ESCAPES below
|
||||
.TP
|
||||
\fB{\fR
|
||||
when followed by a character other than a digit,
|
||||
matches the left-brace character `\fB{\fR';
|
||||
when followed by a digit, it is the beginning of a
|
||||
\fIbound\fR (see above)
|
||||
.TP
|
||||
\fIx\fR
|
||||
where \fIx\fR is
|
||||
a single character with no other significance, matches that character.
|
||||
.RE
|
||||
.PP
|
||||
A \fIconstraint\fR matches an empty string when specific conditions
|
||||
are met.
|
||||
A constraint may not be followed by a quantifier.
|
||||
The simple constraints are as follows; some more constraints are
|
||||
described later, under ESCAPES.
|
||||
.RS 2
|
||||
.TP 8
|
||||
\fB^\fR
|
||||
matches at the beginning of a line
|
||||
.TP
|
||||
\fB$\fR
|
||||
matches at the end of a line
|
||||
.TP
|
||||
\fB(?=\fIre\fB)\fR
|
||||
\fIpositive lookahead\fR (AREs only), matches at any point
|
||||
where a substring matching \fIre\fR begins
|
||||
.TP
|
||||
\fB(?!\fIre\fB)\fR
|
||||
\fInegative lookahead\fR (AREs only), matches at any point
|
||||
where no substring matching \fIre\fR begins
|
||||
.RE
|
||||
.PP
|
||||
The lookahead constraints may not contain back references (see later),
|
||||
and all parentheses within them are considered non-capturing.
|
||||
.PP
|
||||
An RE may not end with `\fB\e\fR'.
|
||||
|
||||
.SH "BRACKET EXPRESSIONS"
|
||||
A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
|
||||
It normally matches any single character from the list (but see below).
|
||||
If the list begins with `\fB^\fR',
|
||||
it matches any single character
|
||||
(but see below) \fInot\fR from the rest of the list.
|
||||
.PP
|
||||
If two characters in the list are separated by `\fB\-\fR',
|
||||
this is shorthand
|
||||
for the full \fIrange\fR of characters between those two (inclusive) in the
|
||||
collating sequence,
|
||||
e.g.
|
||||
\fB[0\-9]\fR
|
||||
in ASCII matches any decimal digit.
|
||||
Two ranges may not share an
|
||||
endpoint, so e.g.
|
||||
\fBa\-c\-e\fR
|
||||
is illegal.
|
||||
Ranges are very collating-sequence-dependent,
|
||||
and portable programs should avoid relying on them.
|
||||
.PP
|
||||
To include a literal
|
||||
\fB]\fR
|
||||
or
|
||||
\fB\-\fR
|
||||
in the list,
|
||||
the simplest method is to
|
||||
enclose it in
|
||||
\fB[.\fR and \fB.]\fR
|
||||
to make it a collating element (see below).
|
||||
Alternatively,
|
||||
make it the first character
|
||||
(following a possible `\fB^\fR'),
|
||||
or (AREs only) precede it with `\fB\e\fR'.
|
||||
Alternatively, for `\fB\-\fR',
|
||||
make it the last character,
|
||||
or the second endpoint of a range.
|
||||
To use a literal
|
||||
\fB\-\fR
|
||||
as the first endpoint of a range,
|
||||
make it a collating element
|
||||
or (AREs only) precede it with `\fB\e\fR'.
|
||||
With the exception of these, some combinations using
|
||||
\fB[\fR
|
||||
(see next
|
||||
paragraphs), and escapes,
|
||||
all other special characters lose their
|
||||
special significance within a bracket expression.
|
||||
.PP
|
||||
Within a bracket expression, a collating element (a character,
|
||||
a multi-character sequence that collates as if it were a single character,
|
||||
or a collating-sequence name for either)
|
||||
enclosed in
|
||||
\fB[.\fR and \fB.]\fR
|
||||
stands for the
|
||||
sequence of characters of that collating element.
|
||||
The sequence is a single element of the bracket expression's list.
|
||||
A bracket expression in a locale that has
|
||||
multi-character collating elements
|
||||
can thus match more than one character.
|
||||
.VS 8.2
|
||||
So (insidiously), a bracket expression that starts with \fB^\fR
|
||||
can match multi-character collating elements even if none of them
|
||||
appear in the bracket expression!
|
||||
(\fINote:\fR Tcl currently has no multi-character collating elements.
|
||||
This information is only for illustration.)
|
||||
.PP
|
||||
For example, assume the collating sequence includes a \fBch\fR
|
||||
multi-character collating element.
|
||||
Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
|
||||
matches the first five characters of `\fBchchcc\fR'.
|
||||
Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
|
||||
(because \fB[^c]\fR matches the multi-character \fBch\fR).
|
||||
.VE 8.2
|
||||
.PP
|
||||
Within a bracket expression, a collating element enclosed in
|
||||
\fB[=\fR
|
||||
and
|
||||
\fB=]\fR
|
||||
is an equivalence class, standing for the sequences of characters
|
||||
of all collating elements equivalent to that one, including itself.
|
||||
(If there are no other equivalent collating elements,
|
||||
the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
|
||||
and `\fB.]\fR'.)
|
||||
For example, if
|
||||
\fBo\fR
|
||||
and
|
||||
\fB\o'o^'\fR
|
||||
are the members of an equivalence class,
|
||||
then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
|
||||
and `\fB[o\o'o^']\fR'\&
|
||||
are all synonymous.
|
||||
An equivalence class may not be an endpoint
|
||||
of a range.
|
||||
.VS 8.2
|
||||
(\fINote:\fR
|
||||
Tcl currently implements only the Unicode locale.
|
||||
It doesn't define any equivalence classes.
|
||||
The examples above are just illustrations.)
|
||||
.VE 8.2
|
||||
.PP
|
||||
Within a bracket expression, the name of a \fIcharacter class\fR enclosed
|
||||
in
|
||||
\fB[:\fR
|
||||
and
|
||||
\fB:]\fR
|
||||
stands for the list of all characters
|
||||
(not all collating elements!)
|
||||
belonging to that
|
||||
class.
|
||||
Standard character classes are:
|
||||
.PP
|
||||
.RS
|
||||
.ne 5
|
||||
.nf
|
||||
.ta 3c
|
||||
\fBalpha\fR A letter.
|
||||
\fBupper\fR An upper-case letter.
|
||||
\fBlower\fR A lower-case letter.
|
||||
\fBdigit\fR A decimal digit.
|
||||
\fBxdigit\fR A hexadecimal digit.
|
||||
\fBalnum\fR An alphanumeric (letter or digit).
|
||||
\fBprint\fR An alphanumeric (same as alnum).
|
||||
\fBblank\fR A space or tab character.
|
||||
\fBspace\fR A character producing white space in displayed text.
|
||||
\fBpunct\fR A punctuation character.
|
||||
\fBgraph\fR A character with a visible representation.
|
||||
\fBcntrl\fR A control character.
|
||||
.fi
|
||||
.RE
|
||||
.PP
|
||||
A locale may provide others.
|
||||
.VS 8.2
|
||||
(Note that the current Tcl implementation has only one locale:
|
||||
the Unicode locale.)
|
||||
.VE 8.2
|
||||
A character class may not be used as an endpoint of a range.
|
||||
.PP
|
||||
There are two special cases of bracket expressions:
|
||||
the bracket expressions
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
are constraints, matching empty strings at
|
||||
the beginning and end of a word respectively.
|
||||
'\" note, discussion of escapes below references this definition of word
|
||||
A word is defined as a sequence of
|
||||
word characters
|
||||
that is neither preceded nor followed by
|
||||
word characters.
|
||||
A word character is an
|
||||
\fIalnum\fR
|
||||
character
|
||||
or an underscore
|
||||
(\fB_\fR).
|
||||
These special bracket expressions are deprecated;
|
||||
users of AREs should use constraint escapes instead (see below).
|
||||
.SH ESCAPES
|
||||
Escapes (AREs only), which begin with a
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character,
|
||||
come in several varieties:
|
||||
character entry, class shorthands, constraint escapes, and back references.
|
||||
A
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character but not constituting
|
||||
a valid escape is illegal in AREs.
|
||||
In EREs, there are no escapes:
|
||||
outside a bracket expression,
|
||||
a
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character merely stands for that
|
||||
character as an ordinary character,
|
||||
and inside a bracket expression,
|
||||
\fB\e\fR
|
||||
is an ordinary character.
|
||||
(The latter is the one actual incompatibility between EREs and AREs.)
|
||||
.PP
|
||||
Character-entry escapes (AREs only) exist to make it easier to specify
|
||||
non-printing and otherwise inconvenient characters in REs:
|
||||
.RS 2
|
||||
.TP 5
|
||||
\fB\ea\fR
|
||||
alert (bell) character, as in C
|
||||
.TP
|
||||
\fB\eb\fR
|
||||
backspace, as in C
|
||||
.TP
|
||||
\fB\eB\fR
|
||||
synonym for
|
||||
\fB\e\fR
|
||||
to help reduce backslash doubling in some
|
||||
applications where there are multiple levels of backslash processing
|
||||
.TP
|
||||
\fB\ec\fIX\fR
|
||||
(where X is any character) the character whose
|
||||
low-order 5 bits are the same as those of
|
||||
\fIX\fR,
|
||||
and whose other bits are all zero
|
||||
.TP
|
||||
\fB\ee\fR
|
||||
the character whose collating-sequence name
|
||||
is `\fBESC\fR',
|
||||
or failing that, the character with octal value 033
|
||||
.TP
|
||||
\fB\ef\fR
|
||||
formfeed, as in C
|
||||
.TP
|
||||
\fB\en\fR
|
||||
newline, as in C
|
||||
.TP
|
||||
\fB\er\fR
|
||||
carriage return, as in C
|
||||
.TP
|
||||
\fB\et\fR
|
||||
horizontal tab, as in C
|
||||
.TP
|
||||
\fB\eu\fIwxyz\fR
|
||||
(where
|
||||
\fIwxyz\fR
|
||||
is exactly four hexadecimal digits)
|
||||
the Unicode character
|
||||
\fBU+\fIwxyz\fR
|
||||
in the local byte ordering
|
||||
.TP
|
||||
\fB\eU\fIstuvwxyz\fR
|
||||
(where
|
||||
\fIstuvwxyz\fR
|
||||
is exactly eight hexadecimal digits)
|
||||
reserved for a somewhat-hypothetical Unicode extension to 32 bits
|
||||
.TP
|
||||
\fB\ev\fR
|
||||
vertical tab, as in C
|
||||
are all available.
|
||||
.TP
|
||||
\fB\ex\fIhhh\fR
|
||||
(where
|
||||
\fIhhh\fR
|
||||
is any sequence of hexadecimal digits)
|
||||
the character whose hexadecimal value is
|
||||
\fB0x\fIhhh\fR
|
||||
(a single character no matter how many hexadecimal digits are used).
|
||||
.TP
|
||||
\fB\e0\fR
|
||||
the character whose value is
|
||||
\fB0\fR
|
||||
.TP
|
||||
\fB\e\fIxy\fR
|
||||
(where
|
||||
\fIxy\fR
|
||||
is exactly two octal digits,
|
||||
and is not a
|
||||
\fIback reference\fR (see below))
|
||||
the character whose octal value is
|
||||
\fB0\fIxy\fR
|
||||
.TP
|
||||
\fB\e\fIxyz\fR
|
||||
(where
|
||||
\fIxyz\fR
|
||||
is exactly three octal digits,
|
||||
and is not a
|
||||
back reference (see below))
|
||||
the character whose octal value is
|
||||
\fB0\fIxyz\fR
|
||||
.RE
|
||||
.PP
|
||||
Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
|
||||
and `\fBA\fR'-`\fBF\fR'.
|
||||
Octal digits are `\fB0\fR'-`\fB7\fR'.
|
||||
.PP
|
||||
The character-entry escapes are always taken as ordinary characters.
|
||||
For example,
|
||||
\fB\e135\fR
|
||||
is
|
||||
\fB]\fR
|
||||
in ASCII,
|
||||
but
|
||||
\fB\e135\fR
|
||||
does not terminate a bracket expression.
|
||||
Beware, however, that some applications (e.g., C compilers) interpret
|
||||
such sequences themselves before the regular-expression package
|
||||
gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
|
||||
.PP
|
||||
Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
|
||||
character classes:
|
||||
.RS 2
|
||||
.TP 10
|
||||
\fB\ed\fR
|
||||
\fB[[:digit:]]\fR
|
||||
.TP
|
||||
\fB\es\fR
|
||||
\fB[[:space:]]\fR
|
||||
.TP
|
||||
\fB\ew\fR
|
||||
\fB[[:alnum:]_]\fR
|
||||
(note underscore)
|
||||
.TP
|
||||
\fB\eD\fR
|
||||
\fB[^[:digit:]]\fR
|
||||
.TP
|
||||
\fB\eS\fR
|
||||
\fB[^[:space:]]\fR
|
||||
.TP
|
||||
\fB\eW\fR
|
||||
\fB[^[:alnum:]_]\fR
|
||||
(note underscore)
|
||||
.RE
|
||||
.PP
|
||||
Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
|
||||
and `\fB\ew\fR'\&
|
||||
lose their outer brackets,
|
||||
and `\fB\eD\fR', `\fB\eS\fR',
|
||||
and `\fB\eW\fR'\&
|
||||
are illegal.
|
||||
.VS 8.2
|
||||
(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
|
||||
Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
|
||||
.VE 8.2
|
||||
.PP
|
||||
A constraint escape (AREs only) is a constraint,
|
||||
matching the empty string if specific conditions are met,
|
||||
written as an escape:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB\eA\fR
|
||||
matches only at the beginning of the string
|
||||
(see MATCHING, below, for how this differs from `\fB^\fR')
|
||||
.TP
|
||||
\fB\em\fR
|
||||
matches only at the beginning of a word
|
||||
.TP
|
||||
\fB\eM\fR
|
||||
matches only at the end of a word
|
||||
.TP
|
||||
\fB\ey\fR
|
||||
matches only at the beginning or end of a word
|
||||
.TP
|
||||
\fB\eY\fR
|
||||
matches only at a point that is not the beginning or end of a word
|
||||
.TP
|
||||
\fB\eZ\fR
|
||||
matches only at the end of the string
|
||||
(see MATCHING, below, for how this differs from `\fB$\fR')
|
||||
.TP
|
||||
\fB\e\fIm\fR
|
||||
(where
|
||||
\fIm\fR
|
||||
is a nonzero digit) a \fIback reference\fR, see below
|
||||
.TP
|
||||
\fB\e\fImnn\fR
|
||||
(where
|
||||
\fIm\fR
|
||||
is a nonzero digit, and
|
||||
\fInn\fR
|
||||
is some more digits,
|
||||
and the decimal value
|
||||
\fImnn\fR
|
||||
is not greater than the number of closing capturing parentheses seen so far)
|
||||
a \fIback reference\fR, see below
|
||||
.RE
|
||||
.PP
|
||||
A word is defined as in the specification of
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
above.
|
||||
Constraint escapes are illegal within bracket expressions.
|
||||
.PP
|
||||
A back reference (AREs only) matches the same string matched by the parenthesized
|
||||
subexpression specified by the number,
|
||||
so that (e.g.)
|
||||
\fB([bc])\e1\fR
|
||||
matches
|
||||
\fBbb\fR
|
||||
or
|
||||
\fBcc\fR
|
||||
but not `\fBbc\fR'.
|
||||
The subexpression must entirely precede the back reference in the RE.
|
||||
Subexpressions are numbered in the order of their leading parentheses.
|
||||
Non-capturing parentheses do not define subexpressions.
|
||||
.PP
|
||||
There is an inherent historical ambiguity between octal character-entry
|
||||
escapes and back references, which is resolved by heuristics,
|
||||
as hinted at above.
|
||||
A leading zero always indicates an octal escape.
|
||||
A single non-zero digit, not followed by another digit,
|
||||
is always taken as a back reference.
|
||||
A multi-digit sequence not starting with a zero is taken as a back
|
||||
reference if it comes after a suitable subexpression
|
||||
(i.e. the number is in the legal range for a back reference),
|
||||
and otherwise is taken as octal.
|
||||
.SH "METASYNTAX"
|
||||
In addition to the main syntax described above, there are some special
|
||||
forms and miscellaneous syntactic facilities available.
|
||||
.PP
|
||||
Normally the flavor of RE being used is specified by
|
||||
application-dependent means.
|
||||
However, this can be overridden by a \fIdirector\fR.
|
||||
If an RE of any flavor begins with `\fB***:\fR',
|
||||
the rest of the RE is an ARE.
|
||||
If an RE of any flavor begins with `\fB***=\fR',
|
||||
the rest of the RE is taken to be a literal string,
|
||||
with all characters considered ordinary characters.
|
||||
.PP
|
||||
An ARE may begin with \fIembedded options\fR:
|
||||
a sequence
|
||||
\fB(?\fIxyz\fB)\fR
|
||||
(where
|
||||
\fIxyz\fR
|
||||
is one or more alphabetic characters)
|
||||
specifies options affecting the rest of the RE.
|
||||
These supplement, and can override,
|
||||
any options specified by the application.
|
||||
The available option letters are:
|
||||
.RS 2
|
||||
.TP 3
|
||||
\fBb\fR
|
||||
rest of RE is a BRE
|
||||
.TP 3
|
||||
\fBc\fR
|
||||
case-sensitive matching (usual default)
|
||||
.TP 3
|
||||
\fBe\fR
|
||||
rest of RE is an ERE
|
||||
.TP 3
|
||||
\fBi\fR
|
||||
case-insensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBm\fR
|
||||
historical synonym for
|
||||
\fBn\fR
|
||||
.TP 3
|
||||
\fBn\fR
|
||||
newline-sensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBp\fR
|
||||
partial newline-sensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBq\fR
|
||||
rest of RE is a literal (``quoted'') string, all ordinary characters
|
||||
.TP 3
|
||||
\fBs\fR
|
||||
non-newline-sensitive matching (usual default)
|
||||
.TP 3
|
||||
\fBt\fR
|
||||
tight syntax (usual default; see below)
|
||||
.TP 3
|
||||
\fBw\fR
|
||||
inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBx\fR
|
||||
expanded syntax (see below)
|
||||
.RE
|
||||
.PP
|
||||
Embedded options take effect at the
|
||||
\fB)\fR
|
||||
terminating the sequence.
|
||||
They are available only at the start of an ARE,
|
||||
and may not be used later within it.
|
||||
.PP
|
||||
In addition to the usual (\fItight\fR) RE syntax, in which all characters are
|
||||
significant, there is an \fIexpanded\fR syntax,
|
||||
available in all flavors of RE
|
||||
with the \fB-expanded\fR switch, or in AREs with the embedded x option.
|
||||
In the expanded syntax,
|
||||
white-space characters are ignored
|
||||
and all characters between a
|
||||
\fB#\fR
|
||||
and the following newline (or the end of the RE) are ignored,
|
||||
permitting paragraphing and commenting a complex RE.
|
||||
There are three exceptions to that basic rule:
|
||||
.RS 2
|
||||
.PP
|
||||
a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
|
||||
.PP
|
||||
white space or `\fB#\fR' within a bracket expression is retained
|
||||
.PP
|
||||
white space and comments are illegal within multi-character symbols
|
||||
like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
|
||||
.RE
|
||||
.PP
|
||||
Expanded-syntax white-space characters are blank, tab, newline, and
|
||||
.VS 8.2
|
||||
any character that belongs to the \fIspace\fR character class.
|
||||
.VE 8.2
|
||||
.PP
|
||||
Finally, in an ARE,
|
||||
outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
|
||||
(where
|
||||
\fIttt\fR
|
||||
is any text not containing a `\fB)\fR')
|
||||
is a comment,
|
||||
completely ignored.
|
||||
Again, this is not allowed between the characters of
|
||||
multi-character symbols like `\fB(?:\fR'.
|
||||
Such comments are more a historical artifact than a useful facility,
|
||||
and their use is deprecated;
|
||||
use the expanded syntax instead.
|
||||
.PP
|
||||
\fINone\fR of these metasyntax extensions is available if the application
|
||||
(or an initial
|
||||
\fB***=\fR
|
||||
director)
|
||||
has specified that the user's input be treated as a literal string
|
||||
rather than as an RE.
|
||||
.SH MATCHING
|
||||
In the event that an RE could match more than one substring of a given
|
||||
string,
|
||||
the RE matches the one starting earliest in the string.
|
||||
If the RE could match more than one substring starting at that point,
|
||||
its choice is determined by its \fIpreference\fR:
|
||||
either the longest substring, or the shortest.
|
||||
.PP
|
||||
Most atoms, and all constraints, have no preference.
|
||||
A parenthesized RE has the same preference (possibly none) as the RE.
|
||||
A quantified atom with quantifier
|
||||
\fB{\fIm\fB}\fR
|
||||
or
|
||||
\fB{\fIm\fB}?\fR
|
||||
has the same preference (possibly none) as the atom itself.
|
||||
A quantified atom with other normal quantifiers (including
|
||||
\fB{\fIm\fB,\fIn\fB}\fR
|
||||
with
|
||||
\fIm\fR
|
||||
equal to
|
||||
\fIn\fR)
|
||||
prefers longest match.
|
||||
A quantified atom with other non-greedy quantifiers (including
|
||||
\fB{\fIm\fB,\fIn\fB}?\fR
|
||||
with
|
||||
\fIm\fR
|
||||
equal to
|
||||
\fIn\fR)
|
||||
prefers shortest match.
|
||||
A branch has the same preference as the first quantified atom in it
|
||||
which has a preference.
|
||||
An RE consisting of two or more branches connected by the
|
||||
\fB|\fR
|
||||
operator prefers longest match.
|
||||
.PP
|
||||
Subject to the constraints imposed by the rules for matching the whole RE,
|
||||
subexpressions also match the longest or shortest possible substrings,
|
||||
based on their preferences,
|
||||
with subexpressions starting earlier in the RE taking priority over
|
||||
ones starting later.
|
||||
Note that outer subexpressions thus take priority over
|
||||
their component subexpressions.
|
||||
.PP
|
||||
Note that the quantifiers
|
||||
\fB{1,1}\fR
|
||||
and
|
||||
\fB{1,1}?\fR
|
||||
can be used to force longest and shortest preference, respectively,
|
||||
on a subexpression or a whole RE.
|
||||
.PP
|
||||
Match lengths are measured in characters, not collating elements.
|
||||
An empty string is considered longer than no match at all.
|
||||
For example,
|
||||
\fBbb*\fR
|
||||
matches the three middle characters of `\fBabbbc\fR',
|
||||
\fB(week|wee)(night|knights)\fR
|
||||
matches all ten characters of `\fBweeknights\fR',
|
||||
when
|
||||
\fB(.*).*\fR
|
||||
is matched against
|
||||
\fBabc\fR
|
||||
the parenthesized subexpression
|
||||
matches all three characters, and
|
||||
when
|
||||
\fB(a*)*\fR
|
||||
is matched against
|
||||
\fBbc\fR
|
||||
both the whole RE and the parenthesized
|
||||
subexpression match an empty string.
|
||||
.PP
|
||||
If case-independent matching is specified,
|
||||
the effect is much as if all case distinctions had vanished from the
|
||||
alphabet.
|
||||
When an alphabetic that exists in multiple cases appears as an
|
||||
ordinary character outside a bracket expression, it is effectively
|
||||
transformed into a bracket expression containing both cases,
|
||||
so that
|
||||
\fBx\fR
|
||||
becomes `\fB[xX]\fR'.
|
||||
When it appears inside a bracket expression, all case counterparts
|
||||
of it are added to the bracket expression, so that
|
||||
\fB[x]\fR
|
||||
becomes
|
||||
\fB[xX]\fR
|
||||
and
|
||||
\fB[^x]\fR
|
||||
becomes `\fB[^xX]\fR'.
|
||||
.PP
|
||||
If newline-sensitive matching is specified, \fB.\fR
|
||||
and bracket expressions using
|
||||
\fB^\fR
|
||||
will never match the newline character
|
||||
(so that matches will never cross newlines unless the RE
|
||||
explicitly arranges it)
|
||||
and
|
||||
\fB^\fR
|
||||
and
|
||||
\fB$\fR
|
||||
will match the empty string after and before a newline
|
||||
respectively, in addition to matching at beginning and end of string
|
||||
respectively.
|
||||
ARE
|
||||
\fB\eA\fR
|
||||
and
|
||||
\fB\eZ\fR
|
||||
continue to match beginning or end of string \fIonly\fR.
|
||||
.PP
|
||||
If partial newline-sensitive matching is specified,
|
||||
this affects \fB.\fR
|
||||
and bracket expressions
|
||||
as with newline-sensitive matching, but not
|
||||
\fB^\fR
|
||||
and `\fB$\fR'.
|
||||
.PP
|
||||
If inverse partial newline-sensitive matching is specified,
|
||||
this affects
|
||||
\fB^\fR
|
||||
and
|
||||
\fB$\fR
|
||||
as with
|
||||
newline-sensitive matching,
|
||||
but not \fB.\fR
|
||||
and bracket expressions.
|
||||
This isn't very useful but is provided for symmetry.
|
||||
.SH "LIMITS AND COMPATIBILITY"
|
||||
No particular limit is imposed on the length of REs.
|
||||
Programs intended to be highly portable should not employ REs longer
|
||||
than 256 bytes,
|
||||
as a POSIX-compliant implementation can refuse to accept such REs.
|
||||
.PP
|
||||
The only feature of AREs that is actually incompatible with
|
||||
POSIX EREs is that
|
||||
\fB\e\fR
|
||||
does not lose its special
|
||||
significance inside bracket expressions.
|
||||
All other ARE features use syntax which is illegal or has
|
||||
undefined or unspecified effects in POSIX EREs;
|
||||
the
|
||||
\fB***\fR
|
||||
syntax of directors likewise is outside the POSIX
|
||||
syntax for both BREs and EREs.
|
||||
.PP
|
||||
Many of the ARE extensions are borrowed from Perl, but some have
|
||||
been changed to clean them up, and a few Perl extensions are not present.
|
||||
Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
|
||||
the lack of special treatment for a trailing newline,
|
||||
the addition of complemented bracket expressions to the things
|
||||
affected by newline-sensitive matching,
|
||||
the restrictions on parentheses and back references in lookahead constraints,
|
||||
and the longest/shortest-match (rather than first-match) matching semantics.
|
||||
.PP
|
||||
The matching rules for REs containing both normal and non-greedy quantifiers
|
||||
have changed since early beta-test versions of this package.
|
||||
(The new rules are much simpler and cleaner,
|
||||
but don't work as hard at guessing the user's real intentions.)
|
||||
.PP
|
||||
Henry Spencer's original 1986 \fIregexp\fR package,
|
||||
still in widespread use (e.g., in pre-8.1 releases of Tcl),
|
||||
implemented an early version of today's EREs.
|
||||
There are four incompatibilities between \fIregexp\fR's near-EREs
|
||||
(`RREs' for short) and AREs.
|
||||
In roughly increasing order of significance:
|
||||
.PP
|
||||
.RS
|
||||
In AREs,
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character is either an
|
||||
escape or an error,
|
||||
while in RREs, it was just another way of writing the
|
||||
alphanumeric.
|
||||
This should not be a problem because there was no reason to write
|
||||
such a sequence in RREs.
|
||||
.PP
|
||||
\fB{\fR
|
||||
followed by a digit in an ARE is the beginning of a bound,
|
||||
while in RREs,
|
||||
\fB{\fR
|
||||
was always an ordinary character.
|
||||
Such sequences should be rare,
|
||||
and will often result in an error because following characters
|
||||
will not look like a valid bound.
|
||||
.PP
|
||||
In AREs,
|
||||
\fB\e\fR
|
||||
remains a special character within `\fB[\|]\fR',
|
||||
so a literal
|
||||
\fB\e\fR
|
||||
within
|
||||
\fB[\|]\fR
|
||||
must be written `\fB\e\e\fR'.
|
||||
\fB\e\e\fR
|
||||
also gives a literal
|
||||
\fB\e\fR
|
||||
within
|
||||
\fB[\|]\fR
|
||||
in RREs,
|
||||
but only truly paranoid programmers routinely doubled the backslash.
|
||||
.PP
|
||||
AREs report the longest/shortest match for the RE,
|
||||
rather than the first found in a specified search order.
|
||||
This may affect some RREs which were written in the expectation that
|
||||
the first match would be reported.
|
||||
(The careful crafting of RREs to optimize the search order for fast
|
||||
matching is obsolete (AREs examine all possible matches
|
||||
in parallel, and their performance is largely insensitive to their
|
||||
complexity) but cases where the search order was exploited to deliberately
|
||||
find a match which was \fInot\fR the longest/shortest will need rewriting.)
|
||||
.RE
|
||||
|
||||
.SH "BASIC REGULAR EXPRESSIONS"
|
||||
BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR',
|
||||
and
|
||||
\fB?\fR
|
||||
are ordinary characters and there is no equivalent
|
||||
for their functionality.
|
||||
The delimiters for bounds are
|
||||
\fB\e{\fR
|
||||
and `\fB\e}\fR',
|
||||
with
|
||||
\fB{\fR
|
||||
and
|
||||
\fB}\fR
|
||||
by themselves ordinary characters.
|
||||
The parentheses for nested subexpressions are
|
||||
\fB\e(\fR
|
||||
and `\fB\e)\fR',
|
||||
with
|
||||
\fB(\fR
|
||||
and
|
||||
\fB)\fR
|
||||
by themselves ordinary characters.
|
||||
\fB^\fR
|
||||
is an ordinary character except at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression,
|
||||
\fB$\fR
|
||||
is an ordinary character except at the end of the
|
||||
RE or the end of a parenthesized subexpression,
|
||||
and
|
||||
\fB*\fR
|
||||
is an ordinary character if it appears at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression
|
||||
(after a possible leading `\fB^\fR').
|
||||
Finally,
|
||||
single-digit back references are available,
|
||||
and
|
||||
\fB\e<\fR
|
||||
and
|
||||
\fB\e>\fR
|
||||
are synonyms for
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
respectively;
|
||||
no other escapes are available.
|
||||
|
||||
.SH "SEE ALSO"
|
||||
RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
|
||||
|
||||
.SH KEYWORDS
|
||||
match, regular expression, string
|
1559
src/regex/regc_nfa.c
Normal file
1559
src/regex/regc_nfa.c
Normal file
File diff suppressed because it is too large
Load Diff
699
src/regex/rege_dfa.c
Normal file
699
src/regex/rege_dfa.c
Normal file
@@ -0,0 +1,699 @@
|
||||
/*
|
||||
* DFA routines
|
||||
* This file is #included by regexec.c.
|
||||
*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* $Header$
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* longest - longest-preferred matching engine
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
longest(struct vars * v, /* used only for debug and exec flags */
|
||||
struct dfa * d,
|
||||
chr *start, /* where the match should start */
|
||||
chr *stop, /* match must end at or before here */
|
||||
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
||||
{
|
||||
chr *cp;
|
||||
chr *realstop = (stop == v->stop) ? stop : stop + 1;
|
||||
color co;
|
||||
struct sset *css;
|
||||
struct sset *ss;
|
||||
chr *post;
|
||||
int i;
|
||||
struct colormap *cm = d->cm;
|
||||
|
||||
/* initialize */
|
||||
css = initialize(v, d, start);
|
||||
cp = start;
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 0;
|
||||
|
||||
/* startup */
|
||||
FDEBUG(("+++ startup +++\n"));
|
||||
if (cp == v->start)
|
||||
{
|
||||
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long) co));
|
||||
}
|
||||
else
|
||||
{
|
||||
co = GETCOLOR(cm, *(cp - 1));
|
||||
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
||||
}
|
||||
css = miss(v, d, css, co, cp, start);
|
||||
if (css == NULL)
|
||||
return NULL;
|
||||
css->lastseen = cp;
|
||||
|
||||
/* main loop */
|
||||
if (v->eflags & REG_FTRACE)
|
||||
while (cp < realstop)
|
||||
{
|
||||
FDEBUG(("+++ at c%d +++\n", css - d->ssets));
|
||||
co = GETCOLOR(cm, *cp);
|
||||
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL)
|
||||
{
|
||||
ss = miss(v, d, css, co, cp + 1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
}
|
||||
else
|
||||
while (cp < realstop)
|
||||
{
|
||||
co = GETCOLOR(cm, *cp);
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL)
|
||||
{
|
||||
ss = miss(v, d, css, co, cp + 1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
}
|
||||
|
||||
/* shutdown */
|
||||
FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
|
||||
if (cp == v->stop && stop == v->stop)
|
||||
{
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 1;
|
||||
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long) co));
|
||||
ss = miss(v, d, css, co, cp, start);
|
||||
/* special case: match ended at eol? */
|
||||
if (ss != NULL && (ss->flags & POSTSTATE))
|
||||
return cp;
|
||||
else if (ss != NULL)
|
||||
ss->lastseen = cp; /* to be tidy */
|
||||
}
|
||||
|
||||
/* find last match, if any */
|
||||
post = d->lastpost;
|
||||
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
||||
if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
|
||||
(post == NULL || post < ss->lastseen))
|
||||
post = ss->lastseen;
|
||||
if (post != NULL) /* found one */
|
||||
return post - 1;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* shortest - shortest-preferred matching engine
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
shortest(struct vars * v,
|
||||
struct dfa * d,
|
||||
chr *start, /* where the match should start */
|
||||
chr *min, /* match must end at or after here */
|
||||
chr *max, /* match must end at or before here */
|
||||
chr **coldp, /* store coldstart pointer here, if
|
||||
* nonNULL */
|
||||
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
||||
{
|
||||
chr *cp;
|
||||
chr *realmin = (min == v->stop) ? min : min + 1;
|
||||
chr *realmax = (max == v->stop) ? max : max + 1;
|
||||
color co;
|
||||
struct sset *css;
|
||||
struct sset *ss;
|
||||
struct colormap *cm = d->cm;
|
||||
|
||||
/* initialize */
|
||||
css = initialize(v, d, start);
|
||||
cp = start;
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 0;
|
||||
|
||||
/* startup */
|
||||
FDEBUG(("--- startup ---\n"));
|
||||
if (cp == v->start)
|
||||
{
|
||||
co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long) co));
|
||||
}
|
||||
else
|
||||
{
|
||||
co = GETCOLOR(cm, *(cp - 1));
|
||||
FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
|
||||
}
|
||||
css = miss(v, d, css, co, cp, start);
|
||||
if (css == NULL)
|
||||
return NULL;
|
||||
css->lastseen = cp;
|
||||
ss = css;
|
||||
|
||||
/* main loop */
|
||||
if (v->eflags & REG_FTRACE)
|
||||
while (cp < realmax)
|
||||
{
|
||||
FDEBUG(("--- at c%d ---\n", css - d->ssets));
|
||||
co = GETCOLOR(cm, *cp);
|
||||
FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL)
|
||||
{
|
||||
ss = miss(v, d, css, co, cp + 1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
else
|
||||
while (cp < realmax)
|
||||
{
|
||||
co = GETCOLOR(cm, *cp);
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL)
|
||||
{
|
||||
ss = miss(v, d, css, co, cp + 1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
if ((ss->flags & POSTSTATE) && cp >= realmin)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
|
||||
if (ss == NULL)
|
||||
return NULL;
|
||||
|
||||
if (coldp != NULL) /* report last no-progress state set, if
|
||||
* any */
|
||||
*coldp = lastcold(v, d);
|
||||
|
||||
if ((ss->flags & POSTSTATE) && cp > min)
|
||||
{
|
||||
assert(cp >= realmin);
|
||||
cp--;
|
||||
}
|
||||
else if (cp == v->stop && max == v->stop)
|
||||
{
|
||||
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long) co));
|
||||
ss = miss(v, d, css, co, cp, start);
|
||||
/* match might have ended at eol */
|
||||
if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
|
||||
*hitstopp = 1;
|
||||
}
|
||||
|
||||
if (ss == NULL || !(ss->flags & POSTSTATE))
|
||||
return NULL;
|
||||
|
||||
return cp;
|
||||
}
|
||||
|
||||
/*
|
||||
* lastcold - determine last point at which no progress had been made
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
lastcold(struct vars * v,
|
||||
struct dfa * d)
|
||||
{
|
||||
struct sset *ss;
|
||||
chr *nopr;
|
||||
int i;
|
||||
|
||||
nopr = d->lastnopr;
|
||||
if (nopr == NULL)
|
||||
nopr = v->start;
|
||||
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
||||
if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
|
||||
nopr = ss->lastseen;
|
||||
return nopr;
|
||||
}
|
||||
|
||||
/*
|
||||
* newdfa - set up a fresh DFA
|
||||
*/
|
||||
static struct dfa *
|
||||
newdfa(struct vars * v,
|
||||
struct cnfa * cnfa,
|
||||
struct colormap * cm,
|
||||
struct smalldfa * small) /* preallocated space, may be NULL */
|
||||
{
|
||||
struct dfa *d;
|
||||
size_t nss = cnfa->nstates * 2;
|
||||
int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
|
||||
struct smalldfa *smallwas = small;
|
||||
|
||||
assert(cnfa != NULL && cnfa->nstates != 0);
|
||||
|
||||
if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
|
||||
{
|
||||
assert(wordsper == 1);
|
||||
if (small == NULL)
|
||||
{
|
||||
small = (struct smalldfa *) MALLOC(
|
||||
sizeof(struct smalldfa));
|
||||
if (small == NULL)
|
||||
{
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
d = &small->dfa;
|
||||
d->ssets = small->ssets;
|
||||
d->statesarea = small->statesarea;
|
||||
d->work = &d->statesarea[nss];
|
||||
d->outsarea = small->outsarea;
|
||||
d->incarea = small->incarea;
|
||||
d->cptsmalloced = 0;
|
||||
d->mallocarea = (smallwas == NULL) ? (char *) small : NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
d = (struct dfa *) MALLOC(sizeof(struct dfa));
|
||||
if (d == NULL)
|
||||
{
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
|
||||
d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
|
||||
sizeof(unsigned));
|
||||
d->work = &d->statesarea[nss * wordsper];
|
||||
d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
|
||||
sizeof(struct sset *));
|
||||
d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
|
||||
sizeof(struct arcp));
|
||||
d->cptsmalloced = 1;
|
||||
d->mallocarea = (char *) d;
|
||||
if (d->ssets == NULL || d->statesarea == NULL ||
|
||||
d->outsarea == NULL || d->incarea == NULL)
|
||||
{
|
||||
freedfa(d);
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
|
||||
d->nssused = 0;
|
||||
d->nstates = cnfa->nstates;
|
||||
d->ncolors = cnfa->ncolors;
|
||||
d->wordsper = wordsper;
|
||||
d->cnfa = cnfa;
|
||||
d->cm = cm;
|
||||
d->lastpost = NULL;
|
||||
d->lastnopr = NULL;
|
||||
d->search = d->ssets;
|
||||
|
||||
/* initialization of sset fields is done as needed */
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
/*
|
||||
* freedfa - free a DFA
|
||||
*/
|
||||
static void
|
||||
freedfa(struct dfa * d)
|
||||
{
|
||||
if (d->cptsmalloced)
|
||||
{
|
||||
if (d->ssets != NULL)
|
||||
FREE(d->ssets);
|
||||
if (d->statesarea != NULL)
|
||||
FREE(d->statesarea);
|
||||
if (d->outsarea != NULL)
|
||||
FREE(d->outsarea);
|
||||
if (d->incarea != NULL)
|
||||
FREE(d->incarea);
|
||||
}
|
||||
|
||||
if (d->mallocarea != NULL)
|
||||
FREE(d->mallocarea);
|
||||
}
|
||||
|
||||
/*
|
||||
* hash - construct a hash code for a bitvector
|
||||
*
|
||||
* There are probably better ways, but they're more expensive.
|
||||
*/
|
||||
static unsigned
|
||||
hash(unsigned *uv,
|
||||
int n)
|
||||
{
|
||||
int i;
|
||||
unsigned h;
|
||||
|
||||
h = 0;
|
||||
for (i = 0; i < n; i++)
|
||||
h ^= uv[i];
|
||||
return h;
|
||||
}
|
||||
|
||||
/*
|
||||
* initialize - hand-craft a cache entry for startup, otherwise get ready
|
||||
*/
|
||||
static struct sset *
|
||||
initialize(struct vars * v, /* used only for debug flags */
|
||||
struct dfa * d,
|
||||
chr *start)
|
||||
{
|
||||
struct sset *ss;
|
||||
int i;
|
||||
|
||||
/* is previous one still there? */
|
||||
if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
|
||||
ss = &d->ssets[0];
|
||||
else
|
||||
{ /* no, must (re)build it */
|
||||
ss = getvacant(v, d, start, start);
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
ss->states[i] = 0;
|
||||
BSET(ss->states, d->cnfa->pre);
|
||||
ss->hash = HASH(ss->states, d->wordsper);
|
||||
assert(d->cnfa->pre != d->cnfa->post);
|
||||
ss->flags = STARTER | LOCKED | NOPROGRESS;
|
||||
/* lastseen dealt with below */
|
||||
}
|
||||
|
||||
for (i = 0; i < d->nssused; i++)
|
||||
d->ssets[i].lastseen = NULL;
|
||||
ss->lastseen = start; /* maybe untrue, but harmless */
|
||||
d->lastpost = NULL;
|
||||
d->lastnopr = NULL;
|
||||
return ss;
|
||||
}
|
||||
|
||||
/*
|
||||
* miss - handle a cache miss
|
||||
*/
|
||||
static struct sset * /* NULL if goes to empty set */
|
||||
miss(struct vars * v, /* used only for debug flags */
|
||||
struct dfa * d,
|
||||
struct sset * css,
|
||||
pcolor co,
|
||||
chr *cp, /* next chr */
|
||||
chr *start) /* where the attempt got started */
|
||||
{
|
||||
struct cnfa *cnfa = d->cnfa;
|
||||
int i;
|
||||
unsigned h;
|
||||
struct carc *ca;
|
||||
struct sset *p;
|
||||
int ispost;
|
||||
int noprogress;
|
||||
int gotstate;
|
||||
int dolacons;
|
||||
int sawlacons;
|
||||
|
||||
/* for convenience, we can be called even if it might not be a miss */
|
||||
if (css->outs[co] != NULL)
|
||||
{
|
||||
FDEBUG(("hit\n"));
|
||||
return css->outs[co];
|
||||
}
|
||||
FDEBUG(("miss\n"));
|
||||
|
||||
/* first, what set of states would we end up in? */
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
d->work[i] = 0;
|
||||
ispost = 0;
|
||||
noprogress = 1;
|
||||
gotstate = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(css->states, i))
|
||||
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
|
||||
if (ca->co == co)
|
||||
{
|
||||
BSET(d->work, ca->to);
|
||||
gotstate = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d -> %d\n", i, ca->to));
|
||||
}
|
||||
dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0;
|
||||
sawlacons = 0;
|
||||
while (dolacons)
|
||||
{ /* transitive closure */
|
||||
dolacons = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(d->work, i))
|
||||
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
|
||||
ca++)
|
||||
{
|
||||
if (ca->co <= cnfa->ncolors)
|
||||
continue; /* NOTE CONTINUE */
|
||||
sawlacons = 1;
|
||||
if (ISBSET(d->work, ca->to))
|
||||
continue; /* NOTE CONTINUE */
|
||||
if (!lacon(v, cnfa, cp, ca->co))
|
||||
continue; /* NOTE CONTINUE */
|
||||
BSET(d->work, ca->to);
|
||||
dolacons = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d :> %d\n", i, ca->to));
|
||||
}
|
||||
}
|
||||
if (!gotstate)
|
||||
return NULL;
|
||||
h = HASH(d->work, d->wordsper);
|
||||
|
||||
/* next, is that in the cache? */
|
||||
for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
|
||||
if (HIT(h, d->work, p, d->wordsper))
|
||||
{
|
||||
FDEBUG(("cached c%d\n", p - d->ssets));
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
if (i == 0)
|
||||
{ /* nope, need a new cache entry */
|
||||
p = getvacant(v, d, cp, start);
|
||||
assert(p != css);
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
p->states[i] = d->work[i];
|
||||
p->hash = h;
|
||||
p->flags = (ispost) ? POSTSTATE : 0;
|
||||
if (noprogress)
|
||||
p->flags |= NOPROGRESS;
|
||||
/* lastseen to be dealt with by caller */
|
||||
}
|
||||
|
||||
if (!sawlacons)
|
||||
{ /* lookahead conds. always cache miss */
|
||||
FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
|
||||
css->outs[co] = p;
|
||||
css->inchain[co] = p->ins;
|
||||
p->ins.ss = css;
|
||||
p->ins.co = (color) co;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
|
||||
* lacon - lookahead-constraint checker for miss()
|
||||
*/
|
||||
static int /* predicate: constraint satisfied? */
|
||||
lacon(struct vars * v,
|
||||
struct cnfa * pcnfa, /* parent cnfa */
|
||||
chr *cp,
|
||||
pcolor co) /* "color" of the lookahead constraint */
|
||||
{
|
||||
int n;
|
||||
struct subre *sub;
|
||||
struct dfa *d;
|
||||
struct smalldfa sd;
|
||||
chr *end;
|
||||
|
||||
n = co - pcnfa->ncolors;
|
||||
assert(n < v->g->nlacons && v->g->lacons != NULL);
|
||||
FDEBUG(("=== testing lacon %d\n", n));
|
||||
sub = &v->g->lacons[n];
|
||||
d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
|
||||
if (d == NULL)
|
||||
{
|
||||
ERR(REG_ESPACE);
|
||||
return 0;
|
||||
}
|
||||
end = longest(v, d, cp, v->stop, (int *) NULL);
|
||||
freedfa(d);
|
||||
FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
|
||||
return (sub->subno) ? (end != NULL) : (end == NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* getvacant - get a vacant state set
|
||||
* This routine clears out the inarcs and outarcs, but does not otherwise
|
||||
* clear the innards of the state set -- that's up to the caller.
|
||||
*/
|
||||
static struct sset *
|
||||
getvacant(struct vars * v, /* used only for debug flags */
|
||||
struct dfa * d,
|
||||
chr *cp,
|
||||
chr *start)
|
||||
{
|
||||
int i;
|
||||
struct sset *ss;
|
||||
struct sset *p;
|
||||
struct arcp ap;
|
||||
struct arcp lastap;
|
||||
color co;
|
||||
|
||||
ss = pickss(v, d, cp, start);
|
||||
assert(!(ss->flags & LOCKED));
|
||||
|
||||
/* clear out its inarcs, including self-referential ones */
|
||||
ap = ss->ins;
|
||||
while ((p = ap.ss) != NULL)
|
||||
{
|
||||
co = ap.co;
|
||||
FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long) co));
|
||||
p->outs[co] = NULL;
|
||||
ap = p->inchain[co];
|
||||
p->inchain[co].ss = NULL; /* paranoia */
|
||||
}
|
||||
ss->ins.ss = NULL;
|
||||
|
||||
/* take it off the inarc chains of the ssets reached by its outarcs */
|
||||
for (i = 0; i < d->ncolors; i++)
|
||||
{
|
||||
p = ss->outs[i];
|
||||
assert(p != ss); /* not self-referential */
|
||||
if (p == NULL)
|
||||
continue; /* NOTE CONTINUE */
|
||||
FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
|
||||
if (p->ins.ss == ss && p->ins.co == i)
|
||||
p->ins = ss->inchain[i];
|
||||
else
|
||||
{
|
||||
assert(p->ins.ss != NULL);
|
||||
for (ap = p->ins; ap.ss != NULL &&
|
||||
!(ap.ss == ss && ap.co == i);
|
||||
ap = ap.ss->inchain[ap.co])
|
||||
lastap = ap;
|
||||
assert(ap.ss != NULL);
|
||||
lastap.ss->inchain[lastap.co] = ss->inchain[i];
|
||||
}
|
||||
ss->outs[i] = NULL;
|
||||
ss->inchain[i].ss = NULL;
|
||||
}
|
||||
|
||||
/* if ss was a success state, may need to remember location */
|
||||
if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
|
||||
(d->lastpost == NULL || d->lastpost < ss->lastseen))
|
||||
d->lastpost = ss->lastseen;
|
||||
|
||||
/* likewise for a no-progress state */
|
||||
if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
|
||||
(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
|
||||
d->lastnopr = ss->lastseen;
|
||||
|
||||
return ss;
|
||||
}
|
||||
|
||||
/*
|
||||
* pickss - pick the next stateset to be used
|
||||
*/
|
||||
static struct sset *
|
||||
pickss(struct vars * v, /* used only for debug flags */
|
||||
struct dfa * d,
|
||||
chr *cp,
|
||||
chr *start)
|
||||
{
|
||||
int i;
|
||||
struct sset *ss;
|
||||
struct sset *end;
|
||||
chr *ancient;
|
||||
|
||||
/* shortcut for cases where cache isn't full */
|
||||
if (d->nssused < d->nssets)
|
||||
{
|
||||
i = d->nssused;
|
||||
d->nssused++;
|
||||
ss = &d->ssets[i];
|
||||
FDEBUG(("new c%d\n", i));
|
||||
/* set up innards */
|
||||
ss->states = &d->statesarea[i * d->wordsper];
|
||||
ss->flags = 0;
|
||||
ss->ins.ss = NULL;
|
||||
ss->ins.co = WHITE; /* give it some value */
|
||||
ss->outs = &d->outsarea[i * d->ncolors];
|
||||
ss->inchain = &d->incarea[i * d->ncolors];
|
||||
for (i = 0; i < d->ncolors; i++)
|
||||
{
|
||||
ss->outs[i] = NULL;
|
||||
ss->inchain[i].ss = NULL;
|
||||
}
|
||||
return ss;
|
||||
}
|
||||
|
||||
/* look for oldest, or old enough anyway */
|
||||
if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
|
||||
ancient = cp - d->nssets * 2 / 3;
|
||||
else
|
||||
ancient = start;
|
||||
for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
|
||||
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
||||
!(ss->flags & LOCKED))
|
||||
{
|
||||
d->search = ss + 1;
|
||||
FDEBUG(("replacing c%d\n", ss - d->ssets));
|
||||
return ss;
|
||||
}
|
||||
for (ss = d->ssets, end = d->search; ss < end; ss++)
|
||||
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
||||
!(ss->flags & LOCKED))
|
||||
{
|
||||
d->search = ss + 1;
|
||||
FDEBUG(("replacing c%d\n", ss - d->ssets));
|
||||
return ss;
|
||||
}
|
||||
|
||||
/* nobody's old enough?!? -- something's really wrong */
|
||||
FDEBUG(("can't find victim to replace!\n"));
|
||||
assert(NOTREACHED);
|
||||
ERR(REG_ASSERT);
|
||||
return d->ssets;
|
||||
}
|
Reference in New Issue
Block a user