Import regex from tcl 8.4.5

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/branches/RXSPENCER@3951 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Ryan Norton
1999-10-13 02:22:18 +00:00
parent 9bd536df18
commit a6c3a78d25
4 changed files with 1303 additions and 1177 deletions

View File

@@ -28,8 +28,6 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $Header$
*
*/
/* scanning macros (know about v) */
@@ -64,26 +62,23 @@
#define ENDOF(array) ((array) + sizeof(array)/sizeof(chr))
/*
* lexstart - set up lexical stuff, scan leading options
- lexstart - set up lexical stuff, scan leading options
^ static VOID lexstart(struct vars *);
*/
static void
lexstart(struct vars * v)
static VOID
lexstart(v)
struct vars *v;
{
prefixes(v); /* may turn on new type bits etc. */
NOERR();
if (v->cflags & REG_QUOTE)
{
if (v->cflags&REG_QUOTE) {
assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
INTOCON(L_Q);
}
else if (v->cflags & REG_EXTENDED)
{
} else if (v->cflags&REG_EXTENDED) {
assert(!(v->cflags&REG_QUOTE));
INTOCON(L_ERE);
}
else
{
} else {
assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
INTOCON(L_BRE);
}
@@ -93,10 +88,12 @@ lexstart(struct vars * v)
}
/*
* prefixes - implement various special prefixes
- prefixes - implement various special prefixes
^ static VOID prefixes(struct vars *);
*/
static void
prefixes(struct vars * v)
static VOID
prefixes(v)
struct vars *v;
{
/* literal string doesn't get any of this stuff */
if (v->cflags&REG_QUOTE)
@@ -104,8 +101,7 @@ prefixes(struct vars * v)
/* initial "***" gets special things */
if (HAVE(4) && NEXT3('*', '*', '*'))
switch (*(v->now + 3))
{
switch (*(v->now + 3)) {
case CHR('?'): /* "***?" error, msg shows version */
ERR(REG_BADPAT);
return; /* proceed no further */
@@ -133,13 +129,11 @@ prefixes(struct vars * v)
return;
/* embedded options (AREs only) */
if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
{
if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
NOTE(REG_UNONPOSIX);
v->now += 2;
for (; !ATEOS() && iscalpha(*v->now); v->now++)
switch (*v->now)
{
switch (*v->now) {
case CHR('b'): /* BREs (but why???) */
v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
break;
@@ -182,8 +176,7 @@ prefixes(struct vars * v)
ERR(REG_BADOPT);
return;
}
if (!NEXT1(')'))
{
if (!NEXT1(')')) {
ERR(REG_BADOPT);
return;
}
@@ -194,15 +187,16 @@ prefixes(struct vars * v)
}
/*
* lexnest - "call a subroutine", interpolating string at the lexical level
*
- lexnest - "call a subroutine", interpolating string at the lexical level
* Note, this is not a very general facility. There are a number of
* implicit assumptions about what sorts of strings can be subroutines.
^ static VOID lexnest(struct vars *, chr *, chr *);
*/
static void
lexnest(struct vars * v,
chr *beginp, /* start of interpolation */
chr *endp) /* one past end of interpolation */
static VOID
lexnest(v, beginp, endp)
struct vars *v;
chr *beginp; /* start of interpolation */
chr *endp; /* one past end of interpolation */
{
assert(v->savenow == NULL); /* only one level of nesting */
v->savenow = v->now;
@@ -261,20 +255,24 @@ static chr brbackw[] = { /* \w within brackets */
};
/*
* lexword - interpolate a bracket expression for word characters
- lexword - interpolate a bracket expression for word characters
* Possibly ought to inquire whether there is a "word" character class.
^ static VOID lexword(struct vars *);
*/
static void
lexword(struct vars * v)
static VOID
lexword(v)
struct vars *v;
{
/*
 * Hand the whole backw array to lexnest() as nested lexical input:
 * start pointer plus ENDOF() one-past-the-end pointer.  backw is the
 * same array next() interpolates for the 'w' class escape.
 */
lexnest(v, backw, ENDOF(backw));
}
/*
* next - get next token
- next - get next token
^ static int next(struct vars *);
*/
static int /* 1 normal, 0 failure */
next(struct vars * v)
next(v)
struct vars *v;
{
chr c;
@@ -286,15 +284,13 @@ next(struct vars * v)
v->lasttype = v->nexttype;
/* REG_BOSONLY */
if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
{
if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
/* at start of a REG_BOSONLY RE */
RETV(SBEGIN, 0); /* same as \A */
}
/* if we're nested and we've hit end, return to outer level */
if (v->savenow != NULL && ATEOS())
{
if (v->savenow != NULL && ATEOS()) {
v->now = v->savenow;
v->stop = v->savestop;
v->savenow = v->savestop = NULL;
@@ -302,8 +298,7 @@ next(struct vars * v)
/* skip white space etc. if appropriate (not in literal or []) */
if (v->cflags&REG_EXPANDED)
switch (v->lexcon)
{
switch (v->lexcon) {
case L_ERE:
case L_BRE:
case L_EBND:
@@ -313,10 +308,8 @@ next(struct vars * v)
}
/* handle EOS, depending on context */
if (ATEOS())
{
switch (v->lexcon)
{
if (ATEOS()) {
switch (v->lexcon) {
case L_ERE:
case L_BRE:
case L_Q:
@@ -340,8 +333,7 @@ next(struct vars * v)
c = *v->now++;
/* deal with the easy contexts, punt EREs to code below */
switch (v->lexcon)
{
switch (v->lexcon) {
case L_BRE: /* punt BREs to separate function */
return brenext(v, c);
break;
@@ -352,46 +344,33 @@ next(struct vars * v)
break;
case L_BBND: /* bounds are fairly simple */
case L_EBND:
switch (c)
{
case CHR('0'):
case CHR('1'):
case CHR('2'):
case CHR('3'):
case CHR('4'):
case CHR('5'):
case CHR('6'):
case CHR('7'):
case CHR('8'):
case CHR('9'):
switch (c) {
case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
case CHR('8'): case CHR('9'):
RETV(DIGIT, (chr)DIGITVAL(c));
break;
case CHR(','):
RET(',');
break;
case CHR('}'): /* ERE bound ends with } */
if (INCON(L_EBND))
{
if (INCON(L_EBND)) {
INTOCON(L_ERE);
if ((v->cflags & REG_ADVF) && NEXT1('?'))
{
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
v->now++;
NOTE(REG_UNONPOSIX);
RETV('}', 0);
}
RETV('}', 1);
}
else
} else
FAILW(REG_BADBR);
break;
case CHR('\\'): /* BRE bound ends with \} */
if (INCON(L_BBND) && NEXT1('}'))
{
if (INCON(L_BBND) && NEXT1('}')) {
v->now++;
INTOCON(L_BRE);
RET('}');
}
else
} else
FAILW(REG_BADBR);
break;
default:
@@ -401,13 +380,11 @@ next(struct vars * v)
assert(NOTREACHED);
break;
case L_BRACK: /* brackets are not too hard */
switch (c)
{
switch (c) {
case CHR(']'):
if (LASTTYPE('['))
RETV(PLAIN, c);
else
{
else {
INTOCON((v->cflags&REG_EXTENDED) ?
L_ERE : L_BRE);
RET(']');
@@ -421,14 +398,12 @@ next(struct vars * v)
if (ATEOS())
FAILW(REG_EESCAPE);
(DISCARD)lexescape(v);
switch (v->nexttype)
{ /* not all escapes okay here */
switch (v->nexttype) { /* not all escapes okay here */
case PLAIN:
return 1;
break;
case CCLASS:
switch (v->nextvalue)
{
switch (v->nextvalue) {
case 'd':
lexnest(v, brbackd, ENDOF(brbackd));
break;
@@ -459,8 +434,7 @@ next(struct vars * v)
case CHR('['):
if (ATEOS())
FAILW(REG_EBRACK);
switch (*v->now++)
{
switch (*v->now++) {
case CHR('.'):
INTOCON(L_CEL);
/* might or might not be locale-specific */
@@ -490,33 +464,27 @@ next(struct vars * v)
assert(NOTREACHED);
break;
case L_CEL: /* collating elements are easy */
if (c == CHR('.') && NEXT1(']'))
{
if (c == CHR('.') && NEXT1(']')) {
v->now++;
INTOCON(L_BRACK);
RETV(END, '.');
}
else
} else
RETV(PLAIN, c);
break;
case L_ECL: /* ditto equivalence classes */
if (c == CHR('=') && NEXT1(']'))
{
if (c == CHR('=') && NEXT1(']')) {
v->now++;
INTOCON(L_BRACK);
RETV(END, '=');
}
else
} else
RETV(PLAIN, c);
break;
case L_CCL: /* ditto character classes */
if (c == CHR(':') && NEXT1(']'))
{
if (c == CHR(':') && NEXT1(']')) {
v->now++;
INTOCON(L_BRACK);
RETV(END, ':');
}
else
} else
RETV(PLAIN, c);
break;
default:
@@ -528,14 +496,12 @@ next(struct vars * v)
assert(INCON(L_ERE));
/* deal with EREs and AREs, except for backslashes */
switch (c)
{
switch (c) {
case CHR('|'):
RET('|');
break;
case CHR('*'):
if ((v->cflags & REG_ADVF) && NEXT1('?'))
{
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
v->now++;
NOTE(REG_UNONPOSIX);
RETV('*', 0);
@@ -543,8 +509,7 @@ next(struct vars * v)
RETV('*', 1);
break;
case CHR('+'):
if ((v->cflags & REG_ADVF) && NEXT1('?'))
{
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
v->now++;
NOTE(REG_UNONPOSIX);
RETV('+', 0);
@@ -552,8 +517,7 @@ next(struct vars * v)
RETV('+', 1);
break;
case CHR('?'):
if ((v->cflags & REG_ADVF) && NEXT1('?'))
{
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
v->now++;
NOTE(REG_UNONPOSIX);
RETV('?', 0);
@@ -563,14 +527,11 @@ next(struct vars * v)
case CHR('{'): /* bounds start or plain character */
if (v->cflags&REG_EXPANDED)
skip(v);
if (ATEOS() || !iscdigit(*v->now))
{
if (ATEOS() || !iscdigit(*v->now)) {
NOTE(REG_UBRACES);
NOTE(REG_UUNSPEC);
RETV(PLAIN, c);
}
else
{
} else {
NOTE(REG_UBOUNDS);
INTOCON(L_EBND);
RET('{');
@@ -578,12 +539,10 @@ next(struct vars * v)
assert(NOTREACHED);
break;
case CHR('('): /* parenthesis, or advanced extension */
if ((v->cflags & REG_ADVF) && NEXT1('?'))
{
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
NOTE(REG_UNONPOSIX);
v->now++;
switch (*v->now++)
{
switch (*v->now++) {
case CHR(':'): /* non-capturing paren */
RETV('(', 0);
break;
@@ -615,8 +574,9 @@ next(struct vars * v)
RETV('(', 1);
break;
case CHR(')'):
if (LASTTYPE('('))
if (LASTTYPE('(')) {
NOTE(REG_UUNSPEC);
}
RETV(')', c);
break;
case CHR('['): /* easy except for [[:<:]] and [[:>:]] */
@@ -626,16 +586,14 @@ next(struct vars * v)
*(v->now+2) == CHR('>')) &&
*(v->now+3) == CHR(':') &&
*(v->now+4) == CHR(']') &&
*(v->now + 5) == CHR(']'))
{
*(v->now+5) == CHR(']')) {
c = *(v->now+2);
v->now += 6;
NOTE(REG_UNONPOSIX);
RET((c == CHR('<')) ? '<' : '>');
}
INTOCON(L_BRACK);
if (NEXT1('^'))
{
if (NEXT1('^')) {
v->now++;
RETV('[', 0);
}
@@ -661,10 +619,8 @@ next(struct vars * v)
/* ERE/ARE backslash handling; backslash already eaten */
assert(!ATEOS());
if (!(v->cflags & REG_ADVF))
{ /* only AREs have non-trivial escapes */
if (iscalnum(*v->now))
{
if (!(v->cflags&REG_ADVF)) { /* only AREs have non-trivial escapes */
if (iscalnum(*v->now)) {
NOTE(REG_UBSALNUM);
NOTE(REG_UUNSPEC);
}
@@ -673,28 +629,14 @@ next(struct vars * v)
(DISCARD)lexescape(v);
if (ISERR())
FAILW(REG_EESCAPE);
if (v->nexttype == CCLASS)
{ /* fudge at lexical level */
switch (v->nextvalue)
{
case 'd':
lexnest(v, backd, ENDOF(backd));
break;
case 'D':
lexnest(v, backD, ENDOF(backD));
break;
case 's':
lexnest(v, backs, ENDOF(backs));
break;
case 'S':
lexnest(v, backS, ENDOF(backS));
break;
case 'w':
lexnest(v, backw, ENDOF(backw));
break;
case 'W':
lexnest(v, backW, ENDOF(backW));
break;
if (v->nexttype == CCLASS) { /* fudge at lexical level */
switch (v->nextvalue) {
case 'd': lexnest(v, backd, ENDOF(backd)); break;
case 'D': lexnest(v, backD, ENDOF(backD)); break;
case 's': lexnest(v, backs, ENDOF(backs)); break;
case 'S': lexnest(v, backS, ENDOF(backS)); break;
case 'w': lexnest(v, backw, ENDOF(backw)); break;
case 'W': lexnest(v, backW, ENDOF(backW)); break;
default:
assert(NOTREACHED);
FAILW(REG_ASSERT);
@@ -709,12 +651,13 @@ next(struct vars * v)
}
/*
* lexescape - parse an ARE backslash escape (backslash already eaten)
- lexescape - parse an ARE backslash escape (backslash already eaten)
* Note slightly nonstandard use of the CCLASS type code.
^ static int lexescape(struct vars *);
*/
static int /* not actually used, but convenient for
* RETV */
lexescape(struct vars * v)
static int /* not actually used, but convenient for RETV */
lexescape(v)
struct vars *v;
{
chr c;
static chr alert[] = {
@@ -733,8 +676,7 @@ lexescape(struct vars * v)
RETV(PLAIN, c);
NOTE(REG_UNONPOSIX);
switch (c)
{
switch (c) {
case CHR('a'):
RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
break;
@@ -816,8 +758,7 @@ lexescape(struct vars * v)
break;
case CHR('x'):
NOTE(REG_UUNPORT);
c = lexdigits(v, 16, 1, 255); /* REs >255 long outside
* spec */
c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */
if (ISERR())
FAILW(REG_EESCAPE);
RETV(PLAIN, c);
@@ -833,24 +774,16 @@ lexescape(struct vars * v)
case CHR('Z'):
RETV(SEND, 0);
break;
case CHR('1'):
case CHR('2'):
case CHR('3'):
case CHR('4'):
case CHR('5'):
case CHR('6'):
case CHR('7'):
case CHR('8'):
case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
case CHR('9'):
save = v->now;
v->now--; /* put first digit back */
c = lexdigits(v, 10, 1, 255); /* REs >255 long outside
* spec */
c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */
if (ISERR())
FAILW(REG_EESCAPE);
/* ugly heuristic (first test is "exactly 1 digit?") */
if (v->now - save == 0 || (int) c <= v->nsubexp)
{
if (v->now - save == 0 || (int)c <= v->nsubexp) {
NOTE(REG_UBACKREF);
RETV(BACKREF, (chr)c);
}
@@ -874,70 +807,44 @@ lexescape(struct vars * v)
}
/*
* lexdigits - slurp up digits and return chr value
- lexdigits - slurp up digits and return chr value
^ static chr lexdigits(struct vars *, int, int, int);
*/
static chr /* chr value; errors signalled via ERR */
lexdigits(struct vars * v,
int base,
int minlen,
int maxlen)
lexdigits(v, base, minlen, maxlen)
struct vars *v;
int base;
int minlen;
int maxlen;
{
uchr n; /* unsigned to avoid overflow misbehavior */
int len;
chr c;
int d;
const uchr ub = (uchr) base;
CONST uchr ub = (uchr) base;
n = 0;
for (len = 0; len < maxlen && !ATEOS(); len++)
{
for (len = 0; len < maxlen && !ATEOS(); len++) {
c = *v->now++;
switch (c)
{
case CHR('0'):
case CHR('1'):
case CHR('2'):
case CHR('3'):
case CHR('4'):
case CHR('5'):
case CHR('6'):
case CHR('7'):
case CHR('8'):
case CHR('9'):
switch (c) {
case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
case CHR('8'): case CHR('9'):
d = DIGITVAL(c);
break;
case CHR('a'):
case CHR('A'):
d = 10;
break;
case CHR('b'):
case CHR('B'):
d = 11;
break;
case CHR('c'):
case CHR('C'):
d = 12;
break;
case CHR('d'):
case CHR('D'):
d = 13;
break;
case CHR('e'):
case CHR('E'):
d = 14;
break;
case CHR('f'):
case CHR('F'):
d = 15;
break;
case CHR('a'): case CHR('A'): d = 10; break;
case CHR('b'): case CHR('B'): d = 11; break;
case CHR('c'): case CHR('C'): d = 12; break;
case CHR('d'): case CHR('D'): d = 13; break;
case CHR('e'): case CHR('E'): d = 14; break;
case CHR('f'): case CHR('F'): d = 15; break;
default:
v->now--; /* oops, not a digit at all */
d = -1;
break;
}
if (d >= base)
{ /* not a plausible digit */
if (d >= base) { /* not a plausible digit */
v->now--;
d = -1;
}
@@ -952,19 +859,19 @@ lexdigits(struct vars * v,
}
/*
* brenext - get next BRE token
*
- brenext - get next BRE token
* This is much like EREs except for all the stupid backslashes and the
* context-dependency of some things.
^ static int brenext(struct vars *, pchr);
*/
static int /* 1 normal, 0 failure */
brenext(struct vars * v,
chr pc)
brenext(v, pc)
struct vars *v;
pchr pc;
{
chr c = (chr)pc;
switch (c)
{
switch (c) {
case CHR('*'):
if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
RETV(PLAIN, c);
@@ -977,16 +884,14 @@ brenext(struct vars * v,
*(v->now+2) == CHR('>')) &&
*(v->now+3) == CHR(':') &&
*(v->now+4) == CHR(']') &&
*(v->now + 5) == CHR(']'))
{
*(v->now+5) == CHR(']')) {
c = *(v->now+2);
v->now += 6;
NOTE(REG_UNONPOSIX);
RET((c == CHR('<')) ? '<' : '>');
}
INTOCON(L_BRACK);
if (NEXT1('^'))
{
if (NEXT1('^')) {
v->now++;
RETV('[', 0);
}
@@ -998,8 +903,7 @@ brenext(struct vars * v,
case CHR('^'):
if (LASTTYPE(EMPTY))
RET('^');
if (LASTTYPE('('))
{
if (LASTTYPE('(')) {
NOTE(REG_UUNSPEC);
RET('^');
}
@@ -1010,8 +914,7 @@ brenext(struct vars * v,
skip(v);
if (ATEOS())
RET('$');
if (NEXT2('\\', ')'))
{
if (NEXT2('\\', ')')) {
NOTE(REG_UUNSPEC);
RET('$');
}
@@ -1030,8 +933,7 @@ brenext(struct vars * v,
FAILW(REG_EESCAPE);
c = *v->now++;
switch (c)
{
switch (c) {
case CHR('{'):
INTOCON(L_BBND);
NOTE(REG_UBOUNDS);
@@ -1051,21 +953,14 @@ brenext(struct vars * v,
NOTE(REG_UNONPOSIX);
RET('>');
break;
case CHR('1'):
case CHR('2'):
case CHR('3'):
case CHR('4'):
case CHR('5'):
case CHR('6'):
case CHR('7'):
case CHR('8'):
case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
case CHR('9'):
NOTE(REG_UBACKREF);
RETV(BACKREF, (chr)DIGITVAL(c));
break;
default:
if (iscalnum(c))
{
if (iscalnum(c)) {
NOTE(REG_UBSALNUM);
NOTE(REG_UUNSPEC);
}
@@ -1077,17 +972,18 @@ brenext(struct vars * v,
}
/*
* skip - skip white space and comments in expanded form
- skip - skip white space and comments in expanded form
^ static VOID skip(struct vars *);
*/
static void
skip(struct vars * v)
static VOID
skip(v)
struct vars *v;
{
chr *start = v->now;
assert(v->cflags&REG_EXPANDED);
for (;;)
{
for (;;) {
while (!ATEOS() && iscspace(*v->now))
v->now++;
if (ATEOS() || *v->now != CHR('#'))
@@ -1103,27 +999,46 @@ skip(struct vars * v)
}
/*
* newline - return the chr for a newline
*
- newline - return the chr for a newline
* This helps confine use of CHR to this source file.
^ static chr newline(NOPARMS);
*/
static chr
newline(void)
newline()
{
/* convert the char literal here so other source files need not use CHR() */
return CHR('\n');
}
/*
* chrnamed - return the chr known by a given (chr string) name
*
- ch - return the chr sequence for regc_locale.c's fake collating element ch
* This helps confine use of CHR to this source file. Beware that the caller
* knows how long the sequence is.
^ #ifdef REG_DEBUG
^ static chr *ch(NOPARMS);
^ #endif
*/
#ifdef REG_DEBUG
static chr *
ch()
{
	/*
	 * The chr spelling of "ch", with a trailing CHR('\0').  Kept in
	 * static storage so the returned pointer stays valid after return.
	 */
	static chr fake_cel[] = {
		CHR('c'),
		CHR('h'),
		CHR('\0')
	};

	return &fake_cel[0];
}
#endif
/*
- chrnamed - return the chr known by a given (chr string) name
* The code is a bit clumsy, but this routine gets only such specialized
* use that it hardly matters.
^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
*/
static chr
chrnamed(struct vars * v,
chr *startp, /* start of name */
chr *endp, /* just past end of name */
chr lastresort) /* what to return if name lookup fails */
chrnamed(v, startp, endp, lastresort)
struct vars *v;
chr *startp; /* start of name */
chr *endp; /* just past end of name */
pchr lastresort; /* what to return if name lookup fails */
{
celt c;
int errsave;

View File

@@ -1,75 +1,18 @@
/*
* $Id$
*/
{
REG_OKAY, "REG_OKAY", "no errors detected"
},
{
REG_NOMATCH, "REG_NOMATCH", "failed to match"
},
{
REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)"
},
{
REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"
},
{
REG_ECTYPE, "REG_ECTYPE", "invalid character class"
},
{
REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence"
},
{
REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"
},
{
REG_EBRACK, "REG_EBRACK", "brackets [] not balanced"
},
{
REG_EPAREN, "REG_EPAREN", "parentheses () not balanced"
},
{
REG_EBRACE, "REG_EBRACE", "braces {} not balanced"
},
{
REG_BADBR, "REG_BADBR", "invalid repetition count(s)"
},
{
REG_ERANGE, "REG_ERANGE", "invalid character range"
},
{
REG_ESPACE, "REG_ESPACE", "out of memory"
},
{
REG_BADRPT, "REG_BADRPT", "quantifier operand invalid"
},
{
REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"
},
{
REG_INVARG, "REG_INVARG", "invalid argument to regex function"
},
{
REG_MIXED, "REG_MIXED", "character widths of regex and string differ"
},
{
REG_BADOPT, "REG_BADOPT", "invalid embedded option"
},
{ REG_OKAY, "REG_OKAY", "no errors detected" },
{ REG_NOMATCH, "REG_NOMATCH", "failed to match" },
{ REG_BADPAT, "REG_BADPAT", "invalid regexp (reg version 0.8)" },
{ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" },
{ REG_ECTYPE, "REG_ECTYPE", "invalid character class" },
{ REG_EESCAPE, "REG_EESCAPE", "invalid escape \\ sequence" },
{ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" },
{ REG_EBRACK, "REG_EBRACK", "brackets [] not balanced" },
{ REG_EPAREN, "REG_EPAREN", "parentheses () not balanced" },
{ REG_EBRACE, "REG_EBRACE", "braces {} not balanced" },
{ REG_BADBR, "REG_BADBR", "invalid repetition count(s)" },
{ REG_ERANGE, "REG_ERANGE", "invalid character range" },
{ REG_ESPACE, "REG_ESPACE", "out of memory" },
{ REG_BADRPT, "REG_BADRPT", "quantifier operand invalid" },
{ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" },
{ REG_INVARG, "REG_INVARG", "invalid argument to regex function" },
{ REG_MIXED, "REG_MIXED", "character widths of regex and string differ" },
{ REG_BADOPT, "REG_BADOPT", "invalid embedded option" },

View File

@@ -1,74 +1,341 @@
#ifndef _REGEX_H_
#define _REGEX_H_ /* never again */
/* ========= begin header generated by ./mkh ========= */
/*
* regular expressions
*
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
*
* Development of this software was funded, in part, by Cray Research Inc.,
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
* Corporation, none of whom are responsible for the results. The author
* thanks all of them.
*
* Redistribution and use in source and binary forms -- with or without
* modification -- are permitted for any purpose, provided that
* redistributions in source form retain this entire copyright notice and
* indicate the origin and nature of any modifications.
*
* I'd appreciate being given credit for this package in the documentation
* of software which uses it, but that is not a requirement.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*
*
* Prototypes etc. marked with "^" within comments get gathered up (and
* possibly edited) by the regfwd program and inserted near the bottom of
* this file.
*
* We offer the option of declaring one wide-character version of the
* RE functions as well as the char versions. To do that, define
* __REG_WIDE_T to the type of wide characters (unfortunately, there
* is no consensus that wchar_t is suitable) and __REG_WIDE_COMPILE and
* __REG_WIDE_EXEC to the names to be used for the compile and execute
* functions (suggestion: re_Xcomp and re_Xexec, where X is a letter
* suggestive of the wide type, e.g. re_ucomp and re_uexec for Unicode).
* For cranky old compilers, it may be necessary to do something like:
* #define __REG_WIDE_COMPILE(a,b,c,d) re_Xcomp(a,b,c,d)
* #define __REG_WIDE_EXEC(a,b,c,d,e,f,g) re_Xexec(a,b,c,d,e,f,g)
* rather than just #defining the names as parameterless macros.
*
* For some specialized purposes, it may be desirable to suppress the
* declarations of the "front end" functions, regcomp() and regexec(),
* or of the char versions of the compile and execute functions. To
* suppress the front-end functions, define __REG_NOFRONT. To suppress
* the char versions, define __REG_NOCHAR.
*
* The right place to do those defines (and some others you may want, see
* below) would be <sys/types.h>. If you don't have control of that file,
* the right place to add your own defines to this file is marked below.
* This is normally done automatically, by the makefile and regmkhdr, based
* on the contents of regcustom.h.
*/
/*
* voodoo for C++
*/
#ifdef __cplusplus
extern "C" {
#endif
/* === regex2.h === */
typedef off_t regoff_t;
/*
* Add your own defines, if needed, here.
*/
/*
* Location where a chunk of regcustom.h is automatically spliced into
* this file (working from its prototype, regproto.h).
*/
/* --- begin --- */
/* ensure certain things don't sneak in from system headers */
#ifdef __REG_WIDE_T
#undef __REG_WIDE_T
#endif
#ifdef __REG_WIDE_COMPILE
#undef __REG_WIDE_COMPILE
#endif
#ifdef __REG_WIDE_EXEC
#undef __REG_WIDE_EXEC
#endif
#ifdef __REG_REGOFF_T
#undef __REG_REGOFF_T
#endif
#ifdef __REG_VOID_T
#undef __REG_VOID_T
#endif
#ifdef __REG_CONST
#undef __REG_CONST
#endif
#ifdef __REG_NOFRONT
#undef __REG_NOFRONT
#endif
#ifdef __REG_NOCHAR
#undef __REG_NOCHAR
#endif
/* interface types */
#define __REG_WIDE_T Tcl_UniChar
#define __REG_REGOFF_T long /* not really right, but good enough... */
#define __REG_VOID_T VOID
#define __REG_CONST CONST
/* names and declarations */
#define __REG_WIDE_COMPILE TclReComp
#define __REG_WIDE_EXEC TclReExec
#define __REG_NOFRONT /* don't want regcomp() and regexec() */
#define __REG_NOCHAR /* or the char versions */
#define regfree TclReFree
#define regerror TclReError
/* --- end --- */
/*
* interface types etc.
*/
/*
* regoff_t has to be large enough to hold either off_t or ssize_t,
* and must be signed; it's only a guess that long is suitable, so we
* offer <sys/types.h> an override.
*/
#ifdef __REG_REGOFF_T
typedef __REG_REGOFF_T regoff_t;
#else
typedef long regoff_t;
#endif
/*
* For benefit of old compilers, we offer <sys/types.h> the option of
* overriding the `void' type used to declare nonexistent return types.
*/
#ifdef __REG_VOID_T
typedef __REG_VOID_T re_void;
#else
typedef void re_void;
#endif
/*
* Also for benefit of old compilers, <sys/types.h> can supply a macro
* which expands to a substitute for `const'.
*/
#ifndef __REG_CONST
#define __REG_CONST const
#endif
/*
* other interface types
*/
/* the biggie, a compiled RE (or rather, a front end to same) */
typedef struct {
int re_magic;
size_t re_nsub; /* number of parenthesized subexpressions */
const char *re_endp; /* end pointer for REG_PEND */
struct re_guts *re_g; /* none of your business :-) */
int re_magic; /* magic number */
size_t re_nsub; /* number of subexpressions */
long re_info; /* information about RE */
# define REG_UBACKREF 000001
# define REG_ULOOKAHEAD 000002
# define REG_UBOUNDS 000004
# define REG_UBRACES 000010
# define REG_UBSALNUM 000020
# define REG_UPBOTCH 000040
# define REG_UBBS 000100
# define REG_UNONPOSIX 000200
# define REG_UUNSPEC 000400
# define REG_UUNPORT 001000
# define REG_ULOCALE 002000
# define REG_UEMPTYMATCH 004000
# define REG_UIMPOSSIBLE 010000
# define REG_USHORTEST 020000
int re_csize; /* sizeof(character) */
char *re_endp; /* backward compatibility kludge */
/* the rest is opaque pointers to hidden innards */
char *re_guts; /* `char *' is more portable than `void *' */
char *re_fns;
} regex_t;
/* result reporting (may acquire more fields later) */
typedef struct {
regoff_t rm_so; /* start of match */
regoff_t rm_eo; /* end of match */
regoff_t rm_so; /* start of substring */
regoff_t rm_eo; /* end of substring */
} regmatch_t;
/* === regcomp.c === */
extern int regcomp(regex_t *, const char *, int);
#define REG_BASIC 0000
#define REG_EXTENDED 0001
#define REG_ICASE 0002
#define REG_NOSUB 0004
#define REG_NEWLINE 0010
#define REG_NOSPEC 0020
#define REG_PEND 0040
#define REG_DUMP 0200
/* supplementary control and reporting */
/*
 * Extra per-match details, passed by pointer to re_exec() and
 * __REG_WIDE_EXEC() alongside the regmatch_t array.
 */
typedef struct {
regmatch_t rm_extend; /* see REG_EXPECT ("report details on partial/limited matches") */
} rm_detail_t;
/* === regerror.c === */
#define REG_OKAY 0
#define REG_NOMATCH 1
#define REG_BADPAT 2
#define REG_ECOLLATE 3
#define REG_ECTYPE 4
#define REG_EESCAPE 5
#define REG_ESUBREG 6
#define REG_EBRACK 7
#define REG_EPAREN 8
#define REG_EBRACE 9
#define REG_BADBR 10
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
#define REG_EMPTY 14
#define REG_ASSERT 15
#define REG_INVARG 16
#define REG_ATOI 255 /* convert name to number (!) */
#define REG_ITOA 0400 /* convert number to name (!) */
extern size_t regerror(int, const regex_t *, char *, size_t);
/*
* compilation
^ #ifndef __REG_NOCHAR
^ int re_comp(regex_t *, __REG_CONST char *, size_t, int);
^ #endif
^ #ifndef __REG_NOFRONT
^ int regcomp(regex_t *, __REG_CONST char *, int);
^ #endif
^ #ifdef __REG_WIDE_T
^ int __REG_WIDE_COMPILE(regex_t *, __REG_CONST __REG_WIDE_T *, size_t, int);
^ #endif
*/
#define REG_BASIC 000000 /* BREs (convenience) */
#define REG_EXTENDED 000001 /* EREs */
#define REG_ADVF 000002 /* advanced features in EREs */
#define REG_ADVANCED 000003 /* AREs (which are also EREs) */
#define REG_QUOTE 000004 /* no special characters, none */
#define REG_NOSPEC REG_QUOTE /* historical synonym */
#define REG_ICASE 000010 /* ignore case */
#define REG_NOSUB 000020 /* don't care about subexpressions */
#define REG_EXPANDED 000040 /* expanded format, white space & comments */
#define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */
#define REG_NLANCH 000200 /* ^ matches after \n, $ before */
#define REG_NEWLINE 000300 /* newlines are line terminators */
#define REG_PEND 000400 /* ugh -- backward-compatibility hack */
#define REG_EXPECT 001000 /* report details on partial/limited matches */
#define REG_BOSONLY 002000 /* temporary kludge for BOS-only matches */
#define REG_DUMP 004000 /* none of your business :-) */
#define REG_FAKE 010000 /* none of your business :-) */
#define REG_PROGRESS 020000 /* none of your business :-) */
/* === regexec.c === */
extern int regexec(const regex_t *, const char *, size_t, regmatch_t [], int);
#define REG_NOTBOL 00001
#define REG_NOTEOL 00002
#define REG_STARTEND 00004
#define REG_TRACE 00400 /* tracing of execution */
#define REG_LARGE 01000 /* force large representation */
#define REG_BACKR 02000 /* force use of backref code */
/*
* execution
^ #ifndef __REG_NOCHAR
^ int re_exec(regex_t *, __REG_CONST char *, size_t,
^ rm_detail_t *, size_t, regmatch_t [], int);
^ #endif
^ #ifndef __REG_NOFRONT
^ int regexec(regex_t *, __REG_CONST char *, size_t, regmatch_t [], int);
^ #endif
^ #ifdef __REG_WIDE_T
^ int __REG_WIDE_EXEC(regex_t *, __REG_CONST __REG_WIDE_T *, size_t,
^ rm_detail_t *, size_t, regmatch_t [], int);
^ #endif
*/
#define REG_NOTBOL 0001 /* BOS is not BOL */
#define REG_NOTEOL 0002 /* EOS is not EOL */
#define REG_STARTEND 0004 /* backward compatibility kludge */
#define REG_FTRACE 0010 /* none of your business */
#define REG_MTRACE 0020 /* none of your business */
#define REG_SMALL 0040 /* none of your business */
/* === regfree.c === */
extern void regfree(regex_t *);
/*
* misc generics (may be more functions here eventually)
^ re_void regfree(regex_t *);
*/
/*
* error reporting
* Be careful if modifying the list of error codes -- the table used by
* regerror() is generated automatically from this file!
*
* Note that there is no wide-char variant of regerror at this time; what
* kind of character is used for error reports is independent of what kind
* is used in matching.
*
^ extern size_t regerror(int, __REG_CONST regex_t *, char *, size_t);
*/
#define REG_OKAY 0 /* no errors detected */
#define REG_NOMATCH 1 /* failed to match */
#define REG_BADPAT 2 /* invalid regexp */
#define REG_ECOLLATE 3 /* invalid collating element */
#define REG_ECTYPE 4 /* invalid character class */
#define REG_EESCAPE 5 /* invalid escape \ sequence */
#define REG_ESUBREG 6 /* invalid backreference number */
#define REG_EBRACK 7 /* brackets [] not balanced */
#define REG_EPAREN 8 /* parentheses () not balanced */
#define REG_EBRACE 9 /* braces {} not balanced */
#define REG_BADBR 10 /* invalid repetition count(s) */
#define REG_ERANGE 11 /* invalid character range */
#define REG_ESPACE 12 /* out of memory */
#define REG_BADRPT 13 /* quantifier operand invalid */
#define REG_ASSERT 15 /* "can't happen" -- you found a bug */
#define REG_INVARG 16 /* invalid argument to regex function */
#define REG_MIXED 17 /* character widths of regex and string differ */
#define REG_BADOPT 18 /* invalid embedded option */
/* two specials for debugging and testing */
#define REG_ATOI 101 /* convert error-code name to number */
#define REG_ITOA 102 /* convert error-code number to name */
/*
* the prototypes, as possibly munched by regfwd
*/
/*
 * Which declarations below are visible depends on configuration macros:
 *   - defining __REG_NOCHAR hides re_comp and re_exec;
 *   - defining __REG_NOFRONT hides regcomp and regexec;
 *   - defining __REG_WIDE_T exposes __REG_WIDE_COMPILE and __REG_WIDE_EXEC,
 *     whose pattern/string arguments are of type __REG_WIDE_T.
 * regfree and regerror are declared unconditionally.
 */
/* =====^!^===== begin forwards =====^!^===== */
/* automatically gathered by fwd; do not hand-edit */
/* === regproto.h === */
#ifndef __REG_NOCHAR
int re_comp _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, int));
#endif
#ifndef __REG_NOFRONT
int regcomp _ANSI_ARGS_((regex_t *, __REG_CONST char *, int));
#endif
#ifdef __REG_WIDE_T
int __REG_WIDE_COMPILE _ANSI_ARGS_((regex_t *, __REG_CONST __REG_WIDE_T *, size_t, int));
#endif
#ifndef __REG_NOCHAR
int re_exec _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, rm_detail_t *, size_t, regmatch_t [], int));
#endif
#ifndef __REG_NOFRONT
int regexec _ANSI_ARGS_((regex_t *, __REG_CONST char *, size_t, regmatch_t [], int));
#endif
#ifdef __REG_WIDE_T
int __REG_WIDE_EXEC _ANSI_ARGS_((regex_t *, __REG_CONST __REG_WIDE_T *, size_t, rm_detail_t *, size_t, regmatch_t [], int));
#endif
re_void regfree _ANSI_ARGS_((regex_t *));
extern size_t regerror _ANSI_ARGS_((int, __REG_CONST regex_t *, char *, size_t));
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */
/*
* more C++ voodoo
*/
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */
#endif

View File

@@ -26,8 +26,6 @@
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $Id$
*/
@@ -45,27 +43,52 @@
* Things that regcustom.h might override.
*/
/* standard header files (NULL is a reasonable indicator for them) */
#ifndef NULL
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#endif
/* assertions */
/*
 * Skipped entirely when assert is already defined as a macro.  Otherwise,
 * unless REG_DEBUG is defined, NDEBUG is defined before <assert.h> is
 * included so that assert() is compiled out.
 */
#ifndef assert
# ifndef REG_DEBUG
# ifndef NDEBUG
# define NDEBUG /* no assertions */
# endif
#endif
#include <assert.h>
#endif
/* voids */
/*
 * Spellings for the various uses of void.  Each macro is wrapped in
 * #ifndef, so a definition made before this point takes precedence.
 */
#ifndef VOID
#define VOID void /* for function return values */
#endif
#ifndef DISCARD
#define DISCARD void /* for throwing values away */
#define DISCARD VOID /* for throwing values away */
#endif
#ifndef PVOID
#define PVOID VOID * /* generic pointer */
#endif
#ifndef VS
#define VS(x) ((void *)(x)) /* cast something to generic ptr */
#define VS(x) ((PVOID)(x)) /* cast something to generic ptr */
#endif
#ifndef NOPARMS
#define NOPARMS VOID /* for empty parm lists */
#endif
/* const */
#ifndef CONST
#define CONST const /* for old compilers, might be empty */
#endif
/* function-pointer declarator */
/*
 * FUNCPTR(name, args) produces the declarator for a function pointer
 * called name.  When __STDC__ >= 1 the parenthesized parameter list args
 * is kept, giving a prototyped pointer; otherwise args is dropped and the
 * pointer is declared with an empty parameter list.
 */
#ifndef FUNCPTR
#if __STDC__ >= 1
#define FUNCPTR(name, args) (*name)args
#else
#define FUNCPTR(name, args) (*name)()
#endif
#endif
/* memory allocation */
@@ -136,8 +159,7 @@
/* BYTBITS may be predefined; the remaining macros are derived from it */
#ifndef BYTBITS
#define BYTBITS 8 /* bits in a byt */
#endif
#define BYTTAB (1<<BYTBITS) /* size of table with one entry per byt
* value */
#define BYTTAB (1<<BYTBITS) /* size of table with one entry per byt value */
#define BYTMASK (BYTTAB-1) /* bit mask for byt */
/* number of byts needed to cover CHRBITS bits, rounded up */
#define NBYTS ((CHRBITS+BYTBITS-1)/BYTBITS)
/* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
@@ -150,7 +172,6 @@
*/
/*
 * color is stored as a short; pcolor (int) is the type to use where a
 * color value has undergone integer promotion.
 */
typedef short color; /* colors of characters */
typedef int pcolor; /* what color promotes to */
#define COLORLESS (-1) /* impossible color */
#define WHITE 0 /* default color, parent of all others */
@@ -167,26 +188,21 @@ typedef int pcolor; /* what color promotes to */
*/
/* the tree itself */
/* node form holding one color per possible byt value */
struct colors
{
struct colors {
color ccolor[BYTTAB];
};
/* node form holding one subtree pointer per possible byt value */
struct ptrs
{
struct ptrs {
union tree *pptr[BYTTAB];
};
/* a tree node is one or the other of the two forms above */
union tree
{
union tree {
struct colors colors;
struct ptrs ptrs;
};
/* shorthand for reaching the two arrays through a union tree */
#define tcolor colors.ccolor
#define tptr ptrs.pptr
/* internal per-color structure for the color machinery */
struct colordesc
{
struct colordesc {
uchr nchrs; /* number of chars of this color */
color sub; /* open subcolor (if any); free chain ptr */
# define NOSUB COLORLESS
@@ -199,8 +215,7 @@ struct colordesc
};
/* the color map itself */
struct colormap
{
struct colormap {
int magic;
# define CMMAGIC 0x876
struct vars *v; /* for compile error reporting */
@@ -236,8 +251,7 @@ struct colormap
* Interface definitions for locale-interface functions in locale.c.
* Multi-character collating elements (MCCEs) cause most of the trouble.
*/
struct cvec
{
struct cvec {
int nchrs; /* number of chrs */
int chrspace; /* number of chrs possible */
chr *chrs; /* pointer to vector of chrs */
@@ -264,8 +278,7 @@ struct cvec
*/
struct state;
struct arc
{
struct arc {
int type;
# define ARCFREE '\0'
color co;
@@ -277,15 +290,13 @@ struct arc
struct arc *colorchain; /* color's arc chain */
};
/*
 * A fixed-size batch of ABSIZE arcs, linked to further batches
 * through next.
 */
struct arcbatch
{ /* for bulk allocation of arcs */
struct arcbatch { /* for bulk allocation of arcs */
struct arcbatch *next; /* link to another arcbatch */
# define ABSIZE 10 /* number of arcs held in one batch */
struct arc a[ABSIZE];
};
struct state
{
struct state {
int no;
# define FREESTATE (-1)
char flag; /* marks special states */
@@ -297,13 +308,11 @@ struct state
struct state *tmp; /* temporary for traversal algorithms */
struct state *next; /* chain for traversing all */
struct state *prev; /* back chain */
struct arcbatch oas; /* first arcbatch, avoid malloc in easy
* case */
struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */
int noas; /* number of arcs used in first arcbatch */
};
struct nfa
{
struct nfa {
struct state *pre; /* pre-initial state */
struct state *init; /* initial state */
struct state *final; /* final state */
@@ -324,14 +333,12 @@ struct nfa
/*
* definitions for compacted NFA
*/
/*
 * One arc in compacted form: a color paired with a destination state
 * number.  An entry whose co is COLORLESS ends a list of these.
 */
struct carc
{
struct carc {
color co; /* COLORLESS is list terminator */
int to; /* state number */
};
struct cnfa
{
struct cnfa {
int nstates; /* number of states */
int ncolors; /* number of colors */
int flags;
@@ -343,7 +350,6 @@ struct cnfa
struct carc **states; /* vector of pointers to outarc lists */
struct carc *arcs; /* the area for the lists */
};
/*
 * A cnfa whose nstates is 0 counts as null: ZAPCNFA puts a cnfa into that
 * state, NULLCNFA tests for it.  Both take the struct itself, not a pointer.
 */
#define ZAPCNFA(cnfa) ((cnfa).nstates = 0)
#define NULLCNFA(cnfa) ((cnfa).nstates == 0)
@@ -352,10 +358,8 @@ struct cnfa
/*
* subexpression tree
*/
struct subre
{
char op; /* '|', '.' (concat), 'b' (backref), '(',
* '=' */
struct subre {
char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */
char flags;
# define LONGER 01 /* prefers longer match */
# define SHORTER 02 /* prefers shorter match */
@@ -375,8 +379,7 @@ struct subre
int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */
short max; /* max repetitions, for backref only */
struct subre *left; /* left child, if any (also freelist
* chain) */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */
struct state *end; /* ...ending in inarcs here */
@@ -390,9 +393,8 @@ struct subre
* table of function pointers for generic manipulation functions
* A regex_t's re_fns points to one of these.
*/
/*
 * Function table with a single member: free, a pointer to a function
 * taking a regex_t * and returning nothing (declared via FUNCPTR).
 */
struct fns
{
void FUNCPTR(free, (regex_t *));
struct fns {
VOID FUNCPTR(free, (regex_t *));
};
@@ -400,8 +402,7 @@ struct fns
/*
* the insides of a regex_t, hidden behind a void *
*/
struct guts
{
struct guts {
int magic;
# define GUTSMAGIC 0xfed9
int cflags; /* copy of compile flags */
@@ -411,7 +412,7 @@ struct guts
struct cnfa search; /* for fast preliminary search */
int ntree;
struct colormap cmap;
int FUNCPTR(compare, (const chr *, const chr *, size_t));
int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t));
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
};