Compare commits
8 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
fa4a6afa3c | ||
|
4891a37694 | ||
|
7b43e63a7b | ||
|
4f4987da94 | ||
|
b174bc9ec6 | ||
|
b80c893495 | ||
|
efda8edb48 | ||
|
5493918356 |
@@ -1,20 +0,0 @@
|
||||
Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved.
|
||||
This software is not subject to any license of the American Telephone
|
||||
and Telegraph Company or of the Regents of the University of California.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose on
|
||||
any computer system, and to alter it and redistribute it, subject
|
||||
to the following restrictions:
|
||||
|
||||
1. The author is not responsible for the consequences of use of this
|
||||
software, no matter how awful, even if they arise from flaws in it.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission. Since few users ever read sources,
|
||||
credits must appear in the documentation.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software. Since few users
|
||||
ever read sources, credits must appear in the documentation.
|
||||
|
||||
4. This notice may not be removed or altered.
|
@@ -1,130 +0,0 @@
|
||||
# You probably want to take -DREDEBUG out of CFLAGS, and put something like
# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of
# internal assertion checking and some debugging facilities).
# Put -Dconst= in for a pre-ANSI compiler.
# Do not take -DPOSIX_MISTAKE out.
# REGCFLAGS isn't important to you (it's for my use in some special contexts).
CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS)

# If you have a pre-ANSI compiler, put -o into MKHFLAGS.  If you want
# the Berkeley __P macro, put -b in.
MKHFLAGS=

# Flags for linking but not compiling, if any.
LDFLAGS=

# Extra libraries for linking, if any.
LIBS=

# Internal stuff, should not need changing.
OBJPRODN=regcomp.o regexec.o regerror.o regfree.o
OBJS=$(OBJPRODN) split.o debug.o main.o
H=cclass.h cname.h regex2.h utils.h
REGSRC=regcomp.c regerror.c regexec.c regfree.c
ALLSRC=$(REGSRC) engine.c debug.c main.c split.c

# Stuff that matters only if you're trying to lint the package.
LINTFLAGS=-I. -Dstatic= -Dconst= -DREDEBUG
LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c
JUNKLINT=possible pointer alignment|null effect

# Targets that name actions rather than files; declaring them keeps a stray
# file called (say) "clean" or "r" from silently disabling the target.
.PHONY:	default lib purge r ra rx t l fullprint print cio rdf tidy clean spotless

# arrangements to build forward-reference header files
.SUFFIXES:	.ih .h
.c.ih:
	sh ./mkh $(MKHFLAGS) -p $< >$@

default:	r

# Builds the production objects from scratch and archives them.
# NOTE: this relies on prerequisites being processed left to right (purge
# first, then the objects); do not build this target with -j.
lib:	purge $(OBJPRODN)
	rm -f libregex.a
	ar crv libregex.a $(OBJPRODN)

purge:
	rm -f *.o

# stuff to build regex.h
REGEXH=regex.h
REGEXHSRC=regex2.h $(REGSRC)
$(REGEXH):	$(REGEXHSRC) mkh
	sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp
	cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h
	rm -f regex.tmp

# dependencies
$(OBJPRODN) debug.o:	utils.h regex.h regex2.h
regcomp.o:	cclass.h cname.h regcomp.ih
regexec.o:	engine.c engine.ih
regerror.o:	regerror.ih
debug.o:	debug.ih
# main.c includes <regex.h>, which -I. resolves to the generated ./regex.h,
# so main.o must wait for (and be rebuilt after) regex.h as well.
main.o:	main.ih regex.h

# tester
re:	$(OBJS)
	$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@

# regression test
r:	re tests
	./re <tests
	./re -el <tests
	./re -er <tests

# 57 variants, and other stuff, for development use -- not useful to you
# (the prerequisite is spelled "re", matching the rule above, so makes that
# do not equate "./re" with "re" can still find it)
ra:	re tests
	-./re <tests
	-./re -el <tests
	-./re -er <tests

rx:	re tests
	./re -x <tests
	./re -x -el <tests
	./re -x -er <tests

t:	re tests
	-time ./re <tests
	-time ./re -cs <tests
	-time ./re -el <tests
	-time ./re -cs -el <tests

# writes its filtered output to the file "lint" (removed by "tidy")
l:	$(LINTC)
	lint $(LINTFLAGS) -h $(LINTC) 2>&1 | egrep -v '$(JUNKLINT)' | tee lint

fullprint:
	ti README WHATSNEW notes todo | list
	ti *.h | list
	list *.c
	list regex.3 regex.7

print:
	ti README WHATSNEW notes todo | list
	ti *.h | list
	list reg*.c engine.c


mf.tmp:	Makefile
	sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@

DTRH=cclass.h cname.h regex2.h utils.h
PRE=COPYRIGHT README WHATSNEW
POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch]
FILES=$(PRE) Makefile $(POST)
DTR=$(PRE) Makefile=mf.tmp $(POST)
dtr:	$(FILES) mf.tmp
	makedtr $(DTR) >$@
	rm mf.tmp

cio:	$(FILES)
	cio $(FILES)

rdf:	$(FILES)
	rcsdiff -c $(FILES) 2>&1 | p

# various forms of cleanup
tidy:
	rm -f junk* core core.* *.core dtr *.tmp lint

clean:	tidy
	rm -f *.o *.s *.ih re libregex.a

# don't do this one unless you know what you're doing
spotless:	clean
	rm -f mkh regex.h
|
@@ -1,32 +0,0 @@
|
||||
alpha3.8 release.
|
||||
Tue Aug 10 15:51:48 EDT 1999
|
||||
henry@spsystems.net (formerly henry@zoo.toronto.edu)
|
||||
|
||||
See WHATSNEW for change listing.
|
||||
|
||||
installation notes:
|
||||
--------
|
||||
Read the comments at the beginning of Makefile before running.
|
||||
|
||||
Utils.h contains some things that just might have to be modified on
|
||||
some systems, as well as a nested include (ugh) of <assert.h>.
|
||||
|
||||
The "fake" directory contains quick-and-dirty fakes for some header
|
||||
files and routines that old systems may not have. Note also that
|
||||
-DUSEBCOPY will make utils.h substitute bcopy() for memmove().
|
||||
|
||||
After that, "make r" will build regcomp.o, regexec.o, regfree.o,
|
||||
and regerror.o (the actual routines), bundle them together into a test
|
||||
program, and run regression tests on them. No output is good output.
|
||||
|
||||
"make lib" builds just the .o files for the actual routines (when
|
||||
you're happy with testing and have adjusted CFLAGS for production),
|
||||
and puts them together into libregex.a. You can pick up either the
|
||||
library or *.o ("make lib" makes sure there are no other .o files left
|
||||
around to confuse things).
|
||||
|
||||
Main.c, debug.c, split.c are used for regression testing but are not part
|
||||
of the RE routines themselves.
|
||||
|
||||
Regex.h goes in /usr/include. All other .h files are internal only.
|
||||
--------
|
@@ -1,108 +0,0 @@
|
||||
New in alpha3.8: Bug fix for signed/unsigned mixup, found and fixed
|
||||
by the FreeBSD folks.
|
||||
|
||||
New in alpha3.7: A bit of cleanup aimed at maximizing portability,
|
||||
possibly at slight cost in efficiency. "ul" suffixes and "unsigned long"
|
||||
no longer appear, in particular.
|
||||
|
||||
New in alpha3.6: A couple more portability glitches fixed.
|
||||
|
||||
New in alpha3.5: Active development of this code has been stopped --
|
||||
I'm working on a complete reimplementation -- but folks have found some
|
||||
minor portability glitches and the like, hence this release to fix them.
|
||||
One penalty: slightly reduced compatibility with old compilers, because
|
||||
the ANSI C `unsigned long' type and `ul' constant suffix are used in a
|
||||
few places (I could avoid this but it would be considerably more work).
|
||||
|
||||
New in alpha3.4: The complex bug alluded to below has been fixed (in a
|
||||
slightly kludgey temporary way that may hurt efficiency a bit; this is
|
||||
another "get it out the door for 4.4" release). The tests at the end of
|
||||
the tests file have accordingly been uncommented. The primary sign of
|
||||
the bug was that something like a?b matching ab matched b rather than ab.
|
||||
(The bug was essentially specific to this exact situation, else it would
|
||||
have shown up earlier.)
|
||||
|
||||
New in alpha3.3: The definition of word boundaries has been altered
|
||||
slightly, to more closely match the usual programming notion that "_"
|
||||
is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir,
|
||||
and the makefile no longer alludes to it in mysterious ways. The
|
||||
makefile has generally been cleaned up some. Fixes have been made
|
||||
(again!) so that the regression test will run without -DREDEBUG, at
|
||||
the cost of weaker checking. A workaround for a bug in some folks'
|
||||
<assert.h> has been added. And some more things have been added to
|
||||
tests, including a couple right at the end which are commented out
|
||||
because the code currently flunks them (complex bug; fix coming).
|
||||
Plus the usual minor cleanup.
|
||||
|
||||
New in alpha3.2: Assorted bits of cleanup and portability improvement
|
||||
(the development base is now a BSDI system using GCC instead of an ancient
|
||||
Sun system, and the newer compiler exposed some glitches). Fix for a
|
||||
serious bug that affected REs using many [] (including REG_ICASE REs
|
||||
because of the way they are implemented), *sometimes*, depending on
|
||||
memory-allocation patterns. The header-file prototypes no longer name
|
||||
the parameters, avoiding possible name conflicts. The possibility that
|
||||
some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is
|
||||
now handled gracefully. "uchar" is no longer used as an internal type
|
||||
name (too many people have the same idea). Still the same old lousy
|
||||
performance, alas.
|
||||
|
||||
New in alpha3.1: Basically nothing, this release is just a bookkeeping
|
||||
convenience. Stay tuned.
|
||||
|
||||
New in alpha3.0: Performance is no better, alas, but some fixes have been
|
||||
made and some functionality has been added. (This is basically the "get
|
||||
it out the door in time for 4.4" release.) One bug fix: regfree() didn't
|
||||
free the main internal structure (how embarrassing). It is now possible
|
||||
to put NULs in either the RE or the target string, using (resp.) a new
|
||||
REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to
|
||||
regcomp() makes all characters ordinary, so you can match a literal
|
||||
string easily (this will become more useful when performance improves!).
|
||||
There are now primitives to match beginnings and ends of words, although
|
||||
the syntax is disgusting and so is the implementation. The REG_ATOI
|
||||
debugging interface has changed a bit. And there has been considerable
|
||||
internal cleanup of various kinds.
|
||||
|
||||
New in alpha2.3: Split change list out of README, and moved flags notes
|
||||
into Makefile. Macro-ized the name of regex(7) in regex(3), since it has
|
||||
to change for 4.4BSD. Cleanup work in engine.c, and some new regression
|
||||
tests to catch tricky cases thereof.
|
||||
|
||||
New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two
|
||||
small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges
|
||||
in my own test program and might be useful to others for similar purposes.
|
||||
The regression test will now compile (and run) without REDEBUG. The
|
||||
BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now.
|
||||
Char/uchar parameters are now written int/unsigned, to avoid possible
|
||||
portability problems with unpromoted parameters. Some unsigned casts have
|
||||
been introduced to minimize portability problems with shifting into sign
|
||||
bits.
|
||||
|
||||
New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big
|
||||
thing is that regex.h is now generated, using mkh, rather than being
|
||||
supplied in the distribution; due to circularities in dependencies,
|
||||
you have to build regex.h explicitly by "make h". The two known bugs
|
||||
have been fixed (and the regression test now checks for them), as has a
|
||||
problem with assertions not being suppressed in the absence of REDEBUG.
|
||||
No performance work yet.
|
||||
|
||||
New in alpha2: Backslash-anything is an ordinary character, not an
|
||||
error (except, of course, for the handful of backslashed metacharacters
|
||||
in BREs), which should reduce script breakage. The regression test
|
||||
checks *where* null strings are supposed to match, and has generally
|
||||
been tightened up somewhat. Small bug fixes in parameter passing (not
|
||||
harmful, but technically errors) and some other areas. Debugging
|
||||
invoked by defining REDEBUG rather than not defining NDEBUG.
|
||||
|
||||
New in alpha+3: full prototyping for internal routines, using a little
|
||||
helper program, mkh, which extracts prototypes given in stylized comments.
|
||||
More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple
|
||||
pre-screening of input when a literal string is known to be part of the
|
||||
RE; this does wonders for performance.
|
||||
|
||||
New in alpha+2: minor bits of cleanup. Notably, the number "32" for the
|
||||
word width isn't hardwired into regexec.c any more, the public header
|
||||
file prototypes the functions if __STDC__ is defined, and some small typos
|
||||
in the manpages have been fixed.
|
||||
|
||||
New in alpha+1: improvements to the manual pages, and an important
|
||||
extension, the REG_STARTEND option to regexec().
|
@@ -1,31 +0,0 @@
|
||||
/*
 * character-class table
 *
 * One row per named class: the class name, the single characters that
 * belong to it, and its multi-character members (none in any row here).
 * A row whose name is NULL terminates the table.
 */
static struct cclass {
	char *name;	/* class name */
	char *chars;	/* member characters */
	char *multis;	/* multi-character members */
} cclasses[] = {
	{ "alnum",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
0123456789",
			"" },
	{ "alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
			"" },
	{ "blank",	" \t",
			"" },
	{ "cntrl",	"\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
\25\26\27\30\31\32\33\34\35\36\37\177",
			"" },
	{ "digit",	"0123456789",
			"" },
	{ "graph",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
			"" },
	{ "lower",	"abcdefghijklmnopqrstuvwxyz",
			"" },
	{ "print",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
			"" },
	{ "punct",	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
			"" },
	{ "space",	"\t\n\v\f\r ",
			"" },
	{ "upper",	"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
			"" },
	{ "xdigit",	"0123456789ABCDEFabcdef",
			"" },
	{ NULL,		0,
			"" }
};
|
@@ -1,102 +0,0 @@
|
||||
/*
 * character-name table
 *
 * Maps a symbolic character name to its character code.  Several
 * characters have more than one name.  A row whose name is NULL
 * terminates the table.
 */
static struct cname {
	char *name;	/* symbolic name */
	char code;	/* the character it names */
} cnames[] = {
	{ "NUL",			'\0' },
	{ "SOH",			'\001' },
	{ "STX",			'\002' },
	{ "ETX",			'\003' },
	{ "EOT",			'\004' },
	{ "ENQ",			'\005' },
	{ "ACK",			'\006' },
	{ "BEL",			'\007' },
	{ "alert",			'\007' },
	{ "BS",				'\010' },
	{ "backspace",			'\b' },
	{ "HT",				'\011' },
	{ "tab",			'\t' },
	{ "LF",				'\012' },
	{ "newline",			'\n' },
	{ "VT",				'\013' },
	{ "vertical-tab",		'\v' },
	{ "FF",				'\014' },
	{ "form-feed",			'\f' },
	{ "CR",				'\015' },
	{ "carriage-return",		'\r' },
	{ "SO",				'\016' },
	{ "SI",				'\017' },
	{ "DLE",			'\020' },
	{ "DC1",			'\021' },
	{ "DC2",			'\022' },
	{ "DC3",			'\023' },
	{ "DC4",			'\024' },
	{ "NAK",			'\025' },
	{ "SYN",			'\026' },
	{ "ETB",			'\027' },
	{ "CAN",			'\030' },
	{ "EM",				'\031' },
	{ "SUB",			'\032' },
	{ "ESC",			'\033' },
	{ "IS4",			'\034' },
	{ "FS",				'\034' },
	{ "IS3",			'\035' },
	{ "GS",				'\035' },
	{ "IS2",			'\036' },
	{ "RS",				'\036' },
	{ "IS1",			'\037' },
	{ "US",				'\037' },
	{ "space",			' ' },
	{ "exclamation-mark",		'!' },
	{ "quotation-mark",		'"' },
	{ "number-sign",		'#' },
	{ "dollar-sign",		'$' },
	{ "percent-sign",		'%' },
	{ "ampersand",			'&' },
	{ "apostrophe",			'\'' },
	{ "left-parenthesis",		'(' },
	{ "right-parenthesis",		')' },
	{ "asterisk",			'*' },
	{ "plus-sign",			'+' },
	{ "comma",			',' },
	{ "hyphen",			'-' },
	{ "hyphen-minus",		'-' },
	{ "period",			'.' },
	{ "full-stop",			'.' },
	{ "slash",			'/' },
	{ "solidus",			'/' },
	{ "zero",			'0' },
	{ "one",			'1' },
	{ "two",			'2' },
	{ "three",			'3' },
	{ "four",			'4' },
	{ "five",			'5' },
	{ "six",			'6' },
	{ "seven",			'7' },
	{ "eight",			'8' },
	{ "nine",			'9' },
	{ "colon",			':' },
	{ "semicolon",			';' },
	{ "less-than-sign",		'<' },
	{ "equals-sign",		'=' },
	{ "greater-than-sign",		'>' },
	{ "question-mark",		'?' },
	{ "commercial-at",		'@' },
	{ "left-square-bracket",	'[' },
	{ "backslash",			'\\' },
	{ "reverse-solidus",		'\\' },
	{ "right-square-bracket",	']' },
	{ "circumflex",			'^' },
	{ "circumflex-accent",		'^' },
	{ "underscore",			'_' },
	{ "low-line",			'_' },
	{ "grave-accent",		'`' },
	{ "left-brace",			'{' },
	{ "left-curly-bracket",		'{' },
	{ "vertical-line",		'|' },
	{ "right-brace",		'}' },
	{ "right-curly-bracket",	'}' },
	{ "tilde",			'~' },
	{ "DEL",			'\177' },
	{ NULL,				0 },
};
|
@@ -1,242 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <regex.h>
|
||||
|
||||
#include "utils.h"
|
||||
#include "regex2.h"
|
||||
#include "debug.ih"
|
||||
|
||||
/*
|
||||
- regprint - print a regexp for debugging
|
||||
== void regprint(regex_t *r, FILE *d);
|
||||
*/
|
||||
void
|
||||
regprint(r, d)
|
||||
regex_t *r;
|
||||
FILE *d;
|
||||
{
|
||||
register struct re_guts *g = r->re_g;
|
||||
register int i;
|
||||
register int c;
|
||||
register int last;
|
||||
int nincat[NC];
|
||||
|
||||
fprintf(d, "%ld states, %d categories", (long)g->nstates,
|
||||
g->ncategories);
|
||||
fprintf(d, ", first %ld last %ld", (long)g->firststate,
|
||||
(long)g->laststate);
|
||||
if (g->iflags&USEBOL)
|
||||
fprintf(d, ", USEBOL");
|
||||
if (g->iflags&USEEOL)
|
||||
fprintf(d, ", USEEOL");
|
||||
if (g->iflags&BAD)
|
||||
fprintf(d, ", BAD");
|
||||
if (g->nsub > 0)
|
||||
fprintf(d, ", nsub=%ld", (long)g->nsub);
|
||||
if (g->must != NULL)
|
||||
fprintf(d, ", must(%ld) `%*s'", (long)g->mlen, (int)g->mlen,
|
||||
g->must);
|
||||
if (g->backrefs)
|
||||
fprintf(d, ", backrefs");
|
||||
if (g->nplus > 0)
|
||||
fprintf(d, ", nplus %ld", (long)g->nplus);
|
||||
fprintf(d, "\n");
|
||||
s_print(g, d);
|
||||
for (i = 0; i < g->ncategories; i++) {
|
||||
nincat[i] = 0;
|
||||
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
|
||||
if (g->categories[c] == i)
|
||||
nincat[i]++;
|
||||
}
|
||||
fprintf(d, "cc0#%d", nincat[0]);
|
||||
for (i = 1; i < g->ncategories; i++)
|
||||
if (nincat[i] == 1) {
|
||||
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
|
||||
if (g->categories[c] == i)
|
||||
break;
|
||||
fprintf(d, ", %d=%s", i, regchar(c));
|
||||
}
|
||||
fprintf(d, "\n");
|
||||
for (i = 1; i < g->ncategories; i++)
|
||||
if (nincat[i] != 1) {
|
||||
fprintf(d, "cc%d\t", i);
|
||||
last = -1;
|
||||
for (c = CHAR_MIN; c <= CHAR_MAX+1; c++) /* +1 does flush */
|
||||
if (c <= CHAR_MAX && g->categories[c] == i) {
|
||||
if (last < 0) {
|
||||
fprintf(d, "%s", regchar(c));
|
||||
last = c;
|
||||
}
|
||||
} else {
|
||||
if (last >= 0) {
|
||||
if (last != c-1)
|
||||
fprintf(d, "-%s",
|
||||
regchar(c-1));
|
||||
last = -1;
|
||||
}
|
||||
}
|
||||
fprintf(d, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
- s_print - print the strip for debugging
|
||||
== static void s_print(register struct re_guts *g, FILE *d);
|
||||
*/
|
||||
static void
|
||||
s_print(g, d)
|
||||
register struct re_guts *g;
|
||||
FILE *d;
|
||||
{
|
||||
register sop *s;
|
||||
register cset *cs;
|
||||
register int i;
|
||||
register int done = 0;
|
||||
register sop opnd;
|
||||
register int col = 0;
|
||||
register int last;
|
||||
register sopno offset = 2;
|
||||
# define GAP() { if (offset % 5 == 0) { \
|
||||
if (col > 40) { \
|
||||
fprintf(d, "\n\t"); \
|
||||
col = 0; \
|
||||
} else { \
|
||||
fprintf(d, " "); \
|
||||
col++; \
|
||||
} \
|
||||
} else \
|
||||
col++; \
|
||||
offset++; \
|
||||
}
|
||||
|
||||
if (OP(g->strip[0]) != OEND)
|
||||
fprintf(d, "missing initial OEND!\n");
|
||||
for (s = &g->strip[1]; !done; s++) {
|
||||
opnd = OPND(*s);
|
||||
switch (OP(*s)) {
|
||||
case OEND:
|
||||
fprintf(d, "\n");
|
||||
done = 1;
|
||||
break;
|
||||
case OCHAR:
|
||||
if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL)
|
||||
fprintf(d, "\\%c", (char)opnd);
|
||||
else
|
||||
fprintf(d, "%s", regchar((char)opnd));
|
||||
break;
|
||||
case OBOL:
|
||||
fprintf(d, "^");
|
||||
break;
|
||||
case OEOL:
|
||||
fprintf(d, "$");
|
||||
break;
|
||||
case OBOW:
|
||||
fprintf(d, "\\{");
|
||||
break;
|
||||
case OEOW:
|
||||
fprintf(d, "\\}");
|
||||
break;
|
||||
case OANY:
|
||||
fprintf(d, ".");
|
||||
break;
|
||||
case OANYOF:
|
||||
fprintf(d, "[(%ld)", (long)opnd);
|
||||
cs = &g->sets[opnd];
|
||||
last = -1;
|
||||
for (i = 0; i < g->csetsize+1; i++) /* +1 flushes */
|
||||
if (CHIN(cs, i) && i < g->csetsize) {
|
||||
if (last < 0) {
|
||||
fprintf(d, "%s", regchar(i));
|
||||
last = i;
|
||||
}
|
||||
} else {
|
||||
if (last >= 0) {
|
||||
if (last != i-1)
|
||||
fprintf(d, "-%s",
|
||||
regchar(i-1));
|
||||
last = -1;
|
||||
}
|
||||
}
|
||||
fprintf(d, "]");
|
||||
break;
|
||||
case OBACK_:
|
||||
fprintf(d, "(\\<%ld>", (long)opnd);
|
||||
break;
|
||||
case O_BACK:
|
||||
fprintf(d, "<%ld>\\)", (long)opnd);
|
||||
break;
|
||||
case OPLUS_:
|
||||
fprintf(d, "(+");
|
||||
if (OP(*(s+opnd)) != O_PLUS)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
break;
|
||||
case O_PLUS:
|
||||
if (OP(*(s-opnd)) != OPLUS_)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
fprintf(d, "+)");
|
||||
break;
|
||||
case OQUEST_:
|
||||
fprintf(d, "(?");
|
||||
if (OP(*(s+opnd)) != O_QUEST)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
break;
|
||||
case O_QUEST:
|
||||
if (OP(*(s-opnd)) != OQUEST_)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
fprintf(d, "?)");
|
||||
break;
|
||||
case OLPAREN:
|
||||
fprintf(d, "((<%ld>", (long)opnd);
|
||||
break;
|
||||
case ORPAREN:
|
||||
fprintf(d, "<%ld>))", (long)opnd);
|
||||
break;
|
||||
case OCH_:
|
||||
fprintf(d, "<");
|
||||
if (OP(*(s+opnd)) != OOR2)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
break;
|
||||
case OOR1:
|
||||
if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
fprintf(d, "|");
|
||||
break;
|
||||
case OOR2:
|
||||
fprintf(d, "|");
|
||||
if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
break;
|
||||
case O_CH:
|
||||
if (OP(*(s-opnd)) != OOR1)
|
||||
fprintf(d, "<%ld>", (long)opnd);
|
||||
fprintf(d, ">");
|
||||
break;
|
||||
default:
|
||||
fprintf(d, "!%d(%d)!", OP(*s), opnd);
|
||||
break;
|
||||
}
|
||||
if (!done)
|
||||
GAP();
|
||||
}
|
||||
}
|
||||
|
||||
/*
 - regchar - make a character printable
 == static char *regchar(int ch);
 *
 * Returns a pointer to a static buffer, overwritten by the next call,
 * holding either the character itself (if printable, or a space) or a
 * backslash followed by its octal code.
 *
 * ch is reduced to its unsigned char value first.  Callers pass plain-char
 * values, which are negative for high-bit characters where char is signed;
 * handing those to isprint() is undefined, and "%o" of a negative int needs
 * more room than buf provides.
 */
static char *			/* -> representation */
regchar(ch)
int ch;
{
	static char buf[10];
	register int uc = (unsigned char)ch;

	if (isprint(uc) || uc == ' ')
		sprintf(buf, "%c", uc);
	else
		sprintf(buf, "\\%o", uc);
	return(buf);
}
|
1019
src/regex/engine.c
1019
src/regex/engine.c
File diff suppressed because it is too large
Load Diff
510
src/regex/main.c
510
src/regex/main.c
@@ -1,510 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <regex.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "main.ih"
|
||||
|
||||
/* state shared by the functions of the test driver */
char *progname;			/* argv[0], for the usage message */
int status = 0;			/* exit status; set to 1 when a check fails */
int line = 0;			/* number of the test-input line being read */
int debug = 0;			/* bumped by each -x on the command line */

/* defaults, adjustable from the command line */
int copts = REG_EXTENDED;	/* base regcomp() flags (-c) */
int eopts = 0;			/* base regexec() flags (-e) */
regoff_t startoff = 0;		/* -S: start offset */
regoff_t endoff = 0;		/* -E: end offset */


/* helpers that live in other source files */
extern void regprint();
extern int split();
|
||||
|
||||
/*
|
||||
- main - do the simple case, hand off to regress() for regression
|
||||
*/
|
||||
main(argc, argv)
|
||||
int argc;
|
||||
char *argv[];
|
||||
{
|
||||
regex_t re;
|
||||
# define NS 10
|
||||
regmatch_t subs[NS];
|
||||
char erbuf[100];
|
||||
int err;
|
||||
size_t len;
|
||||
int c;
|
||||
int errflg = 0;
|
||||
register int i;
|
||||
extern int optind;
|
||||
extern char *optarg;
|
||||
|
||||
progname = argv[0];
|
||||
|
||||
while ((c = getopt(argc, argv, "c:e:S:E:x")) != EOF)
|
||||
switch (c) {
|
||||
case 'c': /* compile options */
|
||||
copts = options('c', optarg);
|
||||
break;
|
||||
case 'e': /* execute options */
|
||||
eopts = options('e', optarg);
|
||||
break;
|
||||
case 'S': /* start offset */
|
||||
startoff = (regoff_t)atoi(optarg);
|
||||
break;
|
||||
case 'E': /* end offset */
|
||||
endoff = (regoff_t)atoi(optarg);
|
||||
break;
|
||||
case 'x': /* Debugging. */
|
||||
debug++;
|
||||
break;
|
||||
case '?':
|
||||
default:
|
||||
errflg++;
|
||||
break;
|
||||
}
|
||||
if (errflg) {
|
||||
fprintf(stderr, "usage: %s ", progname);
|
||||
fprintf(stderr, "[-c copt][-C][-d] [re]\n");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if (optind >= argc) {
|
||||
regress(stdin);
|
||||
exit(status);
|
||||
}
|
||||
|
||||
err = regcomp(&re, argv[optind++], copts);
|
||||
if (err) {
|
||||
len = regerror(err, &re, erbuf, sizeof(erbuf));
|
||||
fprintf(stderr, "error %s, %d/%d `%s'\n",
|
||||
eprint(err), len, sizeof(erbuf), erbuf);
|
||||
exit(status);
|
||||
}
|
||||
regprint(&re, stdout);
|
||||
|
||||
if (optind >= argc) {
|
||||
regfree(&re);
|
||||
exit(status);
|
||||
}
|
||||
|
||||
if (eopts®_STARTEND) {
|
||||
subs[0].rm_so = startoff;
|
||||
subs[0].rm_eo = strlen(argv[optind]) - endoff;
|
||||
}
|
||||
err = regexec(&re, argv[optind], (size_t)NS, subs, eopts);
|
||||
if (err) {
|
||||
len = regerror(err, &re, erbuf, sizeof(erbuf));
|
||||
fprintf(stderr, "error %s, %d/%d `%s'\n",
|
||||
eprint(err), len, sizeof(erbuf), erbuf);
|
||||
exit(status);
|
||||
}
|
||||
if (!(copts®_NOSUB)) {
|
||||
len = (int)(subs[0].rm_eo - subs[0].rm_so);
|
||||
if (subs[0].rm_so != -1) {
|
||||
if (len != 0)
|
||||
printf("match `%.*s'\n", len,
|
||||
argv[optind] + subs[0].rm_so);
|
||||
else
|
||||
printf("match `'@%.1s\n",
|
||||
argv[optind] + subs[0].rm_so);
|
||||
}
|
||||
for (i = 1; i < NS; i++)
|
||||
if (subs[i].rm_so != -1)
|
||||
printf("(%d) `%.*s'\n", i,
|
||||
(int)(subs[i].rm_eo - subs[i].rm_so),
|
||||
argv[optind] + subs[i].rm_so);
|
||||
}
|
||||
exit(status);
|
||||
}
|
||||
|
||||
/*
|
||||
- regress - main loop of regression test
|
||||
== void regress(FILE *in);
|
||||
*/
|
||||
void
|
||||
regress(in)
|
||||
FILE *in;
|
||||
{
|
||||
char inbuf[1000];
|
||||
# define MAXF 10
|
||||
char *f[MAXF];
|
||||
int nf;
|
||||
int i;
|
||||
char erbuf[100];
|
||||
size_t ne;
|
||||
char *badpat = "invalid regular expression";
|
||||
# define SHORT 10
|
||||
char *bpname = "REG_BADPAT";
|
||||
regex_t re;
|
||||
|
||||
while (fgets(inbuf, sizeof(inbuf), in) != NULL) {
|
||||
line++;
|
||||
if (inbuf[0] == '#' || inbuf[0] == '\n')
|
||||
continue; /* NOTE CONTINUE */
|
||||
inbuf[strlen(inbuf)-1] = '\0'; /* get rid of stupid \n */
|
||||
if (debug)
|
||||
fprintf(stdout, "%d:\n", line);
|
||||
nf = split(inbuf, f, MAXF, "\t\t");
|
||||
if (nf < 3) {
|
||||
fprintf(stderr, "bad input, line %d\n", line);
|
||||
exit(1);
|
||||
}
|
||||
for (i = 0; i < nf; i++)
|
||||
if (strcmp(f[i], "\"\"") == 0)
|
||||
f[i] = "";
|
||||
if (nf <= 3)
|
||||
f[3] = NULL;
|
||||
if (nf <= 4)
|
||||
f[4] = NULL;
|
||||
try(f[0], f[1], f[2], f[3], f[4], options('c', f[1]));
|
||||
if (opt('&', f[1])) /* try with either type of RE */
|
||||
try(f[0], f[1], f[2], f[3], f[4],
|
||||
options('c', f[1]) &~ REG_EXTENDED);
|
||||
}
|
||||
|
||||
ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf));
|
||||
if (strcmp(erbuf, badpat) != 0 || ne != strlen(badpat)+1) {
|
||||
fprintf(stderr, "end: regerror() test gave `%s' not `%s'\n",
|
||||
erbuf, badpat);
|
||||
status = 1;
|
||||
}
|
||||
ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, (size_t)SHORT);
|
||||
if (strncmp(erbuf, badpat, SHORT-1) != 0 || erbuf[SHORT-1] != '\0' ||
|
||||
ne != strlen(badpat)+1) {
|
||||
fprintf(stderr, "end: regerror() short test gave `%s' not `%.*s'\n",
|
||||
erbuf, SHORT-1, badpat);
|
||||
status = 1;
|
||||
}
|
||||
ne = regerror(REG_ITOA|REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf));
|
||||
if (strcmp(erbuf, bpname) != 0 || ne != strlen(bpname)+1) {
|
||||
fprintf(stderr, "end: regerror() ITOA test gave `%s' not `%s'\n",
|
||||
erbuf, bpname);
|
||||
status = 1;
|
||||
}
|
||||
re.re_endp = bpname;
|
||||
ne = regerror(REG_ATOI, &re, erbuf, sizeof(erbuf));
|
||||
if (atoi(erbuf) != (int)REG_BADPAT) {
|
||||
fprintf(stderr, "end: regerror() ATOI test gave `%s' not `%ld'\n",
|
||||
erbuf, (long)REG_BADPAT);
|
||||
status = 1;
|
||||
} else if (ne != strlen(erbuf)+1) {
|
||||
fprintf(stderr, "end: regerror() ATOI test len(`%s') = %ld\n",
|
||||
erbuf, (long)REG_BADPAT);
|
||||
status = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
- try - try it, and report on problems
|
||||
== void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts);
|
||||
*/
|
||||
void
|
||||
try(f0, f1, f2, f3, f4, opts)
|
||||
char *f0;
|
||||
char *f1;
|
||||
char *f2;
|
||||
char *f3;
|
||||
char *f4;
|
||||
int opts; /* may not match f1 */
|
||||
{
|
||||
regex_t re;
|
||||
# define NSUBS 10
|
||||
regmatch_t subs[NSUBS];
|
||||
# define NSHOULD 15
|
||||
char *should[NSHOULD];
|
||||
int nshould;
|
||||
char erbuf[100];
|
||||
int err;
|
||||
int len;
|
||||
char *type = (opts & REG_EXTENDED) ? "ERE" : "BRE";
|
||||
register int i;
|
||||
char *grump;
|
||||
char f0copy[1000];
|
||||
char f2copy[1000];
|
||||
|
||||
strcpy(f0copy, f0);
|
||||
re.re_endp = (opts®_PEND) ? f0copy + strlen(f0copy) : NULL;
|
||||
fixstr(f0copy);
|
||||
err = regcomp(&re, f0copy, opts);
|
||||
if (err != 0 && (!opt('C', f1) || err != efind(f2))) {
|
||||
/* unexpected error or wrong error */
|
||||
len = regerror(err, &re, erbuf, sizeof(erbuf));
|
||||
fprintf(stderr, "%d: %s error %s, %d/%d `%s'\n",
|
||||
line, type, eprint(err), len,
|
||||
sizeof(erbuf), erbuf);
|
||||
status = 1;
|
||||
} else if (err == 0 && opt('C', f1)) {
|
||||
/* unexpected success */
|
||||
fprintf(stderr, "%d: %s should have given REG_%s\n",
|
||||
line, type, f2);
|
||||
status = 1;
|
||||
err = 1; /* so we won't try regexec */
|
||||
}
|
||||
|
||||
if (err != 0) {
|
||||
regfree(&re);
|
||||
return;
|
||||
}
|
||||
|
||||
strcpy(f2copy, f2);
|
||||
fixstr(f2copy);
|
||||
|
||||
if (options('e', f1)®_STARTEND) {
|
||||
if (strchr(f2, '(') == NULL || strchr(f2, ')') == NULL)
|
||||
fprintf(stderr, "%d: bad STARTEND syntax\n", line);
|
||||
subs[0].rm_so = strchr(f2, '(') - f2 + 1;
|
||||
subs[0].rm_eo = strchr(f2, ')') - f2;
|
||||
}
|
||||
err = regexec(&re, f2copy, NSUBS, subs, options('e', f1));
|
||||
|
||||
if (err != 0 && (f3 != NULL || err != REG_NOMATCH)) {
|
||||
/* unexpected error or wrong error */
|
||||
len = regerror(err, &re, erbuf, sizeof(erbuf));
|
||||
fprintf(stderr, "%d: %s exec error %s, %d/%d `%s'\n",
|
||||
line, type, eprint(err), len,
|
||||
sizeof(erbuf), erbuf);
|
||||
status = 1;
|
||||
} else if (err != 0) {
|
||||
/* nothing more to check */
|
||||
} else if (f3 == NULL) {
|
||||
/* unexpected success */
|
||||
fprintf(stderr, "%d: %s exec should have failed\n",
|
||||
line, type);
|
||||
status = 1;
|
||||
err = 1; /* just on principle */
|
||||
} else if (opts®_NOSUB) {
|
||||
/* nothing more to check */
|
||||
} else if ((grump = check(f2, subs[0], f3)) != NULL) {
|
||||
fprintf(stderr, "%d: %s %s\n", line, type, grump);
|
||||
status = 1;
|
||||
err = 1;
|
||||
}
|
||||
|
||||
if (err != 0 || f4 == NULL) {
|
||||
regfree(&re);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 1; i < NSHOULD; i++)
|
||||
should[i] = NULL;
|
||||
nshould = split(f4, should+1, NSHOULD-1, ",");
|
||||
if (nshould == 0) {
|
||||
nshould = 1;
|
||||
should[1] = "";
|
||||
}
|
||||
for (i = 1; i < NSUBS; i++) {
|
||||
grump = check(f2, subs[i], should[i]);
|
||||
if (grump != NULL) {
|
||||
fprintf(stderr, "%d: %s $%d %s\n", line,
|
||||
type, i, grump);
|
||||
status = 1;
|
||||
err = 1;
|
||||
}
|
||||
}
|
||||
|
||||
regfree(&re);
|
||||
}
|
||||
|
||||
/*
|
||||
- options - pick options out of a regression-test string
|
||||
== int options(int type, char *s);
|
||||
*/
|
||||
int
|
||||
options(type, s)
|
||||
int type; /* 'c' compile, 'e' exec */
|
||||
char *s;
|
||||
{
|
||||
register char *p;
|
||||
register int o = (type == 'c') ? copts : eopts;
|
||||
register char *legal = (type == 'c') ? "bisnmp" : "^$#tl";
|
||||
|
||||
for (p = s; *p != '\0'; p++)
|
||||
if (strchr(legal, *p) != NULL)
|
||||
switch (*p) {
|
||||
case 'b':
|
||||
o &= ~REG_EXTENDED;
|
||||
break;
|
||||
case 'i':
|
||||
o |= REG_ICASE;
|
||||
break;
|
||||
case 's':
|
||||
o |= REG_NOSUB;
|
||||
break;
|
||||
case 'n':
|
||||
o |= REG_NEWLINE;
|
||||
break;
|
||||
case 'm':
|
||||
o &= ~REG_EXTENDED;
|
||||
o |= REG_NOSPEC;
|
||||
break;
|
||||
case 'p':
|
||||
o |= REG_PEND;
|
||||
break;
|
||||
case '^':
|
||||
o |= REG_NOTBOL;
|
||||
break;
|
||||
case '$':
|
||||
o |= REG_NOTEOL;
|
||||
break;
|
||||
case '#':
|
||||
o |= REG_STARTEND;
|
||||
break;
|
||||
case 't': /* trace */
|
||||
o |= REG_TRACE;
|
||||
break;
|
||||
case 'l': /* force long representation */
|
||||
o |= REG_LARGE;
|
||||
break;
|
||||
case 'r': /* force backref use */
|
||||
o |= REG_BACKR;
|
||||
break;
|
||||
}
|
||||
return(o);
|
||||
}
|
||||
|
||||
/*
 - opt - is a particular option in a regression string?
 == int opt(int c, char *s);
 *
 * Returns 1 when strchr() finds c in s, 0 otherwise.
 */
int				/* predicate */
opt(c, s)
int c;
char *s;
{
	if (strchr(s, c) == NULL)
		return(0);
	return(1);
}
|
||||
|
||||
/*
 - fixstr - transform magic characters in strings
 == void fixstr(register char *p);
 *
 * Rewrites the string in place: 'N' becomes newline, 'T' becomes tab,
 * 'S' becomes space and 'Z' becomes NUL.  A NULL pointer is accepted and
 * ignored.  The scan only stops at a NUL that was already in the input;
 * a NUL produced from 'Z' is stepped over before the next test.
 */
void
fixstr(p)
register char *p;
{
	register char *cp;

	if (p == NULL)
		return;

	for (cp = p; *cp != '\0'; cp++) {
		switch (*cp) {
		case 'N':
			*cp = '\n';
			break;
		case 'T':
			*cp = '\t';
			break;
		case 'S':
			*cp = ' ';
			break;
		case 'Z':
			*cp = '\0';
			break;
		default:
			break;
		}
	}
}
|
||||
|
||||
/*
 - check - check a substring match
 == char *check(char *str, regmatch_t sub, char *should);
 *
 * Compares the reported match offsets in sub (relative to str) against
 * the expectation in should:
 *	NULL or "-"	the subexpression must not have matched
 *	"@text"		a null match is expected, located where str
 *			continues with text ("@" alone: at end of string)
 *	anything else	the exact text that must have matched
 * Returns NULL when the expectation holds, otherwise a complaint.  The
 * complaint is either a string literal or a static buffer that the next
 * call overwrites.
 */
char *				/* NULL or complaint */
check(str, sub, should)
char *str;
regmatch_t sub;
char *should;
{
	register int len;
	register int shlen;
	register int shown;	/* how much of the match gets printed */
	register char *p;
	static char grump[500];
	register char *at = NULL;

	if (should != NULL && strcmp(should, "-") == 0)
		should = NULL;
	if (should != NULL && should[0] == '@') {
		at = should + 1;
		should = "";
	}

	/* check rm_so and rm_eo for consistency */
	if (sub.rm_so > sub.rm_eo || (sub.rm_so == -1 && sub.rm_eo != -1) ||
				(sub.rm_so != -1 && sub.rm_eo == -1) ||
				(sub.rm_so != -1 && sub.rm_so < 0) ||
				(sub.rm_eo != -1 && sub.rm_eo < 0) ) {
		sprintf(grump, "start %ld end %ld", (long)sub.rm_so,
							(long)sub.rm_eo);
		return(grump);
	}

	/* check for no match */
	if (sub.rm_so == -1 && should == NULL)
		return(NULL);
	if (sub.rm_so == -1)
		return("did not match");

	/* check for in range; rm_eo is known to be non-negative by now */
	if ((size_t)sub.rm_eo > strlen(str)) {
		sprintf(grump, "start %ld end %ld, past end of string",
					(long)sub.rm_so, (long)sub.rm_eo);
		return(grump);
	}

	len = (int)(sub.rm_eo - sub.rm_so);
	p = str + sub.rm_so;

	/* never print more of the match than grump can hold */
	shown = len;
	if (shown > (int)sizeof(grump) - 32)
		shown = (int)sizeof(grump) - 32;

	/*
	 * check for not supposed to match -- this has to come before
	 * strlen(should), which must never see a NULL pointer
	 */
	if (should == NULL) {
		sprintf(grump, "matched `%.*s'", shown, p);
		return(grump);
	}
	shlen = (int)strlen(should);

	/* check for wrong match */
	if (len != shlen || strncmp(p, should, (size_t)shlen) != 0) {
		sprintf(grump, "matched `%.*s' instead", shown, p);
		return(grump);
	}
	if (shlen > 0)
		return(NULL);

	/* check null match in right place */
	if (at == NULL)
		return(NULL);
	shlen = (int)strlen(at);
	if (shlen == 0)
		shlen = 1;	/* force check for end-of-string */
	if (strncmp(p, at, (size_t)shlen) != 0) {
		sprintf(grump, "matched null at `%.20s'", p);
		return(grump);
	}
	return(NULL);
}
|
||||
|
||||
/*
|
||||
- eprint - convert error number to name
|
||||
== static char *eprint(int err);
|
||||
*/
|
||||
static char *
|
||||
eprint(err)
|
||||
int err;
|
||||
{
|
||||
static char epbuf[100];
|
||||
size_t len;
|
||||
|
||||
len = regerror(REG_ITOA|err, (regex_t *)NULL, epbuf, sizeof(epbuf));
|
||||
assert(len <= sizeof(epbuf));
|
||||
return(epbuf);
|
||||
}
|
||||
|
||||
/*
|
||||
- efind - convert error name to number
|
||||
== static int efind(char *name);
|
||||
*/
|
||||
static int
|
||||
efind(name)
|
||||
char *name;
|
||||
{
|
||||
static char efbuf[100];
|
||||
size_t n;
|
||||
regex_t re;
|
||||
|
||||
sprintf(efbuf, "REG_%s", name);
|
||||
assert(strlen(efbuf) < sizeof(efbuf));
|
||||
re.re_endp = efbuf;
|
||||
(void) regerror(REG_ATOI, &re, efbuf, sizeof(efbuf));
|
||||
return(atoi(efbuf));
|
||||
}
|
@@ -1,76 +0,0 @@
|
||||
#! /bin/sh
# mkh - pull headers out of C source
#
# usage: mkh [-o] [-b] [-s] [-p] [-i name] file ...
#
# Lines in the input files that carry the marker selected below are
# extracted, stripped of the marker, run through a sed clean-up program
# and written to standard output, wrapped in extern "C" guards (and, with
# -i, in an #ifndef/#define/#endif guard named by the argument).
PATH=/bin:/usr/bin ; export PATH

# egrep pattern to pick out marked lines
# NOTE(review): the bracket expressions in this script appear as `[ ]`;
# confirm against the upstream copy whether they also held a tab.
egrep='^ =([ ]|$)'

# Sed program to process marked lines into lines for the header file.
# The markers have already been removed.  Two things are done here:  removal
# of backslashed newlines, and some fudging of comments.  The first is done
# because -o needs to have prototypes on one line to strip them down.
# Getting comments into the output is tricky; we turn C++-style // comments
# into /* */ comments, after altering any existing */'s to avoid trouble.
peel=' /\\$/N
/\\\n[ ]*/s///g
/\/\//s;\*/;* /;g
/\/\//s;//\(.*\);/*\1 */;'

# Start from a known state rather than whatever the environment exports.
ifndef=

# Walk the live argument list ($1) instead of a snapshot of it, so that
# options are recognized wherever they appear before the first file and
# the argument of -i is never re-examined as an option.
while test $# -gt 0
do
	case "$1" in
	-o)	# old (pre-function-prototype) compiler
		# add code to comment out argument lists
		peel="$peel
"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1(/*\2*/);'
		shift
		;;
	-b)	# funny Berkeley __P macro
		peel="$peel
"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1 __P((\2));'
		shift
		;;
	-s)	# compiler doesn't like `static foo();'
		# add code to get rid of the `static'
		peel="$peel
"'/^static[ ][^\/]*[a-zA-Z0-9_)](.*)/s;static.;;'
		shift
		;;
	-p)	# private declarations
		egrep='^ ==([ ]|$)'
		shift
		;;
	-i)	# wrap in #ifndef, argument is name
		if test $# -lt 2
		then
			echo "$0: -i requires a name argument" >&2
			exit 2
		fi
		ifndef="$2"
		shift ; shift
		;;
	*)	break
		;;
	esac
done

if test " $ifndef" != " "
then
	echo "#ifndef $ifndef"
	echo "#define	$ifndef	/* never again */"
fi
echo "/* ========= begin header generated by $0 ========= */"
echo '#ifdef __cplusplus'
echo 'extern "C" {'
echo '#endif'
for f
do
	echo
	echo "/* === $f === */"
	# grep -E is the standard spelling of the deprecated egrep command;
	# "$f" is quoted so odd file names survive word splitting.
	grep -E "$egrep" "$f" | sed 's/^ ==*[ ]//;s/^ ==*$//' | sed "$peel"
	echo
done
echo '#ifdef __cplusplus'
echo '}'
echo '#endif'
echo "/* ========= end header generated by $0 ========= */"
if test " $ifndef" != " "
then
	echo "#endif"
fi
exit 0
|
208
src/regex/regc_cvec.c
Normal file
208
src/regex/regc_cvec.c
Normal file
@@ -0,0 +1,208 @@
|
||||
/*
|
||||
* Utility functions for handling cvecs
|
||||
* This file is #included by regcomp.c.
|
||||
*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
- newcvec - allocate a new cvec
|
||||
^ static struct cvec *newcvec(int, int, int);
|
||||
*/
|
||||
static struct cvec *
|
||||
newcvec(nchrs, nranges, nmcces)
|
||||
int nchrs; /* to hold this many chrs... */
|
||||
int nranges; /* ... and this many ranges... */
|
||||
int nmcces; /* ... and this many MCCEs */
|
||||
{
|
||||
size_t n;
|
||||
size_t nc;
|
||||
struct cvec *cv;
|
||||
|
||||
nc = (size_t)nchrs + (size_t)nmcces*(MAXMCCE+1) + (size_t)nranges*2;
|
||||
n = sizeof(struct cvec) + (size_t)(nmcces-1)*sizeof(chr *)
|
||||
+ nc*sizeof(chr);
|
||||
cv = (struct cvec *)MALLOC(n);
|
||||
if (cv == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
cv->chrspace = nchrs;
|
||||
cv->chrs = (chr *)&cv->mcces[nmcces]; /* chrs just after MCCE ptrs */
|
||||
cv->mccespace = nmcces;
|
||||
cv->ranges = cv->chrs + nchrs + nmcces*(MAXMCCE+1);
|
||||
cv->rangespace = nranges;
|
||||
return clearcvec(cv);
|
||||
}
|
||||
|
||||
/*
|
||||
- clearcvec - clear a possibly-new cvec
|
||||
* Returns pointer as convenience.
|
||||
^ static struct cvec *clearcvec(struct cvec *);
|
||||
*/
|
||||
static struct cvec *
|
||||
clearcvec(cv)
|
||||
struct cvec *cv; /* character vector */
|
||||
{
|
||||
int i;
|
||||
|
||||
assert(cv != NULL);
|
||||
cv->nchrs = 0;
|
||||
assert(cv->chrs == (chr *)&cv->mcces[cv->mccespace]);
|
||||
cv->nmcces = 0;
|
||||
cv->nmccechrs = 0;
|
||||
cv->nranges = 0;
|
||||
for (i = 0; i < cv->mccespace; i++) {
|
||||
cv->mcces[i] = NULL;
|
||||
}
|
||||
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- addchr - add a chr to a cvec
|
||||
^ static VOID addchr(struct cvec *, pchr);
|
||||
*/
|
||||
static VOID
|
||||
addchr(cv, c)
|
||||
struct cvec *cv; /* character vector */
|
||||
pchr c; /* character to add */
|
||||
{
|
||||
assert(cv->nchrs < cv->chrspace - cv->nmccechrs);
|
||||
cv->chrs[cv->nchrs++] = (chr)c;
|
||||
}
|
||||
|
||||
/*
|
||||
- addrange - add a range to a cvec
|
||||
^ static VOID addrange(struct cvec *, pchr, pchr);
|
||||
*/
|
||||
static VOID
|
||||
addrange(cv, from, to)
|
||||
struct cvec *cv; /* character vector */
|
||||
pchr from; /* first character of range */
|
||||
pchr to; /* last character of range */
|
||||
{
|
||||
assert(cv->nranges < cv->rangespace);
|
||||
cv->ranges[cv->nranges*2] = (chr)from;
|
||||
cv->ranges[cv->nranges*2 + 1] = (chr)to;
|
||||
cv->nranges++;
|
||||
}
|
||||
|
||||
/*
|
||||
- addmcce - add an MCCE to a cvec
|
||||
^ static VOID addmcce(struct cvec *, chr *, chr *);
|
||||
*/
|
||||
static VOID
|
||||
addmcce(cv, startp, endp)
|
||||
struct cvec *cv; /* character vector */
|
||||
chr *startp; /* beginning of text */
|
||||
chr *endp; /* just past end of text */
|
||||
{
|
||||
int len;
|
||||
int i;
|
||||
chr *s;
|
||||
chr *d;
|
||||
|
||||
if (startp == NULL && endp == NULL) {
|
||||
return;
|
||||
}
|
||||
len = endp - startp;
|
||||
assert(len > 0);
|
||||
assert(cv->nchrs + len < cv->chrspace - cv->nmccechrs);
|
||||
assert(cv->nmcces < cv->mccespace);
|
||||
d = &cv->chrs[cv->chrspace - cv->nmccechrs - len - 1];
|
||||
cv->mcces[cv->nmcces++] = d;
|
||||
for (s = startp, i = len; i > 0; s++, i--) {
|
||||
*d++ = *s;
|
||||
}
|
||||
*d++ = 0; /* endmarker */
|
||||
assert(d == &cv->chrs[cv->chrspace - cv->nmccechrs]);
|
||||
cv->nmccechrs += len + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
- haschr - does a cvec contain this chr?
|
||||
^ static int haschr(struct cvec *, pchr);
|
||||
*/
|
||||
static int /* predicate */
|
||||
haschr(cv, c)
|
||||
struct cvec *cv; /* character vector */
|
||||
pchr c; /* character to test for */
|
||||
{
|
||||
int i;
|
||||
chr *p;
|
||||
|
||||
for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
|
||||
if (*p == c) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
|
||||
if ((*p <= c) && (c <= *(p+1))) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
- getcvec - get a cvec, remembering it as v->cv
|
||||
^ static struct cvec *getcvec(struct vars *, int, int, int);
|
||||
*/
|
||||
static struct cvec *
|
||||
getcvec(v, nchrs, nranges, nmcces)
|
||||
struct vars *v; /* context */
|
||||
int nchrs; /* to hold this many chrs... */
|
||||
int nranges; /* ... and this many ranges... */
|
||||
int nmcces; /* ... and this many MCCEs */
|
||||
{
|
||||
if (v->cv != NULL && nchrs <= v->cv->chrspace &&
|
||||
nranges <= v->cv->rangespace && nmcces <= v->cv->mccespace) {
|
||||
return clearcvec(v->cv);
|
||||
}
|
||||
|
||||
if (v->cv != NULL) {
|
||||
freecvec(v->cv);
|
||||
}
|
||||
v->cv = newcvec(nchrs, nranges, nmcces);
|
||||
if (v->cv == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
}
|
||||
|
||||
return v->cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- freecvec - free a cvec
|
||||
^ static VOID freecvec(struct cvec *);
|
||||
*/
|
||||
static VOID
|
||||
freecvec(cv)
|
||||
struct cvec *cv; /* character vector */
|
||||
{
|
||||
FREE(cv);
|
||||
}
|
989
src/regex/regc_locale.c
Normal file
989
src/regex/regc_locale.c
Normal file
@@ -0,0 +1,989 @@
|
||||
/*
|
||||
* regc_locale.c --
|
||||
*
|
||||
* This file contains the Unicode locale specific regexp routines.
|
||||
* This file is #included by regcomp.c.
|
||||
*
|
||||
* Copyright (c) 1998 by Scriptics Corporation.
|
||||
*
|
||||
* See the file "license.terms" for information on usage and redistribution
|
||||
* of this file, and for a DISCLAIMER OF ALL WARRANTIES.
|
||||
*
|
||||
* RCS: @(#) $Id$
|
||||
*/
|
||||
|
||||
/* ASCII character-name table */
|
||||
|
||||
static struct cname {
|
||||
char *name;
|
||||
char code;
|
||||
} cnames[] = {
|
||||
{"NUL", '\0'},
|
||||
{"SOH", '\001'},
|
||||
{"STX", '\002'},
|
||||
{"ETX", '\003'},
|
||||
{"EOT", '\004'},
|
||||
{"ENQ", '\005'},
|
||||
{"ACK", '\006'},
|
||||
{"BEL", '\007'},
|
||||
{"alert", '\007'},
|
||||
{"BS", '\010'},
|
||||
{"backspace", '\b'},
|
||||
{"HT", '\011'},
|
||||
{"tab", '\t'},
|
||||
{"LF", '\012'},
|
||||
{"newline", '\n'},
|
||||
{"VT", '\013'},
|
||||
{"vertical-tab", '\v'},
|
||||
{"FF", '\014'},
|
||||
{"form-feed", '\f'},
|
||||
{"CR", '\015'},
|
||||
{"carriage-return", '\r'},
|
||||
{"SO", '\016'},
|
||||
{"SI", '\017'},
|
||||
{"DLE", '\020'},
|
||||
{"DC1", '\021'},
|
||||
{"DC2", '\022'},
|
||||
{"DC3", '\023'},
|
||||
{"DC4", '\024'},
|
||||
{"NAK", '\025'},
|
||||
{"SYN", '\026'},
|
||||
{"ETB", '\027'},
|
||||
{"CAN", '\030'},
|
||||
{"EM", '\031'},
|
||||
{"SUB", '\032'},
|
||||
{"ESC", '\033'},
|
||||
{"IS4", '\034'},
|
||||
{"FS", '\034'},
|
||||
{"IS3", '\035'},
|
||||
{"GS", '\035'},
|
||||
{"IS2", '\036'},
|
||||
{"RS", '\036'},
|
||||
{"IS1", '\037'},
|
||||
{"US", '\037'},
|
||||
{"space", ' '},
|
||||
{"exclamation-mark",'!'},
|
||||
{"quotation-mark", '"'},
|
||||
{"number-sign", '#'},
|
||||
{"dollar-sign", '$'},
|
||||
{"percent-sign", '%'},
|
||||
{"ampersand", '&'},
|
||||
{"apostrophe", '\''},
|
||||
{"left-parenthesis",'('},
|
||||
{"right-parenthesis", ')'},
|
||||
{"asterisk", '*'},
|
||||
{"plus-sign", '+'},
|
||||
{"comma", ','},
|
||||
{"hyphen", '-'},
|
||||
{"hyphen-minus", '-'},
|
||||
{"period", '.'},
|
||||
{"full-stop", '.'},
|
||||
{"slash", '/'},
|
||||
{"solidus", '/'},
|
||||
{"zero", '0'},
|
||||
{"one", '1'},
|
||||
{"two", '2'},
|
||||
{"three", '3'},
|
||||
{"four", '4'},
|
||||
{"five", '5'},
|
||||
{"six", '6'},
|
||||
{"seven", '7'},
|
||||
{"eight", '8'},
|
||||
{"nine", '9'},
|
||||
{"colon", ':'},
|
||||
{"semicolon", ';'},
|
||||
{"less-than-sign", '<'},
|
||||
{"equals-sign", '='},
|
||||
{"greater-than-sign", '>'},
|
||||
{"question-mark", '?'},
|
||||
{"commercial-at", '@'},
|
||||
{"left-square-bracket", '['},
|
||||
{"backslash", '\\'},
|
||||
{"reverse-solidus", '\\'},
|
||||
{"right-square-bracket", ']'},
|
||||
{"circumflex", '^'},
|
||||
{"circumflex-accent", '^'},
|
||||
{"underscore", '_'},
|
||||
{"low-line", '_'},
|
||||
{"grave-accent", '`'},
|
||||
{"left-brace", '{'},
|
||||
{"left-curly-bracket", '{'},
|
||||
{"vertical-line", '|'},
|
||||
{"right-brace", '}'},
|
||||
{"right-curly-bracket", '}'},
|
||||
{"tilde", '~'},
|
||||
{"DEL", '\177'},
|
||||
{NULL, 0}
|
||||
};
|
||||
|
||||
/* Unicode character-class tables */
|
||||
|
||||
typedef struct crange {
|
||||
chr start;
|
||||
chr end;
|
||||
} crange;
|
||||
|
||||
/*
|
||||
* Declarations of Unicode character ranges. This code
|
||||
* is automatically generated by the tools/uniClass.tcl script
|
||||
* and used in generic/regc_locale.c. Do not modify by hand.
|
||||
*/
|
||||
|
||||
/* Unicode: alphabetic characters */
|
||||
|
||||
static crange alphaRangeTable[] = {
|
||||
{0x0041, 0x005a}, {0x0061, 0x007a}, {0x00c0, 0x00d6}, {0x00d8, 0x00f6},
|
||||
{0x00f8, 0x021f}, {0x0222, 0x0233}, {0x0250, 0x02ad}, {0x02b0, 0x02b8},
|
||||
{0x02bb, 0x02c1}, {0x02e0, 0x02e4}, {0x0388, 0x038a}, {0x038e, 0x03a1},
|
||||
{0x03a3, 0x03ce}, {0x03d0, 0x03d7}, {0x03da, 0x03f5}, {0x0400, 0x0481},
|
||||
{0x048c, 0x04c4}, {0x04d0, 0x04f5}, {0x0531, 0x0556}, {0x0561, 0x0587},
|
||||
{0x05d0, 0x05ea}, {0x05f0, 0x05f2}, {0x0621, 0x063a}, {0x0640, 0x064a},
|
||||
{0x0671, 0x06d3}, {0x06fa, 0x06fc}, {0x0712, 0x072c}, {0x0780, 0x07a5},
|
||||
{0x0905, 0x0939}, {0x0958, 0x0961}, {0x0985, 0x098c}, {0x0993, 0x09a8},
|
||||
{0x09aa, 0x09b0}, {0x09b6, 0x09b9}, {0x09df, 0x09e1}, {0x0a05, 0x0a0a},
|
||||
{0x0a13, 0x0a28}, {0x0a2a, 0x0a30}, {0x0a59, 0x0a5c}, {0x0a72, 0x0a74},
|
||||
{0x0a85, 0x0a8b}, {0x0a8f, 0x0a91}, {0x0a93, 0x0aa8}, {0x0aaa, 0x0ab0},
|
||||
{0x0ab5, 0x0ab9}, {0x0b05, 0x0b0c}, {0x0b13, 0x0b28}, {0x0b2a, 0x0b30},
|
||||
{0x0b36, 0x0b39}, {0x0b5f, 0x0b61}, {0x0b85, 0x0b8a}, {0x0b8e, 0x0b90},
|
||||
{0x0b92, 0x0b95}, {0x0ba8, 0x0baa}, {0x0bae, 0x0bb5}, {0x0bb7, 0x0bb9},
|
||||
{0x0c05, 0x0c0c}, {0x0c0e, 0x0c10}, {0x0c12, 0x0c28}, {0x0c2a, 0x0c33},
|
||||
{0x0c35, 0x0c39}, {0x0c85, 0x0c8c}, {0x0c8e, 0x0c90}, {0x0c92, 0x0ca8},
|
||||
{0x0caa, 0x0cb3}, {0x0cb5, 0x0cb9}, {0x0d05, 0x0d0c}, {0x0d0e, 0x0d10},
|
||||
{0x0d12, 0x0d28}, {0x0d2a, 0x0d39}, {0x0d85, 0x0d96}, {0x0d9a, 0x0db1},
|
||||
{0x0db3, 0x0dbb}, {0x0dc0, 0x0dc6}, {0x0e01, 0x0e30}, {0x0e40, 0x0e46},
|
||||
{0x0e94, 0x0e97}, {0x0e99, 0x0e9f}, {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb0},
|
||||
{0x0ec0, 0x0ec4}, {0x0f40, 0x0f47}, {0x0f49, 0x0f6a}, {0x0f88, 0x0f8b},
|
||||
{0x1000, 0x1021}, {0x1023, 0x1027}, {0x1050, 0x1055}, {0x10a0, 0x10c5},
|
||||
{0x10d0, 0x10f6}, {0x1100, 0x1159}, {0x115f, 0x11a2}, {0x11a8, 0x11f9},
|
||||
{0x1200, 0x1206}, {0x1208, 0x1246}, {0x124a, 0x124d}, {0x1250, 0x1256},
|
||||
{0x125a, 0x125d}, {0x1260, 0x1286}, {0x128a, 0x128d}, {0x1290, 0x12ae},
|
||||
{0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5}, {0x12c8, 0x12ce},
|
||||
{0x12d0, 0x12d6}, {0x12d8, 0x12ee}, {0x12f0, 0x130e}, {0x1312, 0x1315},
|
||||
{0x1318, 0x131e}, {0x1320, 0x1346}, {0x1348, 0x135a}, {0x13a0, 0x13f4},
|
||||
{0x1401, 0x166c}, {0x166f, 0x1676}, {0x1681, 0x169a}, {0x16a0, 0x16ea},
|
||||
{0x1780, 0x17b3}, {0x1820, 0x1877}, {0x1880, 0x18a8}, {0x1e00, 0x1e9b},
|
||||
{0x1ea0, 0x1ef9}, {0x1f00, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45},
|
||||
{0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4},
|
||||
{0x1fb6, 0x1fbc}, {0x1fc2, 0x1fc4}, {0x1fc6, 0x1fcc}, {0x1fd0, 0x1fd3},
|
||||
{0x1fd6, 0x1fdb}, {0x1fe0, 0x1fec}, {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffc},
|
||||
{0x210a, 0x2113}, {0x2119, 0x211d}, {0x212a, 0x212d}, {0x212f, 0x2131},
|
||||
{0x2133, 0x2139}, {0x3031, 0x3035}, {0x3041, 0x3094}, {0x30a1, 0x30fa},
|
||||
{0x30fc, 0x30fe}, {0x3105, 0x312c}, {0x3131, 0x318e}, {0x31a0, 0x31b7},
|
||||
{0x3400, 0x4db5}, {0x4e00, 0x9fa5}, {0xa000, 0xa48c}, {0xac00, 0xd7a3},
|
||||
{0xf900, 0xfa2d}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1f, 0xfb28},
|
||||
{0xfb2a, 0xfb36}, {0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfd3d},
|
||||
{0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb}, {0xfe70, 0xfe72},
|
||||
{0xfe76, 0xfefc}, {0xff21, 0xff3a}, {0xff41, 0xff5a}, {0xff66, 0xffbe},
|
||||
{0xffc2, 0xffc7}, {0xffca, 0xffcf}, {0xffd2, 0xffd7}, {0xffda, 0xffdc}
|
||||
};
|
||||
|
||||
#define NUM_ALPHA_RANGE (sizeof(alphaRangeTable)/sizeof(crange))
|
||||
|
||||
static chr alphaCharTable[] = {
|
||||
0x00aa, 0x00b5, 0x00ba, 0x02d0, 0x02d1, 0x02ee, 0x037a, 0x0386, 0x038c,
|
||||
0x04c7, 0x04c8, 0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0559, 0x06d5, 0x06e5,
|
||||
0x06e6, 0x0710, 0x093d, 0x0950, 0x098f, 0x0990, 0x09b2, 0x09dc, 0x09dd,
|
||||
0x09f0, 0x09f1, 0x0a0f, 0x0a10, 0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38,
|
||||
0x0a39, 0x0a5e, 0x0a8d, 0x0ab2, 0x0ab3, 0x0abd, 0x0ad0, 0x0ae0, 0x0b0f,
|
||||
0x0b10, 0x0b32, 0x0b33, 0x0b3d, 0x0b5c, 0x0b5d, 0x0b99, 0x0b9a, 0x0b9c,
|
||||
0x0b9e, 0x0b9f, 0x0ba3, 0x0ba4, 0x0c60, 0x0c61, 0x0cde, 0x0ce0, 0x0ce1,
|
||||
0x0d60, 0x0d61, 0x0dbd, 0x0e32, 0x0e33, 0x0e81, 0x0e82, 0x0e84, 0x0e87,
|
||||
0x0e88, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0eaa, 0x0eab, 0x0eb2, 0x0eb3,
|
||||
0x0ebd, 0x0ec6, 0x0edc, 0x0edd, 0x0f00, 0x1029, 0x102a, 0x1248, 0x1258,
|
||||
0x1288, 0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, 0x207f,
|
||||
0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x3005, 0x3006, 0x309d,
|
||||
0x309e, 0xfb1d, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74, 0xfffe
|
||||
};
|
||||
|
||||
#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))
|
||||
|
||||
/* Unicode: decimal digit characters */
|
||||
|
||||
static crange digitRangeTable[] = {
|
||||
{0x0030, 0x0039}, {0x0660, 0x0669}, {0x06f0, 0x06f9}, {0x0966, 0x096f},
|
||||
{0x09e6, 0x09ef}, {0x0a66, 0x0a6f}, {0x0ae6, 0x0aef}, {0x0b66, 0x0b6f},
|
||||
{0x0be7, 0x0bef}, {0x0c66, 0x0c6f}, {0x0ce6, 0x0cef}, {0x0d66, 0x0d6f},
|
||||
{0x0e50, 0x0e59}, {0x0ed0, 0x0ed9}, {0x0f20, 0x0f29}, {0x1040, 0x1049},
|
||||
{0x1369, 0x1371}, {0x17e0, 0x17e9}, {0x1810, 0x1819}, {0xff10, 0xff19}
|
||||
};
|
||||
|
||||
#define NUM_DIGIT_RANGE (sizeof(digitRangeTable)/sizeof(crange))
|
||||
|
||||
/* no singletons of digit characters */
|
||||
|
||||
/* Unicode: punctuation characters */
|
||||
|
||||
static crange punctRangeTable[] = {
|
||||
{0x0021, 0x0023}, {0x0025, 0x002a}, {0x002c, 0x002f}, {0x005b, 0x005d},
|
||||
{0x055a, 0x055f}, {0x066a, 0x066d}, {0x0700, 0x070d}, {0x0f04, 0x0f12},
|
||||
{0x0f3a, 0x0f3d}, {0x104a, 0x104f}, {0x1361, 0x1368}, {0x16eb, 0x16ed},
|
||||
{0x17d4, 0x17da}, {0x1800, 0x180a}, {0x2010, 0x2027}, {0x2030, 0x2043},
|
||||
{0x2048, 0x204d}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301f},
|
||||
{0xfe30, 0xfe44}, {0xfe49, 0xfe52}, {0xfe54, 0xfe61}, {0xff01, 0xff03},
|
||||
{0xff05, 0xff0a}, {0xff0c, 0xff0f}, {0xff3b, 0xff3d}, {0xff61, 0xff65}
|
||||
};
|
||||
|
||||
#define NUM_PUNCT_RANGE (sizeof(punctRangeTable)/sizeof(crange))
|
||||
|
||||
static chr punctCharTable[] = {
|
||||
0x003a, 0x003b, 0x003f, 0x0040, 0x005f, 0x007b, 0x007d, 0x00a1, 0x00ab,
|
||||
0x00ad, 0x00b7, 0x00bb, 0x00bf, 0x037e, 0x0387, 0x0589, 0x058a, 0x05be,
|
||||
0x05c0, 0x05c3, 0x05f3, 0x05f4, 0x060c, 0x061b, 0x061f, 0x06d4, 0x0964,
|
||||
0x0965, 0x0970, 0x0df4, 0x0e4f, 0x0e5a, 0x0e5b, 0x0f85, 0x10fb, 0x166d,
|
||||
0x166e, 0x169b, 0x169c, 0x17dc, 0x2045, 0x2046, 0x207d, 0x207e, 0x208d,
|
||||
0x208e, 0x2329, 0x232a, 0x3030, 0x30fb, 0xfd3e, 0xfd3f, 0xfe63, 0xfe68,
|
||||
0xfe6a, 0xfe6b, 0xff1a, 0xff1b, 0xff1f, 0xff20, 0xff3f, 0xff5b, 0xff5d
|
||||
};
|
||||
|
||||
#define NUM_PUNCT_CHAR (sizeof(punctCharTable)/sizeof(chr))
|
||||
|
||||
/* Unicode: white space characters */
|
||||
|
||||
static crange spaceRangeTable[] = {
|
||||
{0x0009, 0x000d}, {0x2000, 0x200b}
|
||||
};
|
||||
|
||||
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))
|
||||
|
||||
static chr spaceCharTable[] = {
|
||||
0x0020, 0x00a0, 0x1680, 0x2028, 0x2029, 0x202f, 0x3000
|
||||
};
|
||||
|
||||
#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))
|
||||
|
||||
/* Unicode: lowercase characters */
|
||||
|
||||
static crange lowerRangeTable[] = {
|
||||
{0x0061, 0x007a}, {0x00df, 0x00f6}, {0x00f8, 0x00ff}, {0x017e, 0x0180},
|
||||
{0x0199, 0x019b}, {0x01bd, 0x01bf}, {0x0250, 0x02ad}, {0x03ac, 0x03ce},
|
||||
{0x03d5, 0x03d7}, {0x03ef, 0x03f3}, {0x0430, 0x045f}, {0x0561, 0x0587},
|
||||
{0x1e95, 0x1e9b}, {0x1f00, 0x1f07}, {0x1f10, 0x1f15}, {0x1f20, 0x1f27},
|
||||
{0x1f30, 0x1f37}, {0x1f40, 0x1f45}, {0x1f50, 0x1f57}, {0x1f60, 0x1f67},
|
||||
{0x1f70, 0x1f7d}, {0x1f80, 0x1f87}, {0x1f90, 0x1f97}, {0x1fa0, 0x1fa7},
|
||||
{0x1fb0, 0x1fb4}, {0x1fc2, 0x1fc4}, {0x1fd0, 0x1fd3}, {0x1fe0, 0x1fe7},
|
||||
{0x1ff2, 0x1ff4}, {0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xff41, 0xff5a}
|
||||
};
|
||||
|
||||
#define NUM_LOWER_RANGE (sizeof(lowerRangeTable)/sizeof(crange))
|
||||
|
||||
static chr lowerCharTable[] = {
|
||||
0x00aa, 0x00b5, 0x00ba, 0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010b,
|
||||
0x010d, 0x010f, 0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011b, 0x011d,
|
||||
0x011f, 0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012b, 0x012d, 0x012f,
|
||||
0x0131, 0x0133, 0x0135, 0x0137, 0x0138, 0x013a, 0x013c, 0x013e, 0x0140,
|
||||
0x0142, 0x0144, 0x0146, 0x0148, 0x0149, 0x014b, 0x014d, 0x014f, 0x0151,
|
||||
0x0153, 0x0155, 0x0157, 0x0159, 0x015b, 0x015d, 0x015f, 0x0161, 0x0163,
|
||||
0x0165, 0x0167, 0x0169, 0x016b, 0x016d, 0x016f, 0x0171, 0x0173, 0x0175,
|
||||
0x0177, 0x017a, 0x017c, 0x0183, 0x0185, 0x0188, 0x018c, 0x018d, 0x0192,
|
||||
0x0195, 0x019e, 0x01a1, 0x01a3, 0x01a5, 0x01a8, 0x01aa, 0x01ab, 0x01ad,
|
||||
0x01b0, 0x01b4, 0x01b6, 0x01b9, 0x01ba, 0x01c6, 0x01c9, 0x01cc, 0x01ce,
|
||||
0x01d0, 0x01d2, 0x01d4, 0x01d6, 0x01d8, 0x01da, 0x01dc, 0x01dd, 0x01df,
|
||||
0x01e1, 0x01e3, 0x01e5, 0x01e7, 0x01e9, 0x01eb, 0x01ed, 0x01ef, 0x01f0,
|
||||
0x01f3, 0x01f5, 0x01f9, 0x01fb, 0x01fd, 0x01ff, 0x0201, 0x0203, 0x0205,
|
||||
0x0207, 0x0209, 0x020b, 0x020d, 0x020f, 0x0211, 0x0213, 0x0215, 0x0217,
|
||||
0x0219, 0x021b, 0x021d, 0x021f, 0x0223, 0x0225, 0x0227, 0x0229, 0x022b,
|
||||
0x022d, 0x022f, 0x0231, 0x0233, 0x0390, 0x03d0, 0x03d1, 0x03db, 0x03dd,
|
||||
0x03df, 0x03e1, 0x03e3, 0x03e5, 0x03e7, 0x03e9, 0x03eb, 0x03ed, 0x03f5,
|
||||
0x0461, 0x0463, 0x0465, 0x0467, 0x0469, 0x046b, 0x046d, 0x046f, 0x0471,
|
||||
0x0473, 0x0475, 0x0477, 0x0479, 0x047b, 0x047d, 0x047f, 0x0481, 0x048d,
|
||||
0x048f, 0x0491, 0x0493, 0x0495, 0x0497, 0x0499, 0x049b, 0x049d, 0x049f,
|
||||
0x04a1, 0x04a3, 0x04a5, 0x04a7, 0x04a9, 0x04ab, 0x04ad, 0x04af, 0x04b1,
|
||||
0x04b3, 0x04b5, 0x04b7, 0x04b9, 0x04bb, 0x04bd, 0x04bf, 0x04c2, 0x04c4,
|
||||
0x04c8, 0x04cc, 0x04d1, 0x04d3, 0x04d5, 0x04d7, 0x04d9, 0x04db, 0x04dd,
|
||||
0x04df, 0x04e1, 0x04e3, 0x04e5, 0x04e7, 0x04e9, 0x04eb, 0x04ed, 0x04ef,
|
||||
0x04f1, 0x04f3, 0x04f5, 0x04f9, 0x1e01, 0x1e03, 0x1e05, 0x1e07, 0x1e09,
|
||||
0x1e0b, 0x1e0d, 0x1e0f, 0x1e11, 0x1e13, 0x1e15, 0x1e17, 0x1e19, 0x1e1b,
|
||||
0x1e1d, 0x1e1f, 0x1e21, 0x1e23, 0x1e25, 0x1e27, 0x1e29, 0x1e2b, 0x1e2d,
|
||||
0x1e2f, 0x1e31, 0x1e33, 0x1e35, 0x1e37, 0x1e39, 0x1e3b, 0x1e3d, 0x1e3f,
|
||||
0x1e41, 0x1e43, 0x1e45, 0x1e47, 0x1e49, 0x1e4b, 0x1e4d, 0x1e4f, 0x1e51,
|
||||
0x1e53, 0x1e55, 0x1e57, 0x1e59, 0x1e5b, 0x1e5d, 0x1e5f, 0x1e61, 0x1e63,
|
||||
0x1e65, 0x1e67, 0x1e69, 0x1e6b, 0x1e6d, 0x1e6f, 0x1e71, 0x1e73, 0x1e75,
|
||||
0x1e77, 0x1e79, 0x1e7b, 0x1e7d, 0x1e7f, 0x1e81, 0x1e83, 0x1e85, 0x1e87,
|
||||
0x1e89, 0x1e8b, 0x1e8d, 0x1e8f, 0x1e91, 0x1e93, 0x1ea1, 0x1ea3, 0x1ea5,
|
||||
0x1ea7, 0x1ea9, 0x1eab, 0x1ead, 0x1eaf, 0x1eb1, 0x1eb3, 0x1eb5, 0x1eb7,
|
||||
0x1eb9, 0x1ebb, 0x1ebd, 0x1ebf, 0x1ec1, 0x1ec3, 0x1ec5, 0x1ec7, 0x1ec9,
|
||||
0x1ecb, 0x1ecd, 0x1ecf, 0x1ed1, 0x1ed3, 0x1ed5, 0x1ed7, 0x1ed9, 0x1edb,
|
||||
0x1edd, 0x1edf, 0x1ee1, 0x1ee3, 0x1ee5, 0x1ee7, 0x1ee9, 0x1eeb, 0x1eed,
|
||||
0x1eef, 0x1ef1, 0x1ef3, 0x1ef5, 0x1ef7, 0x1ef9, 0x1fb6, 0x1fb7, 0x1fbe,
|
||||
0x1fc6, 0x1fc7, 0x1fd6, 0x1fd7, 0x1ff6, 0x1ff7, 0x207f, 0x210a, 0x210e,
|
||||
0x210f, 0x2113, 0x212f, 0x2134, 0x2139
|
||||
};
|
||||
|
||||
#define NUM_LOWER_CHAR (sizeof(lowerCharTable)/sizeof(chr))
|
||||
|
||||
/* Unicode: uppercase characters */
|
||||
|
||||
static crange upperRangeTable[] = {
|
||||
{0x0041, 0x005a}, {0x00c0, 0x00d6}, {0x00d8, 0x00de}, {0x0189, 0x018b},
|
||||
{0x018e, 0x0191}, {0x0196, 0x0198}, {0x01b1, 0x01b3}, {0x01f6, 0x01f8},
|
||||
{0x0388, 0x038a}, {0x0391, 0x03a1}, {0x03a3, 0x03ab}, {0x03d2, 0x03d4},
|
||||
{0x0400, 0x042f}, {0x0531, 0x0556}, {0x10a0, 0x10c5}, {0x1f08, 0x1f0f},
|
||||
{0x1f18, 0x1f1d}, {0x1f28, 0x1f2f}, {0x1f38, 0x1f3f}, {0x1f48, 0x1f4d},
|
||||
{0x1f68, 0x1f6f}, {0x1fb8, 0x1fbb}, {0x1fc8, 0x1fcb}, {0x1fd8, 0x1fdb},
|
||||
{0x1fe8, 0x1fec}, {0x1ff8, 0x1ffb}, {0x210b, 0x210d}, {0x2110, 0x2112},
|
||||
{0x2119, 0x211d}, {0x212a, 0x212d}, {0xff21, 0xff3a}
|
||||
};
|
||||
|
||||
#define NUM_UPPER_RANGE (sizeof(upperRangeTable)/sizeof(crange))
|
||||
|
||||
static chr upperCharTable[] = {
|
||||
0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010a, 0x010c, 0x010e, 0x0110,
|
||||
0x0112, 0x0114, 0x0116, 0x0118, 0x011a, 0x011c, 0x011e, 0x0120, 0x0122,
|
||||
0x0124, 0x0126, 0x0128, 0x012a, 0x012c, 0x012e, 0x0130, 0x0132, 0x0134,
|
||||
0x0136, 0x0139, 0x013b, 0x013d, 0x013f, 0x0141, 0x0143, 0x0145, 0x0147,
|
||||
0x014a, 0x014c, 0x014e, 0x0150, 0x0152, 0x0154, 0x0156, 0x0158, 0x015a,
|
||||
0x015c, 0x015e, 0x0160, 0x0162, 0x0164, 0x0166, 0x0168, 0x016a, 0x016c,
|
||||
0x016e, 0x0170, 0x0172, 0x0174, 0x0176, 0x0178, 0x0179, 0x017b, 0x017d,
|
||||
0x0181, 0x0182, 0x0184, 0x0186, 0x0187, 0x0193, 0x0194, 0x019c, 0x019d,
|
||||
0x019f, 0x01a0, 0x01a2, 0x01a4, 0x01a6, 0x01a7, 0x01a9, 0x01ac, 0x01ae,
|
||||
0x01af, 0x01b5, 0x01b7, 0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01cd,
|
||||
0x01cf, 0x01d1, 0x01d3, 0x01d5, 0x01d7, 0x01d9, 0x01db, 0x01de, 0x01e0,
|
||||
0x01e2, 0x01e4, 0x01e6, 0x01e8, 0x01ea, 0x01ec, 0x01ee, 0x01f1, 0x01f4,
|
||||
0x01fa, 0x01fc, 0x01fe, 0x0200, 0x0202, 0x0204, 0x0206, 0x0208, 0x020a,
|
||||
0x020c, 0x020e, 0x0210, 0x0212, 0x0214, 0x0216, 0x0218, 0x021a, 0x021c,
|
||||
0x021e, 0x0222, 0x0224, 0x0226, 0x0228, 0x022a, 0x022c, 0x022e, 0x0230,
|
||||
0x0232, 0x0386, 0x038c, 0x038e, 0x038f, 0x03da, 0x03dc, 0x03de, 0x03e0,
|
||||
0x03e2, 0x03e4, 0x03e6, 0x03e8, 0x03ea, 0x03ec, 0x03ee, 0x03f4, 0x0460,
|
||||
0x0462, 0x0464, 0x0466, 0x0468, 0x046a, 0x046c, 0x046e, 0x0470, 0x0472,
|
||||
0x0474, 0x0476, 0x0478, 0x047a, 0x047c, 0x047e, 0x0480, 0x048c, 0x048e,
|
||||
0x0490, 0x0492, 0x0494, 0x0496, 0x0498, 0x049a, 0x049c, 0x049e, 0x04a0,
|
||||
0x04a2, 0x04a4, 0x04a6, 0x04a8, 0x04aa, 0x04ac, 0x04ae, 0x04b0, 0x04b2,
|
||||
0x04b4, 0x04b6, 0x04b8, 0x04ba, 0x04bc, 0x04be, 0x04c0, 0x04c1, 0x04c3,
|
||||
0x04c7, 0x04cb, 0x04d0, 0x04d2, 0x04d4, 0x04d6, 0x04d8, 0x04da, 0x04dc,
|
||||
0x04de, 0x04e0, 0x04e2, 0x04e4, 0x04e6, 0x04e8, 0x04ea, 0x04ec, 0x04ee,
|
||||
0x04f0, 0x04f2, 0x04f4, 0x04f8, 0x1e00, 0x1e02, 0x1e04, 0x1e06, 0x1e08,
|
||||
0x1e0a, 0x1e0c, 0x1e0e, 0x1e10, 0x1e12, 0x1e14, 0x1e16, 0x1e18, 0x1e1a,
|
||||
0x1e1c, 0x1e1e, 0x1e20, 0x1e22, 0x1e24, 0x1e26, 0x1e28, 0x1e2a, 0x1e2c,
|
||||
0x1e2e, 0x1e30, 0x1e32, 0x1e34, 0x1e36, 0x1e38, 0x1e3a, 0x1e3c, 0x1e3e,
|
||||
0x1e40, 0x1e42, 0x1e44, 0x1e46, 0x1e48, 0x1e4a, 0x1e4c, 0x1e4e, 0x1e50,
|
||||
0x1e52, 0x1e54, 0x1e56, 0x1e58, 0x1e5a, 0x1e5c, 0x1e5e, 0x1e60, 0x1e62,
|
||||
0x1e64, 0x1e66, 0x1e68, 0x1e6a, 0x1e6c, 0x1e6e, 0x1e70, 0x1e72, 0x1e74,
|
||||
0x1e76, 0x1e78, 0x1e7a, 0x1e7c, 0x1e7e, 0x1e80, 0x1e82, 0x1e84, 0x1e86,
|
||||
0x1e88, 0x1e8a, 0x1e8c, 0x1e8e, 0x1e90, 0x1e92, 0x1e94, 0x1ea0, 0x1ea2,
|
||||
0x1ea4, 0x1ea6, 0x1ea8, 0x1eaa, 0x1eac, 0x1eae, 0x1eb0, 0x1eb2, 0x1eb4,
|
||||
0x1eb6, 0x1eb8, 0x1eba, 0x1ebc, 0x1ebe, 0x1ec0, 0x1ec2, 0x1ec4, 0x1ec6,
|
||||
0x1ec8, 0x1eca, 0x1ecc, 0x1ece, 0x1ed0, 0x1ed2, 0x1ed4, 0x1ed6, 0x1ed8,
|
||||
0x1eda, 0x1edc, 0x1ede, 0x1ee0, 0x1ee2, 0x1ee4, 0x1ee6, 0x1ee8, 0x1eea,
|
||||
0x1eec, 0x1eee, 0x1ef0, 0x1ef2, 0x1ef4, 0x1ef6, 0x1ef8, 0x1f59, 0x1f5b,
|
||||
0x1f5d, 0x1f5f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x2130,
|
||||
0x2131, 0x2133
|
||||
};
|
||||
|
||||
#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))
|
||||
|
||||
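/*
 * Unicode: printable (graphic) characters, excluding space.
 * NOTE(review): every code point whose low byte is 0x20 (e.g. 0x0120,
 * 0x0420) is absent from both graph tables below; this looks like a
 * table-generator artifact -- confirm against the Unicode character data.
 */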
|
||||
|
||||
static crange graphRangeTable[] = {
|
||||
{0x0021, 0x007e}, {0x00a0, 0x011f}, {0x0121, 0x021f}, {0x0222, 0x0233},
|
||||
{0x0250, 0x02ad}, {0x02b0, 0x02ee}, {0x0300, 0x031f}, {0x0321, 0x034e},
|
||||
{0x0360, 0x0362}, {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x03ce},
|
||||
{0x03d0, 0x03d7}, {0x03da, 0x03f5}, {0x0400, 0x041f}, {0x0421, 0x0486},
|
||||
{0x048c, 0x04c4}, {0x04d0, 0x04f5}, {0x0531, 0x0556}, {0x0559, 0x055f},
|
||||
{0x0561, 0x0587}, {0x0591, 0x05a1}, {0x05a3, 0x05b9}, {0x05bb, 0x05c4},
|
||||
{0x05d0, 0x05ea}, {0x05f0, 0x05f4}, {0x0621, 0x063a}, {0x0640, 0x0655},
|
||||
{0x0660, 0x066d}, {0x0670, 0x06ed}, {0x06f0, 0x06fe}, {0x0700, 0x070d},
|
||||
{0x0710, 0x071f}, {0x0721, 0x072c}, {0x0730, 0x074a}, {0x0780, 0x07b0},
|
||||
{0x0901, 0x0903}, {0x0905, 0x091f}, {0x0921, 0x0939}, {0x093c, 0x094d},
|
||||
{0x0950, 0x0954}, {0x0958, 0x0970}, {0x0981, 0x0983}, {0x0985, 0x098c},
|
||||
{0x0993, 0x09a8}, {0x09aa, 0x09b0}, {0x09b6, 0x09b9}, {0x09be, 0x09c4},
|
||||
{0x09cb, 0x09cd}, {0x09df, 0x09e3}, {0x09e6, 0x09fa}, {0x0a05, 0x0a0a},
|
||||
{0x0a13, 0x0a1f}, {0x0a21, 0x0a28}, {0x0a2a, 0x0a30}, {0x0a3e, 0x0a42},
|
||||
{0x0a4b, 0x0a4d}, {0x0a59, 0x0a5c}, {0x0a66, 0x0a74}, {0x0a81, 0x0a83},
|
||||
{0x0a85, 0x0a8b}, {0x0a8f, 0x0a91}, {0x0a93, 0x0aa8}, {0x0aaa, 0x0ab0},
|
||||
{0x0ab5, 0x0ab9}, {0x0abc, 0x0ac5}, {0x0ac7, 0x0ac9}, {0x0acb, 0x0acd},
|
||||
{0x0ae6, 0x0aef}, {0x0b01, 0x0b03}, {0x0b05, 0x0b0c}, {0x0b13, 0x0b1f},
|
||||
{0x0b21, 0x0b28}, {0x0b2a, 0x0b30}, {0x0b36, 0x0b39}, {0x0b3c, 0x0b43},
|
||||
{0x0b4b, 0x0b4d}, {0x0b5f, 0x0b61}, {0x0b66, 0x0b70}, {0x0b85, 0x0b8a},
|
||||
{0x0b8e, 0x0b90}, {0x0b92, 0x0b95}, {0x0ba8, 0x0baa}, {0x0bae, 0x0bb5},
|
||||
{0x0bb7, 0x0bb9}, {0x0bbe, 0x0bc2}, {0x0bc6, 0x0bc8}, {0x0bca, 0x0bcd},
|
||||
{0x0be7, 0x0bf2}, {0x0c01, 0x0c03}, {0x0c05, 0x0c0c}, {0x0c0e, 0x0c10},
|
||||
{0x0c12, 0x0c1f}, {0x0c21, 0x0c28}, {0x0c2a, 0x0c33}, {0x0c35, 0x0c39},
|
||||
{0x0c3e, 0x0c44}, {0x0c46, 0x0c48}, {0x0c4a, 0x0c4d}, {0x0c66, 0x0c6f},
|
||||
{0x0c85, 0x0c8c}, {0x0c8e, 0x0c90}, {0x0c92, 0x0ca8}, {0x0caa, 0x0cb3},
|
||||
{0x0cb5, 0x0cb9}, {0x0cbe, 0x0cc4}, {0x0cc6, 0x0cc8}, {0x0cca, 0x0ccd},
|
||||
{0x0ce6, 0x0cef}, {0x0d05, 0x0d0c}, {0x0d0e, 0x0d10}, {0x0d12, 0x0d1f},
|
||||
{0x0d21, 0x0d28}, {0x0d2a, 0x0d39}, {0x0d3e, 0x0d43}, {0x0d46, 0x0d48},
|
||||
{0x0d4a, 0x0d4d}, {0x0d66, 0x0d6f}, {0x0d85, 0x0d96}, {0x0d9a, 0x0db1},
|
||||
{0x0db3, 0x0dbb}, {0x0dc0, 0x0dc6}, {0x0dcf, 0x0dd4}, {0x0dd8, 0x0ddf},
|
||||
{0x0df2, 0x0df4}, {0x0e01, 0x0e1f}, {0x0e21, 0x0e3a}, {0x0e3f, 0x0e5b},
|
||||
{0x0e94, 0x0e97}, {0x0e99, 0x0e9f}, {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb9},
|
||||
{0x0ebb, 0x0ebd}, {0x0ec0, 0x0ec4}, {0x0ec8, 0x0ecd}, {0x0ed0, 0x0ed9},
|
||||
{0x0f00, 0x0f1f}, {0x0f21, 0x0f47}, {0x0f49, 0x0f6a}, {0x0f71, 0x0f8b},
|
||||
{0x0f90, 0x0f97}, {0x0f99, 0x0fbc}, {0x0fbe, 0x0fcc}, {0x1000, 0x101f},
|
||||
{0x1023, 0x1027}, {0x102c, 0x1032}, {0x1036, 0x1039}, {0x1040, 0x1059},
|
||||
{0x10a0, 0x10c5}, {0x10d0, 0x10f6}, {0x1100, 0x111f}, {0x1121, 0x1159},
|
||||
{0x115f, 0x11a2}, {0x11a8, 0x11f9}, {0x1200, 0x1206}, {0x1208, 0x121f},
|
||||
{0x1221, 0x1246}, {0x124a, 0x124d}, {0x1250, 0x1256}, {0x125a, 0x125d},
|
||||
{0x1260, 0x1286}, {0x128a, 0x128d}, {0x1290, 0x12ae}, {0x12b2, 0x12b5},
|
||||
{0x12b8, 0x12be}, {0x12c2, 0x12c5}, {0x12c8, 0x12ce}, {0x12d0, 0x12d6},
|
||||
{0x12d8, 0x12ee}, {0x12f0, 0x130e}, {0x1312, 0x1315}, {0x1318, 0x131e},
|
||||
{0x1321, 0x1346}, {0x1348, 0x135a}, {0x1361, 0x137c}, {0x13a0, 0x13f4},
|
||||
{0x1401, 0x141f}, {0x1421, 0x151f}, {0x1521, 0x161f}, {0x1621, 0x1676},
|
||||
{0x1680, 0x169c}, {0x16a0, 0x16f0}, {0x1780, 0x17dc}, {0x17e0, 0x17e9},
|
||||
{0x1800, 0x180a}, {0x1810, 0x1819}, {0x1821, 0x1877}, {0x1880, 0x18a9},
|
||||
{0x1e00, 0x1e1f}, {0x1e21, 0x1e9b}, {0x1ea0, 0x1ef9}, {0x1f00, 0x1f15},
|
||||
{0x1f18, 0x1f1d}, {0x1f21, 0x1f45}, {0x1f48, 0x1f4d}, {0x1f50, 0x1f57},
|
||||
{0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3},
|
||||
{0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe},
|
||||
{0x2000, 0x200b}, {0x2010, 0x201f}, {0x2021, 0x2029}, {0x202f, 0x2046},
|
||||
{0x2048, 0x204d}, {0x2074, 0x208e}, {0x20a0, 0x20af}, {0x20d0, 0x20e3},
|
||||
{0x2100, 0x211f}, {0x2121, 0x213a}, {0x2153, 0x2183}, {0x2190, 0x21f3},
|
||||
{0x2200, 0x221f}, {0x2221, 0x22f1}, {0x2300, 0x231f}, {0x2321, 0x237b},
|
||||
{0x237d, 0x239a}, {0x2400, 0x241f}, {0x2421, 0x2426}, {0x2440, 0x244a},
|
||||
{0x2460, 0x24ea}, {0x2500, 0x251f}, {0x2521, 0x2595}, {0x25a0, 0x25f7},
|
||||
{0x2600, 0x2613}, {0x2619, 0x261f}, {0x2621, 0x2671}, {0x2701, 0x2704},
|
||||
{0x2706, 0x2709}, {0x270c, 0x271f}, {0x2721, 0x2727}, {0x2729, 0x274b},
|
||||
{0x274f, 0x2752}, {0x2758, 0x275e}, {0x2761, 0x2767}, {0x2776, 0x2794},
|
||||
{0x2798, 0x27af}, {0x27b1, 0x27be}, {0x2800, 0x281f}, {0x2821, 0x28ff},
|
||||
{0x2e80, 0x2e99}, {0x2e9b, 0x2ef3}, {0x2f00, 0x2f1f}, {0x2f21, 0x2fd5},
|
||||
{0x2ff0, 0x2ffb}, {0x3000, 0x301f}, {0x3021, 0x303a}, {0x3041, 0x3094},
|
||||
{0x3099, 0x309e}, {0x30a1, 0x30fe}, {0x3105, 0x311f}, {0x3121, 0x312c},
|
||||
{0x3131, 0x318e}, {0x3190, 0x31b7}, {0x3200, 0x321c}, {0x3221, 0x3243},
|
||||
{0x3260, 0x327b}, {0x327f, 0x32b0}, {0x32c0, 0x32cb}, {0x32d0, 0x32fe},
|
||||
{0x3300, 0x331f}, {0x3321, 0x3376}, {0x337b, 0x33dd}, {0x33e0, 0x33fe},
|
||||
{0x3400, 0x341f}, {0x3421, 0x351f}, {0x3521, 0x361f}, {0x3621, 0x371f},
|
||||
{0x3721, 0x381f}, {0x3821, 0x391f}, {0x3921, 0x3a1f}, {0x3a21, 0x3b1f},
|
||||
{0x3b21, 0x3c1f}, {0x3c21, 0x3d1f}, {0x3d21, 0x3e1f}, {0x3e21, 0x3f1f},
|
||||
{0x3f21, 0x401f}, {0x4021, 0x411f}, {0x4121, 0x421f}, {0x4221, 0x431f},
|
||||
{0x4321, 0x441f}, {0x4421, 0x451f}, {0x4521, 0x461f}, {0x4621, 0x471f},
|
||||
{0x4721, 0x481f}, {0x4821, 0x491f}, {0x4921, 0x4a1f}, {0x4a21, 0x4b1f},
|
||||
{0x4b21, 0x4c1f}, {0x4c21, 0x4d1f}, {0x4d21, 0x4db5}, {0x4e00, 0x4e1f},
|
||||
{0x4e21, 0x4f1f}, {0x4f21, 0x501f}, {0x5021, 0x511f}, {0x5121, 0x521f},
|
||||
{0x5221, 0x531f}, {0x5321, 0x541f}, {0x5421, 0x551f}, {0x5521, 0x561f},
|
||||
{0x5621, 0x571f}, {0x5721, 0x581f}, {0x5821, 0x591f}, {0x5921, 0x5a1f},
|
||||
{0x5a21, 0x5b1f}, {0x5b21, 0x5c1f}, {0x5c21, 0x5d1f}, {0x5d21, 0x5e1f},
|
||||
{0x5e21, 0x5f1f}, {0x5f21, 0x601f}, {0x6021, 0x611f}, {0x6121, 0x621f},
|
||||
{0x6221, 0x631f}, {0x6321, 0x641f}, {0x6421, 0x651f}, {0x6521, 0x661f},
|
||||
{0x6621, 0x671f}, {0x6721, 0x681f}, {0x6821, 0x691f}, {0x6921, 0x6a1f},
|
||||
{0x6a21, 0x6b1f}, {0x6b21, 0x6c1f}, {0x6c21, 0x6d1f}, {0x6d21, 0x6e1f},
|
||||
{0x6e21, 0x6f1f}, {0x6f21, 0x701f}, {0x7021, 0x711f}, {0x7121, 0x721f},
|
||||
{0x7221, 0x731f}, {0x7321, 0x741f}, {0x7421, 0x751f}, {0x7521, 0x761f},
|
||||
{0x7621, 0x771f}, {0x7721, 0x781f}, {0x7821, 0x791f}, {0x7921, 0x7a1f},
|
||||
{0x7a21, 0x7b1f}, {0x7b21, 0x7c1f}, {0x7c21, 0x7d1f}, {0x7d21, 0x7e1f},
|
||||
{0x7e21, 0x7f1f}, {0x7f21, 0x801f}, {0x8021, 0x811f}, {0x8121, 0x821f},
|
||||
{0x8221, 0x831f}, {0x8321, 0x841f}, {0x8421, 0x851f}, {0x8521, 0x861f},
|
||||
{0x8621, 0x871f}, {0x8721, 0x881f}, {0x8821, 0x891f}, {0x8921, 0x8a1f},
|
||||
{0x8a21, 0x8b1f}, {0x8b21, 0x8c1f}, {0x8c21, 0x8d1f}, {0x8d21, 0x8e1f},
|
||||
{0x8e21, 0x8f1f}, {0x8f21, 0x901f}, {0x9021, 0x911f}, {0x9121, 0x921f},
|
||||
{0x9221, 0x931f}, {0x9321, 0x941f}, {0x9421, 0x951f}, {0x9521, 0x961f},
|
||||
{0x9621, 0x971f}, {0x9721, 0x981f}, {0x9821, 0x991f}, {0x9921, 0x9a1f},
|
||||
{0x9a21, 0x9b1f}, {0x9b21, 0x9c1f}, {0x9c21, 0x9d1f}, {0x9d21, 0x9e1f},
|
||||
{0x9e21, 0x9f1f}, {0x9f21, 0x9fa5}, {0xa000, 0xa01f}, {0xa021, 0xa11f},
|
||||
{0xa121, 0xa21f}, {0xa221, 0xa31f}, {0xa321, 0xa41f}, {0xa421, 0xa48c},
|
||||
{0xa490, 0xa4a1}, {0xa4a4, 0xa4b3}, {0xa4b5, 0xa4c0}, {0xa4c2, 0xa4c4},
|
||||
{0xac00, 0xac1f}, {0xac21, 0xad1f}, {0xad21, 0xae1f}, {0xae21, 0xaf1f},
|
||||
{0xaf21, 0xb01f}, {0xb021, 0xb11f}, {0xb121, 0xb21f}, {0xb221, 0xb31f},
|
||||
{0xb321, 0xb41f}, {0xb421, 0xb51f}, {0xb521, 0xb61f}, {0xb621, 0xb71f},
|
||||
{0xb721, 0xb81f}, {0xb821, 0xb91f}, {0xb921, 0xba1f}, {0xba21, 0xbb1f},
|
||||
{0xbb21, 0xbc1f}, {0xbc21, 0xbd1f}, {0xbd21, 0xbe1f}, {0xbe21, 0xbf1f},
|
||||
{0xbf21, 0xc01f}, {0xc021, 0xc11f}, {0xc121, 0xc21f}, {0xc221, 0xc31f},
|
||||
{0xc321, 0xc41f}, {0xc421, 0xc51f}, {0xc521, 0xc61f}, {0xc621, 0xc71f},
|
||||
{0xc721, 0xc81f}, {0xc821, 0xc91f}, {0xc921, 0xca1f}, {0xca21, 0xcb1f},
|
||||
{0xcb21, 0xcc1f}, {0xcc21, 0xcd1f}, {0xcd21, 0xce1f}, {0xce21, 0xcf1f},
|
||||
{0xcf21, 0xd01f}, {0xd021, 0xd11f}, {0xd121, 0xd21f}, {0xd221, 0xd31f},
|
||||
{0xd321, 0xd41f}, {0xd421, 0xd51f}, {0xd521, 0xd61f}, {0xd621, 0xd71f},
|
||||
{0xd721, 0xd7a3}, {0xf900, 0xf91f}, {0xf921, 0xfa1f}, {0xfa21, 0xfa2d},
|
||||
{0xfb00, 0xfb06}, {0xfb13, 0xfb17}, {0xfb1d, 0xfb1f}, {0xfb21, 0xfb36},
|
||||
{0xfb38, 0xfb3c}, {0xfb46, 0xfbb1}, {0xfbd3, 0xfc1f}, {0xfc21, 0xfd1f},
|
||||
{0xfd21, 0xfd3f}, {0xfd50, 0xfd8f}, {0xfd92, 0xfdc7}, {0xfdf0, 0xfdfb},
|
||||
{0xfe21, 0xfe23}, {0xfe30, 0xfe44}, {0xfe49, 0xfe52}, {0xfe54, 0xfe66},
|
||||
{0xfe68, 0xfe6b}, {0xfe70, 0xfe72}, {0xfe76, 0xfefc}, {0xff01, 0xff1f},
|
||||
{0xff21, 0xff5e}, {0xff61, 0xffbe}, {0xffc2, 0xffc7}, {0xffca, 0xffcf},
|
||||
{0xffd2, 0xffd7}, {0xffda, 0xffdc}, {0xffe0, 0xffe6}, {0xffe8, 0xffee},
|
||||
{0xfffc, 0xffff}
|
||||
};
|
||||
|
||||
#define NUM_GRAPH_RANGE (sizeof(graphRangeTable)/sizeof(crange))
|
||||
|
||||
static chr graphCharTable[] = {
|
||||
0x0374, 0x0375, 0x037a, 0x037e, 0x038c, 0x0488, 0x0489, 0x04c7, 0x04c8,
|
||||
0x04cb, 0x04cc, 0x04f8, 0x04f9, 0x0589, 0x058a, 0x060c, 0x061b, 0x061f,
|
||||
0x098f, 0x0990, 0x09b2, 0x09bc, 0x09c7, 0x09c8, 0x09d7, 0x09dc, 0x09dd,
|
||||
0x0a02, 0x0a0f, 0x0a10, 0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38, 0x0a39,
|
||||
0x0a3c, 0x0a47, 0x0a48, 0x0a5e, 0x0a8d, 0x0ab2, 0x0ab3, 0x0ad0, 0x0ae0,
|
||||
0x0b0f, 0x0b10, 0x0b32, 0x0b33, 0x0b47, 0x0b48, 0x0b56, 0x0b57, 0x0b5c,
|
||||
0x0b5d, 0x0b82, 0x0b83, 0x0b99, 0x0b9a, 0x0b9c, 0x0b9e, 0x0b9f, 0x0ba3,
|
||||
0x0ba4, 0x0bd7, 0x0c55, 0x0c56, 0x0c60, 0x0c61, 0x0c82, 0x0c83, 0x0cd5,
|
||||
0x0cd6, 0x0cde, 0x0ce0, 0x0ce1, 0x0d02, 0x0d03, 0x0d57, 0x0d60, 0x0d61,
|
||||
0x0d82, 0x0d83, 0x0dbd, 0x0dca, 0x0dd6, 0x0e81, 0x0e82, 0x0e84, 0x0e87,
|
||||
0x0e88, 0x0e8a, 0x0e8d, 0x0ea5, 0x0ea7, 0x0eaa, 0x0eab, 0x0ec6, 0x0edc,
|
||||
0x0edd, 0x0fcf, 0x1021, 0x1029, 0x102a, 0x10fb, 0x1248, 0x1258, 0x1288,
|
||||
0x12b0, 0x12c0, 0x1310, 0x1f59, 0x1f5b, 0x1f5d, 0x2070, 0x274d, 0x2756,
|
||||
0x303e, 0x303f, 0xa4c6, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfe74
|
||||
};
|
||||
|
||||
#define NUM_GRAPH_CHAR (sizeof(graphCharTable)/sizeof(chr))
|
||||
|
||||
/*
|
||||
* End of auto-generated Unicode character ranges declarations.
|
||||
*/
|
||||
|
||||
#define CH NOCELT
|
||||
|
||||
/*
 - nmcces - how many distinct MCCEs are there?
 * This implementation defines no multi-character collating elements,
 * so the answer is always zero and the context is not consulted.
 ^ static int nmcces(struct vars *);
 */
static int
nmcces(v)
    struct vars *v;             /* context (not examined here) */
{
    int mcceCount = 0;          /* no MCCEs are defined at the moment */

    return mcceCount;
}
|
||||
|
||||
/*
 - nleaders - how many chrs can be first chrs of MCCEs?
 * Always zero here; the context is not consulted.
 ^ static int nleaders(struct vars *);
 */
static int
nleaders(v)
    struct vars *v;             /* context (not examined here) */
{
    int leaderCount = 0;        /* nothing can lead an MCCE */

    return leaderCount;
}
|
||||
|
||||
/*
 - allmcces - return a cvec with all the MCCEs of the locale
 * Nothing is added to the vector: the function simply hands back
 * whatever clearcvec() returns for the caller-supplied cvec.
 ^ static struct cvec *allmcces(struct vars *, struct cvec *);
 */
static struct cvec *
allmcces(v, cv)
    struct vars *v;             /* context (not examined here) */
    struct cvec *cv;            /* this is supposed to have enough room */
{
    struct cvec *result;

    result = clearcvec(cv);
    return result;
}
|
||||
|
||||
/*
|
||||
- element - map collating-element name to celt
|
||||
^ static celt element(struct vars *, chr *, chr *);
|
||||
*/
|
||||
static celt
|
||||
element(v, startp, endp)
|
||||
struct vars *v; /* context */
|
||||
chr *startp; /* points to start of name */
|
||||
chr *endp; /* points just past end of name */
|
||||
{
|
||||
struct cname *cn;
|
||||
size_t len;
|
||||
Tcl_DString ds;
|
||||
CONST char *np;
|
||||
|
||||
/* generic: one-chr names stand for themselves */
|
||||
assert(startp < endp);
|
||||
len = endp - startp;
|
||||
if (len == 1) {
|
||||
return *startp;
|
||||
}
|
||||
|
||||
NOTE(REG_ULOCALE);
|
||||
|
||||
/* search table */
|
||||
Tcl_DStringInit(&ds);
|
||||
np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
|
||||
for (cn=cnames; cn->name!=NULL; cn++) {
|
||||
if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) {
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
}
|
||||
Tcl_DStringFree(&ds);
|
||||
if (cn->name != NULL) {
|
||||
return CHR(cn->code);
|
||||
}
|
||||
|
||||
/* couldn't find it */
|
||||
ERR(REG_ECOLLATE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
- range - supply cvec for a range, including legality check
|
||||
^ static struct cvec *range(struct vars *, celt, celt, int);
|
||||
*/
|
||||
static struct cvec *
|
||||
range(v, a, b, cases)
|
||||
struct vars *v; /* context */
|
||||
celt a; /* range start */
|
||||
celt b; /* range end, might equal a */
|
||||
int cases; /* case-independent? */
|
||||
{
|
||||
int nchrs;
|
||||
struct cvec *cv;
|
||||
celt c, lc, uc, tc;
|
||||
|
||||
if (a != b && !before(a, b)) {
|
||||
ERR(REG_ERANGE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!cases) { /* easy version */
|
||||
cv = getcvec(v, 0, 1, 0);
|
||||
NOERRN();
|
||||
addrange(cv, a, b);
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* When case-independent, it's hard to decide when cvec ranges are
|
||||
* usable, so for now at least, we won't try. We allocate enough
|
||||
* space for two case variants plus a little extra for the two
|
||||
* title case variants.
|
||||
*/
|
||||
|
||||
nchrs = (b - a + 1)*2 + 4;
|
||||
|
||||
cv = getcvec(v, nchrs, 0, 0);
|
||||
NOERRN();
|
||||
|
||||
for (c=a; c<=b; c++) {
|
||||
addchr(cv, c);
|
||||
lc = Tcl_UniCharToLower((chr)c);
|
||||
uc = Tcl_UniCharToUpper((chr)c);
|
||||
tc = Tcl_UniCharToTitle((chr)c);
|
||||
if (c != lc) {
|
||||
addchr(cv, lc);
|
||||
}
|
||||
if (c != uc) {
|
||||
addchr(cv, uc);
|
||||
}
|
||||
if (c != tc && tc != uc) {
|
||||
addchr(cv, tc);
|
||||
}
|
||||
}
|
||||
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- before - is celt x before celt y, for purposes of range legality?
|
||||
^ static int before(celt, celt);
|
||||
*/
|
||||
static int /* predicate */
|
||||
before(x, y)
|
||||
celt x, y; /* collating elements */
|
||||
{
|
||||
/* trivial because no MCCEs */
|
||||
if (x < y) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
- eclass - supply cvec for an equivalence class
|
||||
* Must include case counterparts on request.
|
||||
^ static struct cvec *eclass(struct vars *, celt, int);
|
||||
*/
|
||||
static struct cvec *
|
||||
eclass(v, c, cases)
|
||||
struct vars *v; /* context */
|
||||
celt c; /* Collating element representing
|
||||
* the equivalence class. */
|
||||
int cases; /* all cases? */
|
||||
{
|
||||
struct cvec *cv;
|
||||
|
||||
/* crude fake equivalence class for testing */
|
||||
if ((v->cflags®_FAKE) && c == 'x') {
|
||||
cv = getcvec(v, 4, 0, 0);
|
||||
addchr(cv, (chr)'x');
|
||||
addchr(cv, (chr)'y');
|
||||
if (cases) {
|
||||
addchr(cv, (chr)'X');
|
||||
addchr(cv, (chr)'Y');
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/* otherwise, none */
|
||||
if (cases) {
|
||||
return allcases(v, c);
|
||||
}
|
||||
cv = getcvec(v, 1, 0, 0);
|
||||
assert(cv != NULL);
|
||||
addchr(cv, (chr)c);
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- cclass - supply cvec for a character class
|
||||
* Must include case counterparts on request.
|
||||
^ static struct cvec *cclass(struct vars *, chr *, chr *, int);
|
||||
*/
|
||||
static struct cvec *
|
||||
cclass(v, startp, endp, cases)
|
||||
struct vars *v; /* context */
|
||||
chr *startp; /* where the name starts */
|
||||
chr *endp; /* just past the end of the name */
|
||||
int cases; /* case-independent? */
|
||||
{
|
||||
size_t len;
|
||||
struct cvec *cv = NULL;
|
||||
Tcl_DString ds;
|
||||
CONST char *np;
|
||||
char **namePtr;
|
||||
int i, index;
|
||||
|
||||
/*
|
||||
* The following arrays define the valid character class names.
|
||||
*/
|
||||
|
||||
static char *classNames[] = {
|
||||
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
|
||||
"lower", "print", "punct", "space", "upper", "xdigit", NULL
|
||||
};
|
||||
|
||||
enum classes {
|
||||
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
|
||||
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Extract the class name
|
||||
*/
|
||||
|
||||
len = endp - startp;
|
||||
Tcl_DStringInit(&ds);
|
||||
np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
|
||||
|
||||
/*
|
||||
* Remap lower and upper to alpha if the match is case insensitive.
|
||||
*/
|
||||
|
||||
if (cases && len == 5 && (strncmp("lower", np, 5) == 0
|
||||
|| strncmp("upper", np, 5) == 0)) {
|
||||
np = "alpha";
|
||||
}
|
||||
|
||||
/*
|
||||
* Map the name to the corresponding enumerated value.
|
||||
*/
|
||||
|
||||
index = -1;
|
||||
for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
|
||||
if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) {
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Tcl_DStringInit(&ds);
|
||||
if (index == -1) {
|
||||
ERR(REG_ECTYPE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now compute the character class contents.
|
||||
*/
|
||||
|
||||
switch((enum classes) index) {
|
||||
case CC_PRINT:
|
||||
case CC_ALNUM:
|
||||
cv = getcvec(v, NUM_ALPHA_CHAR, NUM_DIGIT_RANGE + NUM_ALPHA_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_ALPHA_CHAR ; i++) {
|
||||
addchr(cv, alphaCharTable[i]);
|
||||
}
|
||||
for (i=0 ; i<NUM_ALPHA_RANGE ; i++) {
|
||||
addrange(cv, alphaRangeTable[i].start,
|
||||
alphaRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_DIGIT_RANGE ; i++) {
|
||||
addrange(cv, digitRangeTable[i].start,
|
||||
digitRangeTable[i].end);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_ALPHA:
|
||||
cv = getcvec(v, NUM_ALPHA_CHAR, NUM_ALPHA_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_ALPHA_RANGE ; i++) {
|
||||
addrange(cv, alphaRangeTable[i].start,
|
||||
alphaRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_ALPHA_CHAR ; i++) {
|
||||
addchr(cv, alphaCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_ASCII:
|
||||
cv = getcvec(v, 0, 1, 0);
|
||||
if (cv) {
|
||||
addrange(cv, 0, 0x7f);
|
||||
}
|
||||
break;
|
||||
case CC_BLANK:
|
||||
cv = getcvec(v, 2, 0, 0);
|
||||
addchr(cv, '\t');
|
||||
addchr(cv, ' ');
|
||||
break;
|
||||
case CC_CNTRL:
|
||||
cv = getcvec(v, 0, 2, 0);
|
||||
addrange(cv, 0x0, 0x1f);
|
||||
addrange(cv, 0x7f, 0x9f);
|
||||
break;
|
||||
case CC_DIGIT:
|
||||
cv = getcvec(v, 0, NUM_DIGIT_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_DIGIT_RANGE ; i++) {
|
||||
addrange(cv, digitRangeTable[i].start,
|
||||
digitRangeTable[i].end);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_PUNCT:
|
||||
cv = getcvec(v, NUM_PUNCT_CHAR, NUM_PUNCT_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_PUNCT_RANGE ; i++) {
|
||||
addrange(cv, punctRangeTable[i].start,
|
||||
punctRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_PUNCT_CHAR ; i++) {
|
||||
addchr(cv, punctCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_XDIGIT:
|
||||
/*
|
||||
* This is a 3 instead of (NUM_DIGIT_RANGE+2) because I've no
|
||||
* idea how to define the digits 'a' through 'f' in
|
||||
* non-western locales. The concept is quite possibly non
|
||||
* portable, or only used in contextx where the characters
|
||||
* used would be the western ones anyway! Whatever is
|
||||
* actually the case, the number of ranges is fixed (until
|
||||
* someone comes up with a better arrangement!)
|
||||
*/
|
||||
cv = getcvec(v, 0, 3, 0);
|
||||
if (cv) {
|
||||
addrange(cv, '0', '9');
|
||||
addrange(cv, 'a', 'f');
|
||||
addrange(cv, 'A', 'F');
|
||||
}
|
||||
break;
|
||||
case CC_SPACE:
|
||||
cv = getcvec(v, NUM_SPACE_CHAR, NUM_SPACE_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_SPACE_RANGE ; i++) {
|
||||
addrange(cv, spaceRangeTable[i].start,
|
||||
spaceRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_SPACE_CHAR ; i++) {
|
||||
addchr(cv, spaceCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_LOWER:
|
||||
cv = getcvec(v, NUM_LOWER_CHAR, NUM_LOWER_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_LOWER_RANGE ; i++) {
|
||||
addrange(cv, lowerRangeTable[i].start,
|
||||
lowerRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_LOWER_CHAR ; i++) {
|
||||
addchr(cv, lowerCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_UPPER:
|
||||
cv = getcvec(v, NUM_UPPER_CHAR, NUM_UPPER_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_UPPER_RANGE ; i++) {
|
||||
addrange(cv, upperRangeTable[i].start,
|
||||
upperRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_UPPER_CHAR ; i++) {
|
||||
addchr(cv, upperCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_GRAPH:
|
||||
cv = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<NUM_GRAPH_RANGE ; i++) {
|
||||
addrange(cv, graphRangeTable[i].start,
|
||||
graphRangeTable[i].end);
|
||||
}
|
||||
for (i=0 ; i<NUM_GRAPH_CHAR ; i++) {
|
||||
addchr(cv, graphCharTable[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (cv == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- allcases - supply cvec for all case counterparts of a chr (including itself)
|
||||
* This is a shortcut, preferably an efficient one, for simple characters;
|
||||
* messy cases are done via range().
|
||||
^ static struct cvec *allcases(struct vars *, pchr);
|
||||
*/
|
||||
static struct cvec *
|
||||
allcases(v, pc)
|
||||
struct vars *v; /* context */
|
||||
pchr pc; /* character to get case equivs of */
|
||||
{
|
||||
struct cvec *cv;
|
||||
chr c = (chr)pc;
|
||||
chr lc, uc, tc;
|
||||
|
||||
lc = Tcl_UniCharToLower((chr)c);
|
||||
uc = Tcl_UniCharToUpper((chr)c);
|
||||
tc = Tcl_UniCharToTitle((chr)c);
|
||||
|
||||
if (tc != uc) {
|
||||
cv = getcvec(v, 3, 0, 0);
|
||||
addchr(cv, tc);
|
||||
} else {
|
||||
cv = getcvec(v, 2, 0, 0);
|
||||
}
|
||||
addchr(cv, lc);
|
||||
if (lc != uc) {
|
||||
addchr(cv, uc);
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
- cmp - chr-substring compare
|
||||
* Backrefs need this. It should preferably be efficient.
|
||||
* Note that it does not need to report anything except equal/unequal.
|
||||
* Note also that the length is exact, and the comparison should not
|
||||
* stop at embedded NULs!
|
||||
^ static int cmp(CONST chr *, CONST chr *, size_t);
|
||||
*/
|
||||
static int /* 0 for equal, nonzero for unequal */
|
||||
cmp(x, y, len)
|
||||
CONST chr *x, *y; /* strings to compare */
|
||||
size_t len; /* exact length of comparison */
|
||||
{
|
||||
return memcmp(VS(x), VS(y), len*sizeof(chr));
|
||||
}
|
||||
|
||||
/*
|
||||
- casecmp - case-independent chr-substring compare
|
||||
* REG_ICASE backrefs need this. It should preferably be efficient.
|
||||
* Note that it does not need to report anything except equal/unequal.
|
||||
* Note also that the length is exact, and the comparison should not
|
||||
* stop at embedded NULs!
|
||||
^ static int casecmp(CONST chr *, CONST chr *, size_t);
|
||||
*/
|
||||
static int /* 0 for equal, nonzero for unequal */
|
||||
casecmp(x, y, len)
|
||||
CONST chr *x, *y; /* strings to compare */
|
||||
size_t len; /* exact length of comparison */
|
||||
{
|
||||
for (; len > 0; len--, x++, y++) {
|
||||
if ((*x!=*y) && (Tcl_UniCharToLower(*x) != Tcl_UniCharToLower(*y))) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
3560
src/regex/regcomp.c
3560
src/regex/regcomp.c
File diff suppressed because it is too large
Load Diff
120
src/regex/regcustom.h
Normal file
120
src/regex/regcustom.h
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* headers if any */
|
||||
#include "tclInt.h"
|
||||
|
||||
/* overrides for regguts.h definitions, if any */
|
||||
#define FUNCPTR(name, args) (*name) _ANSI_ARGS_(args)
|
||||
#define MALLOC(n) ckalloc(n)
|
||||
#define FREE(p) ckfree(VS(p))
|
||||
#define REALLOC(p,n) ckrealloc(VS(p),n)
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Do not insert extras between the "begin" and "end" lines -- this
|
||||
* chunk is automatically extracted to be fitted into regex.h.
|
||||
*/
|
||||
/* --- begin --- */
|
||||
/* ensure certain things don't sneak in from system headers */
|
||||
#ifdef __REG_WIDE_T
|
||||
#undef __REG_WIDE_T
|
||||
#endif
|
||||
#ifdef __REG_WIDE_COMPILE
|
||||
#undef __REG_WIDE_COMPILE
|
||||
#endif
|
||||
#ifdef __REG_WIDE_EXEC
|
||||
#undef __REG_WIDE_EXEC
|
||||
#endif
|
||||
#ifdef __REG_REGOFF_T
|
||||
#undef __REG_REGOFF_T
|
||||
#endif
|
||||
#ifdef __REG_VOID_T
|
||||
#undef __REG_VOID_T
|
||||
#endif
|
||||
#ifdef __REG_CONST
|
||||
#undef __REG_CONST
|
||||
#endif
|
||||
#ifdef __REG_NOFRONT
|
||||
#undef __REG_NOFRONT
|
||||
#endif
|
||||
#ifdef __REG_NOCHAR
|
||||
#undef __REG_NOCHAR
|
||||
#endif
|
||||
/* interface types */
|
||||
#define __REG_WIDE_T Tcl_UniChar
|
||||
#define __REG_REGOFF_T long /* not really right, but good enough... */
|
||||
#define __REG_VOID_T VOID
|
||||
#define __REG_CONST CONST
|
||||
/* names and declarations */
|
||||
#define __REG_WIDE_COMPILE TclReComp
|
||||
#define __REG_WIDE_EXEC TclReExec
|
||||
#define __REG_NOFRONT /* don't want regcomp() and regexec() */
|
||||
#define __REG_NOCHAR /* or the char versions */
|
||||
#define regfree TclReFree
|
||||
#define regerror TclReError
|
||||
/* --- end --- */
|
||||
|
||||
|
||||
|
||||
/* internal character type and related */
|
||||
typedef Tcl_UniChar chr; /* the type itself */
|
||||
typedef int pchr; /* what it promotes to */
|
||||
typedef unsigned uchr; /* unsigned type that will hold a chr */
|
||||
typedef int celt; /* type to hold chr, MCCE number, or NOCELT */
|
||||
#define NOCELT (-1) /* celt value which is not valid chr or MCCE */
|
||||
#define CHR(c) (UCHAR(c)) /* turn char literal into chr literal */
|
||||
#define DIGITVAL(c) ((c)-'0') /* turn chr digit into its value */
|
||||
#if TCL_UTF_MAX > 3
|
||||
#define CHRBITS 32 /* bits in a chr; must not use sizeof */
|
||||
#define CHR_MIN 0x00000000 /* smallest and largest chr; the value */
|
||||
#define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */
|
||||
#else
|
||||
#define CHRBITS 16 /* bits in a chr; must not use sizeof */
|
||||
#define CHR_MIN 0x0000 /* smallest and largest chr; the value */
|
||||
#define CHR_MAX 0xffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */
|
||||
#endif
|
||||
|
||||
/* functions operating on chr */
|
||||
#define iscalnum(x) Tcl_UniCharIsAlnum(x)
|
||||
#define iscalpha(x) Tcl_UniCharIsAlpha(x)
|
||||
#define iscdigit(x) Tcl_UniCharIsDigit(x)
|
||||
#define iscspace(x) Tcl_UniCharIsSpace(x)
|
||||
|
||||
/* name the external functions */
|
||||
#define compile TclReComp
|
||||
#define exec TclReExec
|
||||
|
||||
/* enable/disable debugging code (by whether REG_DEBUG is defined or not) */
|
||||
#if 0 /* no debug unless requested by makefile */
|
||||
#define REG_DEBUG /* */
|
||||
#endif
|
||||
|
||||
/* and pick up the standard header */
|
||||
#include "regex.h"
|
@@ -1,509 +0,0 @@
|
||||
.TH REGEX 3 "25 Sept 1997"
|
||||
.BY "Henry Spencer"
|
||||
.de ZR
|
||||
.\" one other place knows this name: the SEE ALSO section
|
||||
.IR regex (7) \\$1
|
||||
..
|
||||
.SH NAME
|
||||
regcomp, regexec, regerror, regfree \- regular-expression library
|
||||
.SH SYNOPSIS
|
||||
.ft B
|
||||
.\".na
|
||||
#include <sys/types.h>
|
||||
.br
|
||||
#include <regex.h>
|
||||
.HP 10
|
||||
int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags);
|
||||
.HP
|
||||
int\ regexec(const\ regex_t\ *preg, const\ char\ *string,
|
||||
size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags);
|
||||
.HP
|
||||
size_t\ regerror(int\ errcode, const\ regex_t\ *preg,
|
||||
char\ *errbuf, size_t\ errbuf_size);
|
||||
.HP
|
||||
void\ regfree(regex_t\ *preg);
|
||||
.\".ad
|
||||
.ft
|
||||
.SH DESCRIPTION
|
||||
These routines implement POSIX 1003.2 regular expressions (``RE''s);
|
||||
see
|
||||
.ZR .
|
||||
.I Regcomp
|
||||
compiles an RE written as a string into an internal form,
|
||||
.I regexec
|
||||
matches that internal form against a string and reports results,
|
||||
.I regerror
|
||||
transforms error codes from either into human-readable messages,
|
||||
and
|
||||
.I regfree
|
||||
frees any dynamically-allocated storage used by the internal form
|
||||
of an RE.
|
||||
.PP
|
||||
The header
|
||||
.I <regex.h>
|
||||
declares two structure types,
|
||||
.I regex_t
|
||||
and
|
||||
.IR regmatch_t ,
|
||||
the former for compiled internal forms and the latter for match reporting.
|
||||
It also declares the four functions,
|
||||
a type
|
||||
.IR regoff_t ,
|
||||
and a number of constants with names starting with ``REG_''.
|
||||
.PP
|
||||
.I Regcomp
|
||||
compiles the regular expression contained in the
|
||||
.I pattern
|
||||
string,
|
||||
subject to the flags in
|
||||
.IR cflags ,
|
||||
and places the results in the
|
||||
.I regex_t
|
||||
structure pointed to by
|
||||
.IR preg .
|
||||
.I Cflags
|
||||
is the bitwise OR of zero or more of the following flags:
|
||||
.IP REG_EXTENDED \w'REG_EXTENDED'u+2n
|
||||
Compile modern (``extended'') REs,
|
||||
rather than the obsolete (``basic'') REs that
|
||||
are the default.
|
||||
.IP REG_BASIC
|
||||
This is a synonym for 0,
|
||||
provided as a counterpart to REG_EXTENDED to improve readability.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.IP REG_NOSPEC
|
||||
Compile with recognition of all special characters turned off.
|
||||
All characters are thus considered ordinary,
|
||||
so the ``RE'' is a literal string.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
REG_EXTENDED and REG_NOSPEC may not be used
|
||||
in the same call to
|
||||
.IR regcomp .
|
||||
.IP REG_ICASE
|
||||
Compile for matching that ignores upper/lower case distinctions.
|
||||
See
|
||||
.ZR .
|
||||
.IP REG_NOSUB
|
||||
Compile for matching that need only report success or failure,
|
||||
not what was matched.
|
||||
.IP REG_NEWLINE
|
||||
Compile for newline-sensitive matching.
|
||||
By default, newline is a completely ordinary character with no special
|
||||
meaning in either REs or strings.
|
||||
With this flag,
|
||||
`[^' bracket expressions and `.' never match newline,
|
||||
a `^' anchor matches the null string after any newline in the string
|
||||
in addition to its normal function,
|
||||
and the `$' anchor matches the null string before any newline in the
|
||||
string in addition to its normal function.
|
||||
.IP REG_PEND
|
||||
The regular expression ends,
|
||||
not at the first NUL,
|
||||
but just before the character pointed to by the
|
||||
.I re_endp
|
||||
member of the structure pointed to by
|
||||
.IR preg .
|
||||
The
|
||||
.I re_endp
|
||||
member is of type
|
||||
.IR const\ char\ * .
|
||||
This flag permits inclusion of NULs in the RE;
|
||||
they are considered ordinary characters.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.PP
|
||||
When successful,
|
||||
.I regcomp
|
||||
returns 0 and fills in the structure pointed to by
|
||||
.IR preg .
|
||||
One member of that structure
|
||||
(other than
|
||||
.IR re_endp )
|
||||
is publicized:
|
||||
.IR re_nsub ,
|
||||
of type
|
||||
.IR size_t ,
|
||||
contains the number of parenthesized subexpressions within the RE
|
||||
(except that the value of this member is undefined if the
|
||||
REG_NOSUB flag was used).
|
||||
If
|
||||
.I regcomp
|
||||
fails, it returns a non-zero error code;
|
||||
see DIAGNOSTICS.
|
||||
.PP
|
||||
.I Regexec
|
||||
matches the compiled RE pointed to by
|
||||
.I preg
|
||||
against the
|
||||
.IR string ,
|
||||
subject to the flags in
|
||||
.IR eflags ,
|
||||
and reports results using
|
||||
.IR nmatch ,
|
||||
.IR pmatch ,
|
||||
and the returned value.
|
||||
The RE must have been compiled by a previous invocation of
|
||||
.IR regcomp .
|
||||
The compiled form is not altered during execution of
|
||||
.IR regexec ,
|
||||
so a single compiled RE can be used simultaneously by multiple threads.
|
||||
.PP
|
||||
By default,
|
||||
the NUL-terminated string pointed to by
|
||||
.I string
|
||||
is considered to be the text of an entire line,
|
||||
with the NUL indicating the end of the line.
|
||||
(That is,
|
||||
any other end-of-line marker is considered to have been removed
|
||||
and replaced by the NUL.)
|
||||
The
|
||||
.I eflags
|
||||
argument is the bitwise OR of zero or more of the following flags:
|
||||
.IP REG_NOTBOL \w'REG_STARTEND'u+2n
|
||||
The first character of
|
||||
the string
|
||||
is not the beginning of a line, so the `^' anchor should not match before it.
|
||||
This does not affect the behavior of newlines under REG_NEWLINE.
|
||||
.IP REG_NOTEOL
|
||||
The NUL terminating
|
||||
the string
|
||||
does not end a line, so the `$' anchor should not match before it.
|
||||
This does not affect the behavior of newlines under REG_NEWLINE.
|
||||
.IP REG_STARTEND
|
||||
The string is considered to start at
|
||||
\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR
|
||||
and to have a terminating NUL located at
|
||||
\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR
|
||||
(there need not actually be a NUL at that location),
|
||||
regardless of the value of
|
||||
.IR nmatch .
|
||||
See below for the definition of
|
||||
.I pmatch
|
||||
and
|
||||
.IR nmatch .
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL;
|
||||
REG_STARTEND affects only the location of the string,
|
||||
not how it is matched.
|
||||
.PP
|
||||
See
|
||||
.ZR
|
||||
for a discussion of what is matched in situations where an RE or a
|
||||
portion thereof could match any of several substrings of
|
||||
.IR string .
|
||||
.PP
|
||||
Normally,
|
||||
.I regexec
|
||||
returns 0 for success and the non-zero code REG_NOMATCH for failure.
|
||||
Other non-zero error codes may be returned in exceptional situations;
|
||||
see DIAGNOSTICS.
|
||||
.PP
|
||||
If REG_NOSUB was specified in the compilation of the RE,
|
||||
or if
|
||||
.I nmatch
|
||||
is 0,
|
||||
.I regexec
|
||||
ignores the
|
||||
.I pmatch
|
||||
argument (but see below for the case where REG_STARTEND is specified).
|
||||
Otherwise,
|
||||
.I pmatch
|
||||
points to an array of
|
||||
.I nmatch
|
||||
structures of type
|
||||
.IR regmatch_t .
|
||||
Such a structure has at least the members
|
||||
.I rm_so
|
||||
and
|
||||
.IR rm_eo ,
|
||||
both of type
|
||||
.I regoff_t
|
||||
(a signed arithmetic type at least as large as an
|
||||
.I off_t
|
||||
and a
|
||||
.IR ssize_t ),
|
||||
containing respectively the offset of the first character of a substring
|
||||
and the offset of the first character after the end of the substring.
|
||||
Offsets are measured from the beginning of the
|
||||
.I string
|
||||
argument given to
|
||||
.IR regexec .
|
||||
An empty substring is denoted by equal offsets,
|
||||
both indicating the character following the empty substring.
|
||||
.PP
|
||||
The 0th member of the
|
||||
.I pmatch
|
||||
array is filled in to indicate what substring of
|
||||
.I string
|
||||
was matched by the entire RE.
|
||||
Remaining members report what substring was matched by parenthesized
|
||||
subexpressions within the RE;
|
||||
member
|
||||
.I i
|
||||
reports subexpression
|
||||
.IR i ,
|
||||
with subexpressions counted (starting at 1) by the order of their opening
|
||||
parentheses in the RE, left to right.
|
||||
Unused entries in the array\(emcorresponding either to subexpressions that
|
||||
did not participate in the match at all, or to subexpressions that do not
|
||||
exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both
|
||||
.I rm_so
|
||||
and
|
||||
.I rm_eo
|
||||
set to \-1.
|
||||
If a subexpression participated in the match several times,
|
||||
the reported substring is the last one it matched.
|
||||
(Note, as an example in particular, that when the RE `(b*)+' matches `bbb',
|
||||
the parenthesized subexpression matches the three `b's and then
|
||||
an infinite number of empty strings following the last `b',
|
||||
so the reported substring is one of the empties.)
|
||||
.PP
|
||||
If REG_STARTEND is specified,
|
||||
.I pmatch
|
||||
must point to at least one
|
||||
.I regmatch_t
|
||||
(even if
|
||||
.I nmatch
|
||||
is 0 or REG_NOSUB was specified),
|
||||
to hold the input offsets for REG_STARTEND.
|
||||
Use for output is still entirely controlled by
|
||||
.IR nmatch ;
|
||||
if
|
||||
.I nmatch
|
||||
is 0 or REG_NOSUB was specified,
|
||||
the value of
|
||||
.IR pmatch [0]
|
||||
will not be changed by a successful
|
||||
.IR regexec .
|
||||
.PP
|
||||
.I Regerror
|
||||
maps a non-zero
|
||||
.I errcode
|
||||
from either
|
||||
.I regcomp
|
||||
or
|
||||
.I regexec
|
||||
to a human-readable, printable message.
|
||||
If
|
||||
.I preg
|
||||
is non-NULL,
|
||||
the error code should have arisen from use of
|
||||
the
|
||||
.I regex_t
|
||||
pointed to by
|
||||
.IR preg ,
|
||||
and if the error code came from
|
||||
.IR regcomp ,
|
||||
it should have been the result from the most recent
|
||||
.I regcomp
|
||||
using that
|
||||
.IR regex_t .
|
||||
.RI ( Regerror
|
||||
may be able to supply a more detailed message using information
|
||||
from the
|
||||
.IR regex_t .)
|
||||
.I Regerror
|
||||
places the NUL-terminated message into the buffer pointed to by
|
||||
.IR errbuf ,
|
||||
limiting the length (including the NUL) to at most
|
||||
.I errbuf_size
|
||||
bytes.
|
||||
If the whole message won't fit,
|
||||
as much of it as will fit before the terminating NUL is supplied.
|
||||
In any case,
|
||||
the returned value is the size of buffer needed to hold the whole
|
||||
message (including terminating NUL).
|
||||
If
|
||||
.I errbuf_size
|
||||
is 0,
|
||||
.I errbuf
|
||||
is ignored but the return value is still correct.
|
||||
.PP
|
||||
If the
|
||||
.I errcode
|
||||
given to
|
||||
.I regerror
|
||||
is first ORed with REG_ITOA,
|
||||
the ``message'' that results is the printable name of the error code,
|
||||
e.g. ``REG_NOMATCH'',
|
||||
rather than an explanation thereof.
|
||||
If
|
||||
.I errcode
|
||||
is REG_ATOI,
|
||||
then
|
||||
.I preg
|
||||
shall be non-NULL and the
|
||||
.I re_endp
|
||||
member of the structure it points to
|
||||
must point to the printable name of an error code;
|
||||
in this case, the result in
|
||||
.I errbuf
|
||||
is the decimal digits of
|
||||
the numeric value of the error code
|
||||
(0 if the name is not recognized).
|
||||
REG_ITOA and REG_ATOI are intended primarily as debugging facilities;
|
||||
they are extensions,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
Be warned also that they are considered experimental and changes are possible.
|
||||
.PP
|
||||
.I Regfree
|
||||
frees any dynamically-allocated storage associated with the compiled RE
|
||||
pointed to by
|
||||
.IR preg .
|
||||
The remaining
|
||||
.I regex_t
|
||||
is no longer a valid compiled RE
|
||||
and the effect of supplying it to
|
||||
.I regexec
|
||||
or
|
||||
.I regerror
|
||||
is undefined.
|
||||
.PP
|
||||
None of these functions references global variables except for tables
|
||||
of constants;
|
||||
all are safe for use from multiple threads if the arguments are safe.
|
||||
.SH IMPLEMENTATION CHOICES
|
||||
There are a number of decisions that 1003.2 leaves up to the implementor,
|
||||
either by explicitly saying ``undefined'' or by virtue of them being
|
||||
forbidden by the RE grammar.
|
||||
This implementation treats them as follows.
|
||||
.PP
|
||||
See
|
||||
.ZR
|
||||
for a discussion of the definition of case-independent matching.
|
||||
.PP
|
||||
There is no particular limit on the length of REs,
|
||||
except insofar as memory is limited.
|
||||
Memory usage is approximately linear in RE size, and largely insensitive
|
||||
to RE complexity, except for bounded repetitions.
|
||||
See BUGS for one short RE using them
|
||||
that will run almost any system out of memory.
|
||||
.PP
|
||||
A backslashed character other than one specifically given a magic meaning
|
||||
by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs)
|
||||
is taken as an ordinary character.
|
||||
.PP
|
||||
Any unmatched [ is a REG_EBRACK error.
|
||||
.PP
|
||||
Equivalence classes cannot begin or end bracket-expression ranges.
|
||||
The endpoint of one range cannot begin another.
|
||||
.PP
|
||||
RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255.
|
||||
.PP
|
||||
A repetition operator (?, *, +, or bounds) cannot follow another
|
||||
repetition operator.
|
||||
A repetition operator cannot begin an expression or subexpression
|
||||
or follow `^' or `|'.
|
||||
.PP
|
||||
`|' cannot appear first or last in a (sub)expression or after another `|',
|
||||
i.e. an operand of `|' cannot be an empty subexpression.
|
||||
An empty parenthesized subexpression, `()', is legal and matches an
|
||||
empty (sub)string.
|
||||
An empty string is not a legal RE.
|
||||
.PP
|
||||
A `{' followed by a digit is considered the beginning of bounds for a
|
||||
bounded repetition, which must then follow the syntax for bounds.
|
||||
A `{' \fInot\fR followed by a digit is considered an ordinary character.
|
||||
.PP
|
||||
`^' and `$' beginning and ending subexpressions in obsolete (``basic'')
|
||||
REs are anchors, not ordinary characters.
|
||||
.SH SEE ALSO
|
||||
grep(1), regex(7)
|
||||
.PP
|
||||
POSIX 1003.2, sections 2.8 (Regular Expression Notation)
|
||||
and
|
||||
B.5 (C Binding for Regular Expression Matching).
|
||||
.SH DIAGNOSTICS
|
||||
Non-zero error codes from
|
||||
.I regcomp
|
||||
and
|
||||
.I regexec
|
||||
include the following:
|
||||
.PP
|
||||
.nf
|
||||
.ta \w'REG_ECOLLATE'u+3n
|
||||
REG_NOMATCH regexec() failed to match
|
||||
REG_BADPAT invalid regular expression
|
||||
REG_ECOLLATE invalid collating element
|
||||
REG_ECTYPE invalid character class
|
||||
REG_EESCAPE \e applied to unescapable character
|
||||
REG_ESUBREG invalid backreference number
|
||||
REG_EBRACK brackets [ ] not balanced
|
||||
REG_EPAREN parentheses ( ) not balanced
|
||||
REG_EBRACE braces { } not balanced
|
||||
REG_BADBR invalid repetition count(s) in { }
|
||||
REG_ERANGE invalid character range in [ ]
|
||||
REG_ESPACE ran out of memory
|
||||
REG_BADRPT ?, *, or + operand invalid
|
||||
REG_EMPTY empty (sub)expression
|
||||
REG_ASSERT ``can't happen''\(emyou found a bug
|
||||
REG_INVARG invalid argument, e.g. negative-length string
|
||||
.fi
|
||||
.SH HISTORY
|
||||
Written by Henry Spencer,
|
||||
henry@zoo.toronto.edu.
|
||||
.SH BUGS
|
||||
This is an alpha release with known defects.
|
||||
Please report problems.
|
||||
.PP
|
||||
There is one known functionality bug.
|
||||
The implementation of internationalization is incomplete:
|
||||
the locale is always assumed to be the default one of 1003.2,
|
||||
and only the collating elements etc. of that locale are available.
|
||||
.PP
|
||||
The back-reference code is subtle and doubts linger about its correctness
|
||||
in complex cases.
|
||||
.PP
|
||||
.I Regexec
|
||||
performance is poor.
|
||||
This will improve with later releases.
|
||||
.I Nmatch
|
||||
exceeding 0 is expensive;
|
||||
.I nmatch
|
||||
exceeding 1 is worse.
|
||||
.I Regexec
|
||||
is largely insensitive to RE complexity \fIexcept\fR that back
|
||||
references are massively expensive.
|
||||
RE length does matter; in particular, there is a strong speed bonus
|
||||
for keeping RE length under about 30 characters,
|
||||
with most special characters counting roughly double.
|
||||
.PP
|
||||
.I Regcomp
|
||||
implements bounded repetitions by macro expansion,
|
||||
which is costly in time and space if counts are large
|
||||
or bounded repetitions are nested.
|
||||
An RE like, say,
|
||||
`((((a{1,100}){1,100}){1,100}){1,100}){1,100}'
|
||||
will (eventually) run almost any existing machine out of swap space.
|
||||
.PP
|
||||
There are suspected problems with response to obscure error conditions.
|
||||
Notably,
|
||||
certain kinds of internal overflow,
|
||||
produced only by truly enormous REs or by multiply nested bounded repetitions,
|
||||
are probably not handled well.
|
||||
.PP
|
||||
Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is
|
||||
a special character only in the presence of a previous unmatched `('.
|
||||
This can't be fixed until the spec is fixed.
|
||||
.PP
|
||||
The standard's definition of back references is vague.
|
||||
For example, does
|
||||
`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?
|
||||
Until the standard is clarified,
|
||||
behavior in such cases should not be relied on.
|
||||
.PP
|
||||
The implementation of word-boundary matching is a bit of a kludge,
|
||||
and bugs may lurk in combinations of word-boundary matching and anchoring.
|
@@ -1,235 +0,0 @@
|
||||
.TH REGEX 7 "25 Oct 1995"
|
||||
.BY "Henry Spencer"
|
||||
.SH NAME
|
||||
regex \- POSIX 1003.2 regular expressions
|
||||
.SH DESCRIPTION
|
||||
Regular expressions (``RE''s),
|
||||
as defined in POSIX 1003.2, come in two forms:
|
||||
modern REs (roughly those of
|
||||
.IR egrep ;
|
||||
1003.2 calls these ``extended'' REs)
|
||||
and obsolete REs (roughly those of
|
||||
.IR ed ;
|
||||
1003.2 ``basic'' REs).
|
||||
Obsolete REs mostly exist for backward compatibility in some old programs;
|
||||
they will be discussed at the end.
|
||||
1003.2 leaves some aspects of RE syntax and semantics open;
|
||||
`\(dg' marks decisions on these aspects that
|
||||
may not be fully portable to other 1003.2 implementations.
|
||||
.PP
|
||||
A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR,
|
||||
separated by `|'.
|
||||
It matches anything that matches one of the branches.
|
||||
.PP
|
||||
A branch is one\(dg or more \fIpieces\fR, concatenated.
|
||||
It matches a match for the first, followed by a match for the second, etc.
|
||||
.PP
|
||||
A piece is an \fIatom\fR possibly followed
|
||||
by a single\(dg `*', `+', `?', or \fIbound\fR.
|
||||
An atom followed by `*' matches a sequence of 0 or more matches of the atom.
|
||||
An atom followed by `+' matches a sequence of 1 or more matches of the atom.
|
||||
An atom followed by `?' matches a sequence of 0 or 1 matches of the atom.
|
||||
.PP
|
||||
A \fIbound\fR is `{' followed by an unsigned decimal integer,
|
||||
possibly followed by `,'
|
||||
possibly followed by another unsigned decimal integer,
|
||||
always followed by `}'.
|
||||
The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive,
|
||||
and if there are two of them, the first may not exceed the second.
|
||||
An atom followed by a bound containing one integer \fIi\fR
|
||||
and no comma matches
|
||||
a sequence of exactly \fIi\fR matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing one integer \fIi\fR and a comma matches
|
||||
a sequence of \fIi\fR or more matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing two integers \fIi\fR and \fIj\fR matches
|
||||
a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom.
|
||||
.PP
|
||||
An atom is a regular expression enclosed in `()' (matching a match for the
|
||||
regular expression),
|
||||
an empty set of `()' (matching the null string)\(dg,
|
||||
a \fIbracket expression\fR (see below), `.'
|
||||
(matching any single character), `^' (matching the null string at the
|
||||
beginning of a line), `$' (matching the null string at the
|
||||
end of a line), a `\e' followed by one of the characters
|
||||
`^.[$()|*+?{\e'
|
||||
(matching that character taken as an ordinary character),
|
||||
a `\e' followed by any other character\(dg
|
||||
(matching that character taken as an ordinary character,
|
||||
as if the `\e' had not been present\(dg),
|
||||
or a single character with no other significance (matching that character).
|
||||
A `{' followed by a character other than a digit is an ordinary
|
||||
character, not the beginning of a bound\(dg.
|
||||
It is illegal to end an RE with `\e'.
|
||||
.PP
|
||||
A \fIbracket expression\fR is a list of characters enclosed in `[]'.
|
||||
It normally matches any single character from the list (but see below).
|
||||
If the list begins with `^',
|
||||
it matches any single character
|
||||
(but see below) \fInot\fR from the rest of the list.
|
||||
If two characters in the list are separated by `\-', this is shorthand
|
||||
for the full \fIrange\fR of characters between those two (inclusive) in the
|
||||
collating sequence,
|
||||
e.g. `[0\-9]' in ASCII matches any decimal digit.
|
||||
It is illegal\(dg for two ranges to share an
|
||||
endpoint, e.g. `a\-c\-e'.
|
||||
Ranges are very collating-sequence-dependent,
|
||||
and portable programs should avoid relying on them.
|
||||
.PP
|
||||
To include a literal `]' in the list, make it the first character
|
||||
(following a possible `^').
|
||||
To include a literal `\-', make it the first or last character,
|
||||
or the second endpoint of a range.
|
||||
To use a literal `\-' as the first endpoint of a range,
|
||||
enclose it in `[.' and `.]' to make it a collating element (see below).
|
||||
With the exception of these and some combinations using `[' (see next
|
||||
paragraphs), all other special characters, including `\e', lose their
|
||||
special significance within a bracket expression.
|
||||
.PP
|
||||
Within a bracket expression, a collating element (a character,
|
||||
a multi-character sequence that collates as if it were a single character,
|
||||
or a collating-sequence name for either)
|
||||
enclosed in `[.' and `.]' stands for the
|
||||
sequence of characters of that collating element.
|
||||
The sequence is a single element of the bracket expression's list.
|
||||
A bracket expression containing a multi-character collating element
|
||||
can thus match more than one character,
|
||||
e.g. if the collating sequence includes a `ch' collating element,
|
||||
then the RE `[[.ch.]]*c' matches the first five characters
|
||||
of `chchcc'.
|
||||
.PP
|
||||
Within a bracket expression, a collating element enclosed in `[=' and
|
||||
`=]' is an equivalence class, standing for the sequences of characters
|
||||
of all collating elements equivalent to that one, including itself.
|
||||
(If there are no other equivalent collating elements,
|
||||
the treatment is as if the enclosing delimiters were `[.' and `.]'.)
|
||||
For example, if o and \o'o^' are the members of an equivalence class,
|
||||
then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous.
|
||||
An equivalence class may not\(dg be an endpoint
|
||||
of a range.
|
||||
.PP
|
||||
Within a bracket expression, the name of a \fIcharacter class\fR enclosed
|
||||
in `[:' and `:]' stands for the list of all characters belonging to that
|
||||
class.
|
||||
Standard character class names are:
|
||||
.PP
|
||||
.RS
|
||||
.nf
|
||||
.ta 3c 6c 9c
|
||||
alnum digit punct
|
||||
alpha graph space
|
||||
blank lower upper
|
||||
cntrl print xdigit
|
||||
.fi
|
||||
.RE
|
||||
.PP
|
||||
These stand for the character classes defined in
|
||||
.IR ctype (3).
|
||||
A locale may provide others.
|
||||
A character class may not be used as an endpoint of a range.
|
||||
.PP
|
||||
There are two special cases\(dg of bracket expressions:
|
||||
the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at
|
||||
the beginning and end of a word respectively.
|
||||
A word is defined as a sequence of
|
||||
word characters
|
||||
which is neither preceded nor followed by
|
||||
word characters.
|
||||
A word character is an
|
||||
.I alnum
|
||||
character (as defined by
|
||||
.IR ctype (3))
|
||||
or an underscore.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.PP
|
||||
In the event that an RE could match more than one substring of a given
|
||||
string,
|
||||
the RE matches the one starting earliest in the string.
|
||||
If the RE could match more than one substring starting at that point,
|
||||
it matches the longest.
|
||||
Subexpressions also match the longest possible substrings, subject to
|
||||
the constraint that the whole match be as long as possible,
|
||||
with subexpressions starting earlier in the RE taking priority over
|
||||
ones starting later.
|
||||
Note that higher-level subexpressions thus take priority over
|
||||
their lower-level component subexpressions.
|
||||
.PP
|
||||
Match lengths are measured in characters, not collating elements.
|
||||
A null string is considered longer than no match at all.
|
||||
For example,
|
||||
`bb*' matches the three middle characters of `abbbc',
|
||||
`(wee|week)(knights|nights)' matches all ten characters of `weeknights',
|
||||
when `(.*).*' is matched against `abc' the parenthesized subexpression
|
||||
matches all three characters, and
|
||||
when `(a*)*' is matched against `bc' both the whole RE and the parenthesized
|
||||
subexpression match the null string.
|
||||
.PP
|
||||
If case-independent matching is specified,
|
||||
the effect is much as if all case distinctions had vanished from the
|
||||
alphabet.
|
||||
When an alphabetic that exists in multiple cases appears as an
|
||||
ordinary character outside a bracket expression, it is effectively
|
||||
transformed into a bracket expression containing both cases,
|
||||
e.g. `x' becomes `[xX]'.
|
||||
When it appears inside a bracket expression, all case counterparts
|
||||
of it are added to the bracket expression, so that (e.g.) `[x]'
|
||||
becomes `[xX]' and `[^x]' becomes `[^xX]'.
|
||||
.PP
|
||||
No particular limit is imposed on the length of REs\(dg.
|
||||
Programs intended to be portable should not employ REs longer
|
||||
than 256 bytes,
|
||||
as an implementation can refuse to accept such REs and remain
|
||||
POSIX-compliant.
|
||||
.PP
|
||||
Obsolete (``basic'') regular expressions differ in several respects.
|
||||
`|', `+', and `?' are ordinary characters and there is no equivalent
|
||||
for their functionality.
|
||||
The delimiters for bounds are `\e{' and `\e}',
|
||||
with `{' and `}' by themselves ordinary characters.
|
||||
The parentheses for nested subexpressions are `\e(' and `\e)',
|
||||
with `(' and `)' by themselves ordinary characters.
|
||||
`^' is an ordinary character except at the beginning of the
|
||||
RE or\(dg the beginning of a parenthesized subexpression,
|
||||
`$' is an ordinary character except at the end of the
|
||||
RE or\(dg the end of a parenthesized subexpression,
|
||||
and `*' is an ordinary character if it appears at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression
|
||||
(after a possible leading `^').
|
||||
Finally, there is one new type of atom, a \fIback reference\fR:
|
||||
`\e' followed by a non-zero decimal digit \fId\fR
|
||||
matches the same sequence of characters
|
||||
matched by the \fId\fRth parenthesized subexpression
|
||||
(numbering subexpressions by the positions of their opening parentheses,
|
||||
left to right),
|
||||
so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'.
|
||||
.SH SEE ALSO
|
||||
regex(3)
|
||||
.PP
|
||||
POSIX 1003.2, section 2.8 (Regular Expression Notation).
|
||||
.SH HISTORY
|
||||
Written by Henry Spencer, based on the 1003.2 spec.
|
||||
.SH BUGS
|
||||
Having two kinds of REs is a botch.
|
||||
.PP
|
||||
The current 1003.2 spec says that `)' is an ordinary character in
|
||||
the absence of an unmatched `(';
|
||||
this was an unintentional result of a wording error,
|
||||
and change is likely.
|
||||
Avoid relying on it.
|
||||
.PP
|
||||
Back references are a dreadful botch,
|
||||
posing major problems for efficient implementations.
|
||||
They are also somewhat vaguely defined
|
||||
(does
|
||||
`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?).
|
||||
Avoid using them.
|
||||
.PP
|
||||
1003.2's specification of case-independent matching is vague.
|
||||
The ``one case implies all cases'' definition given above
|
||||
is the current consensus among implementors as to the right interpretation.
|
||||
.PP
|
||||
The syntax for word boundaries is incredibly ugly.
|
@@ -1,134 +0,0 @@
|
||||
/*
|
||||
* First, the stuff that ends up in the outside-world include file
|
||||
= typedef off_t regoff_t;
|
||||
= typedef struct {
|
||||
= int re_magic;
|
||||
= size_t re_nsub; // number of parenthesized subexpressions
|
||||
= const char *re_endp; // end pointer for REG_PEND
|
||||
= struct re_guts *re_g; // none of your business :-)
|
||||
= } regex_t;
|
||||
= typedef struct {
|
||||
= regoff_t rm_so; // start of match
|
||||
= regoff_t rm_eo; // end of match
|
||||
= } regmatch_t;
|
||||
*/
|
||||
/*
|
||||
* internals of regex_t
|
||||
*/
|
||||
#define MAGIC1 ((('r'^0200)<<8) | 'e') /* 'r' with bit 0200 flipped, shifted into the next byte up, ORed with 'e' in the low byte */
|
||||
|
||||
/*
|
||||
* The internal representation is a *strip*, a sequence of
|
||||
* operators ending with an endmarker. (Some terminology etc. is a
|
||||
* historical relic of earlier versions which used multiple strips.)
|
||||
* Certain oddities in the representation are there to permit running
|
||||
* the machinery backwards; in particular, any deviation from sequential
|
||||
* flow must be marked at both its source and its destination. Some
|
||||
* fine points:
|
||||
*
|
||||
* - OPLUS_ and O_PLUS are *inside* the loop they create.
|
||||
* - OQUEST_ and O_QUEST are *outside* the bypass they create.
|
||||
* - OCH_ and O_CH are *outside* the multi-way branch they create, while
|
||||
* OOR1 and OOR2 are respectively the end and the beginning of one of
|
||||
* the branches. Note that there is an implicit OOR2 following OCH_
|
||||
* and an implicit OOR1 preceding O_CH.
|
||||
*
|
||||
* In state representations, an operator's bit is on to signify a state
|
||||
* immediately *preceding* "execution" of that operator.
|
||||
*/
|
||||
/*
 * A sop packs an operator code and its operand into one long:
 * the operator occupies bits 26-30 (OPRMASK), the operand bits 0-25 (OPDMASK).
 */
typedef long sop; /* strip operator */
|
||||
typedef long sopno; /* NOTE(review): presumably a position/offset within the strip -- confirm against users */
|
||||
#define OPRMASK 0x7c000000 /* operator field: bits 26-30 */
|
||||
#define OPDMASK 0x03ffffff /* operand field: bits 0-25 */
|
||||
#define OPSHIFT (26) /* bit position where the operator field starts */
|
||||
#define OP(n) ((n)&OPRMASK) /* operator of n, left in shifted form (comparable to OEND etc.) */
|
||||
#define OPND(n) ((n)&OPDMASK) /* operand of n */
|
||||
#define SOP(op, opnd) ((op)|(opnd)) /* combine; op must already be shifted, opnd is not masked */
|
||||
/*
 * Operator codes: values 1 through 20, each shifted up by OPSHIFT.
 * The trailing comment on each line has two columns, as labelled by the
 * heading below: what the operator means, then what its operand holds
 * ("-" means the operand is unused).
 */
/* operators meaning operand */
|
||||
/* (back, fwd are offsets) */
|
||||
#define OEND (1<<OPSHIFT) /* endmarker - */
|
||||
#define OCHAR (2<<OPSHIFT) /* character unsigned char */
|
||||
#define OBOL (3<<OPSHIFT) /* left anchor - */
|
||||
#define OEOL (4<<OPSHIFT) /* right anchor - */
|
||||
#define OANY (5<<OPSHIFT) /* . - */
|
||||
#define OANYOF (6<<OPSHIFT) /* [...] set number */
|
||||
#define OBACK_ (7<<OPSHIFT) /* begin \d paren number */
|
||||
#define O_BACK (8<<OPSHIFT) /* end \d paren number */
|
||||
#define OPLUS_ (9<<OPSHIFT) /* + prefix fwd to suffix */
|
||||
#define O_PLUS (10<<OPSHIFT) /* + suffix back to prefix */
|
||||
#define OQUEST_ (11<<OPSHIFT) /* ? prefix fwd to suffix */
|
||||
#define O_QUEST (12<<OPSHIFT) /* ? suffix back to prefix */
|
||||
#define OLPAREN (13<<OPSHIFT) /* ( fwd to ) */
|
||||
#define ORPAREN (14<<OPSHIFT) /* ) back to ( */
|
||||
#define OCH_ (15<<OPSHIFT) /* begin choice fwd to OOR2 */
|
||||
#define OOR1 (16<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
|
||||
#define OOR2 (17<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
|
||||
#define O_CH (18<<OPSHIFT) /* end choice back to OOR1 */
|
||||
#define OBOW (19<<OPSHIFT) /* begin word - */
|
||||
#define OEOW (20<<OPSHIFT) /* end word - */
|
||||
|
||||
/*
|
||||
* Structure for [] character-set representation. Character sets are
|
||||
* done as bit vectors, grouped 8 to a byte vector for compactness.
|
||||
* The individual set therefore has both a pointer to the byte vector
|
||||
* and a mask to pick out the relevant bit of each byte. A hash code
|
||||
* simplifies testing whether two sets could be identical.
|
||||
*
|
||||
* This will get trickier for multicharacter collating elements. As
|
||||
* preliminary hooks for dealing with such things, we also carry along
|
||||
* a string of multi-character elements, and decide the size of the
|
||||
* vectors at run time.
|
||||
*/
|
||||
typedef struct {
|
||||
uch *ptr; /* -> uch [csetsize]; one byte per character, this set owns one bit of each byte */
|
||||
uch mask; /* bit within array: the single bit of each ptr[] byte that belongs to this set */
|
||||
uch hash; /* hash code: running sum of member characters, maintained by CHadd/CHsub */
|
||||
size_t smultis; /* presumably the size of the multis buffer -- TODO confirm in regcomp() */
|
||||
char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 (NUL-separated elements, empty string ends the list) */
|
||||
} cset;
|
||||
/* note that CHadd and CHsub are unsafe (cs and c are each evaluated twice), and CHIN doesn't yield 0/1 (it yields the masked bit) */
|
||||
#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) /* add character c to set cs */
|
||||
#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) /* remove character c from set cs */
|
||||
#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) /* nonzero iff character c is in set cs */
|
||||
#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */
|
||||
#define MCsub(p, cs, cp) mcsub(p, cs, cp) /* forwards unchanged to mcsub() */
|
||||
#define MCin(p, cs, cp) mcin(p, cs, cp) /* forwards unchanged to mcin() */
|
||||
|
||||
/* stuff for character categories */
|
||||
typedef unsigned char cat_t;
|
||||
|
||||
/*
|
||||
* main compiled-expression structure
|
||||
*/
|
||||
struct re_guts {
|
||||
int magic; /* presumably holds MAGIC2 while the structure is valid -- TODO confirm in regcomp()/regexec() */
|
||||
# define MAGIC2 ((('R'^0200)<<8)|'E')
|
||||
sop *strip; /* malloced area for strip */
|
||||
int csetsize; /* number of bits in a cset vector */
|
||||
int ncsets; /* number of csets in use */
|
||||
cset *sets; /* -> cset [ncsets] */
|
||||
uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
|
||||
int cflags; /* copy of regcomp() cflags argument */
|
||||
sopno nstates; /* = number of sops */
|
||||
sopno firststate; /* the initial OEND (normally 0) */
|
||||
sopno laststate; /* the final OEND */
|
||||
int iflags; /* internal flags: bitwise OR of USEBOL, USEEOL, BAD */
|
||||
# define USEBOL 01 /* used ^ */
|
||||
# define USEEOL 02 /* used $ */
|
||||
# define BAD 04 /* something wrong */
|
||||
int nbol; /* number of ^ used */
|
||||
int neol; /* number of $ used */
|
||||
int ncategories; /* how many character categories */
|
||||
cat_t *categories; /* ->catspace[-CHAR_MIN], so categories[CHAR_MIN] is catspace[0] */
|
||||
char *must; /* match must contain this string */
|
||||
int mlen; /* length of must */
|
||||
size_t nsub; /* copy of re_nsub */
|
||||
int backrefs; /* does it use back references? */
|
||||
sopno nplus; /* how deep does it nest +s? */
|
||||
/* catspace must be last: it is declared with one element but used as a longer array */
|
||||
cat_t catspace[1]; /* actually [NC] */
|
||||
};
|
||||
|
||||
/* misc utilities */
|
||||
#define OUT (CHAR_MAX+1) /* a non-character value */
|
||||
#define ISWORD(c) (isalnum(c) || (c) == '_')
|
@@ -1,316 +0,0 @@
|
||||
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
/*
|
||||
- split - divide a string into fields, like awk split()
|
||||
= int split(char *string, char *fields[], int nfields, char *sep);
|
||||
*/
|
||||
/*
 * Split `string' in place into fields, in the manner of awk's split().
 *
 * string   writable, NUL-terminated text.  Separators that end a stored
 *          field are overwritten with NUL.
 * fields   receives a pointer to the start of each field; at most nfields
 *          entries are written and the list is not NULL-terminated.
 * nfields  number of entries available in fields[].  If it is <= 0 nothing
 *          is stored in fields[] and the string is left unmodified; the
 *          fields are only counted.
 * sep      ""   fields are separated by runs of blanks/tabs, and leading
 *               and trailing white space is ignored;
 *          "c"  every single occurrence of c separates two fields;
 *          "ab" (two or more characters) any run of those characters
 *               separates two fields.
 *
 * Returns the number of fields found, which may exceed nfields.  When it
 * does, the last stored field holds the whole unsplit remainder of the
 * string.  An empty string (after white-space skipping, for sep "")
 * yields 0.
 */
int				/* number of fields, including overflow */
split(char *string,
	char *fields[],		/* list is not NULL-terminated */
	int nfields,		/* number of entries available in fields[] */
	char *sep)		/* "" white, "c" single char, "ab" [ab]+ */
{
	char *p = string;
	char c;			/* latest character */
	char sepc = sep[0];
	char sepc2;
	int fn;
	char **fp = fields;
	char *sepp;
	int trimtrail;
	int countonly = 0;	/* caller supplied no room in fields[] */
	char *scratch[1];	/* stand-in for fields[] when counting only */

	/*
	 * The loops below store the first field pointer before looking at
	 * nfields, so with no room available redirect that single store to
	 * a local slot instead of writing past the caller's array.
	 */
	if (nfields <= 0) {
		fp = scratch;
		nfields = 1;
		countonly = 1;
	}

	/* white space */
	if (sepc == '\0') {
		while ((c = *p++) == ' ' || c == '\t')
			continue;
		p--;
		trimtrail = 1;
		sep = " \t";	/* note, code below knows this is 2 long */
		sepc = ' ';
	} else
		trimtrail = 0;
	sepc2 = sep[1];		/* now we can safely pick this up */

	/* catch empties */
	if (*p == '\0')
		return(0);

	/* single separator */
	if (sepc2 == '\0') {
		fn = nfields;
		for (;;) {
			*fp++ = p;
			fn--;
			if (fn == 0)
				break;
			while ((c = *p++) != sepc)
				if (c == '\0')
					return(nfields - fn);
			*(p-1) = '\0';
		}
		/* we have overflowed the fields vector -- just count them */
		fn = nfields;
		for (;;) {
			while ((c = *p++) != sepc)
				if (c == '\0')
					return(fn);
			fn++;
		}
		/* not reached */
	}

	/* two separators */
	if (sep[2] == '\0') {
		fn = nfields;
		for (;;) {
			*fp++ = p;
			fn--;
			while ((c = *p++) != sepc && c != sepc2)
				if (c == '\0') {
					/* a trailing empty field does not count in white-space mode */
					if (trimtrail && **(fp-1) == '\0')
						fn++;
					return(nfields - fn);
				}
			if (fn == 0)
				break;
			*(p-1) = '\0';
			while ((c = *p++) == sepc || c == sepc2)
				continue;
			p--;
		}
		/* we have overflowed the fields vector -- just count them */
		fn = nfields;
		while (c != '\0') {
			while ((c = *p++) == sepc || c == sepc2)
				continue;
			p--;
			fn++;
			while ((c = *p++) != '\0' && c != sepc && c != sepc2)
				continue;
		}
		/* might have to trim trailing white space */
		if (trimtrail) {
			p--;
			while ((c = *--p) == sepc || c == sepc2)
				continue;
			p++;
			if (*p != '\0') {
				/* the "overflow" was only trailing white space */
				if (fn == nfields+1 && !countonly)
					*p = '\0';
				fn--;
			}
		}
		return(fn);
	}

	/* n separators */
	fn = 0;
	for (;;) {
		if (fn < nfields)
			*fp++ = p;
		fn++;
		for (;;) {
			c = *p++;
			if (c == '\0')
				return(fn);
			sepp = sep;
			while ((sepc = *sepp++) != '\0' && sepc != c)
				continue;
			if (sepc != '\0')	/* it was a separator */
				break;
		}
		if (fn < nfields)
			*(p-1) = '\0';
		for (;;) {
			c = *p++;
			sepp = sep;
			while ((sepc = *sepp++) != '\0' && sepc != c)
				continue;
			if (sepc == '\0')	/* it wasn't a separator */
				break;
		}
		p--;
	}

	/* not reached */
}
|
||||
|
||||
#ifdef TEST_SPLIT
|
||||
|
||||
|
||||
/*
|
||||
* test program
|
||||
* pgm runs regression
|
||||
* pgm sep splits stdin lines by sep
|
||||
* pgm str sep splits str by sep
|
||||
* pgm str sep n splits str by sep n times
|
||||
*/
|
||||
/* helpers defined further down in this file */
int dosplit(char *string, char *seps);
int regress(void);

/*
 * Test driver; the mode is selected by the number of arguments:
 *   pgm                 run the built-in regression table
 *   pgm sep             split each line of stdin by sep
 *   pgm str sep         split str by sep
 *   pgm str sep n       split str by sep n times (for timing)
 *   pgm str sep n x     as above but only copy str, never split it
 *                       (measures the loop overhead alone)
 * Always exits with status 0 except when str is too long for the buffer.
 */
int
main(int argc, char *argv[])
{
	char buf[512];
	int n;
#	define	MNF	10
	char *fields[MNF];

	/* the repeat modes copy argv[1] into buf; refuse what will not fit */
	if (argc > 3 && strlen(argv[1]) >= sizeof(buf)) {
		fprintf(stderr, "%s: string too long (limit %lu characters)\n",
			argv[0], (unsigned long)(sizeof(buf) - 1));
		exit(1);
	}

	if (argc > 4)
		for (n = atoi(argv[3]); n > 0; n--) {
			(void) strcpy(buf, argv[1]);
		}
	else if (argc > 3)
		for (n = atoi(argv[3]); n > 0; n--) {
			(void) strcpy(buf, argv[1]);
			(void) split(buf, fields, MNF, argv[2]);
		}
	else if (argc > 2)
		dosplit(argv[1], argv[2]);
	else if (argc > 1)
		while (fgets(buf, sizeof(buf), stdin) != NULL) {
			size_t len = strlen(buf);

			/*
			 * Stomp the newline only if there is one: an overlong
			 * line or a final line without a newline must keep its
			 * last character, and an empty buffer has no buf[-1].
			 */
			if (len > 0 && buf[len-1] == '\n')
				buf[len-1] = '\0';
			dosplit(buf, argv[1]);
		}
	else
		regress();

	exit(0);
}
|
||||
|
||||
int print(int nf, int nfp, char *fields[]);	/* defined below */

/*
 * Split `string' by `seps' into at most NF fields and print the result.
 * `string' is modified in place by split().  The return value is always 0
 * and carries no meaning; the int return type matches how the callers in
 * this file have always declared the function.
 */
int
dosplit(char *string, char *seps)
{
#	define	NF	5
	char *fields[NF];
	int nf;

	nf = split(string, fields, NF, seps);
	print(nf, NF, fields);
	return(0);
}
|
||||
|
||||
/*
 * Print a split() result on one line of stdout:
 *     <nf>:<TAB>"field", "field", ...
 *
 * nf      field count returned by split(); may exceed nfp
 * nfp     number of entries that were available in fields[]
 * fields  the stored field pointers; only the first min(nf, nfp) are read
 *
 * The line is always newline-terminated, including when nf is 0 or when
 * split() overflowed (nf > nfp).  The return value is always 0 and carries
 * no meaning.
 */
int
print(int nf, int nfp, char *fields[])
{
	int fn;
	int bound;

	bound = (nf > nfp) ? nfp : nf;
	printf("%d:\t", nf);
	for (fn = 0; fn < bound; fn++)
		printf("%s\"%s\"", (fn > 0) ? ", " : "",
			(fields[fn] != NULL) ? fields[fn] : "(NULL)");
	printf("\n");
	return(0);
}
|
||||
|
||||
#define	RNF	5		/* some table entries know this */

/*
 * Regression table for split(): each entry is the input string, the
 * separator argument, the expected return value, and the expected contents
 * of the first min(nf, RNF) stored fields.  An entry with a NULL str ends
 * the table.
 */
struct {
	char *str;		/* string to split */
	char *seps;		/* separator argument handed to split() */
	int nf;			/* expected return value (may exceed RNF) */
	char *fi[RNF];		/* expected stored fields */
} tests[] = {
	/* single separator: every occurrence delimits, so empty fields appear */
	{ "",		" ",	0,	{ "" } },
	{ " ",		" ",	2,	{ "", "" } },
	{ "x",		" ",	1,	{ "x" } },
	{ "xy",		" ",	1,	{ "xy" } },
	{ "x y",	" ",	2,	{ "x", "y" } },
	/* doubled blanks here: they are what produces the empty fields expected */
	{ "abc def  g ", " ",	5,	{ "abc", "def", "", "g", "" } },
	{ "  a bcd",	" ",	4,	{ "", "", "a", "bcd" } },
	{ "a b c d e f", " ",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",	" ",	6,	{ "", "a", "b", "c", "d " } },

	/* two separators: runs of them collapse into one delimiter */
	{ "",		" _",	0,	{ "" } },
	{ " ",		" _",	2,	{ "", "" } },
	{ "x",		" _",	1,	{ "x" } },
	{ "x y",	" _",	2,	{ "x", "y" } },
	{ "ab _ cd",	" _",	2,	{ "ab", "cd" } },
	{ " a_b c ",	" _",	5,	{ "", "a", "b", "c", "" } },
	{ "a b c_d e f", " _",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",	" _",	6,	{ "", "a", "b", "c", "d " } },

	/* three separators: exercises the general n-separator path */
	{ "",		" _~",	0,	{ "" } },
	{ " ",		" _~",	2,	{ "", "" } },
	{ "x",		" _~",	1,	{ "x" } },
	{ "x y",	" _~",	2,	{ "x", "y" } },
	{ "ab _~ cd",	" _~",	2,	{ "ab", "cd" } },
	{ " a_b c~",	" _~",	5,	{ "", "a", "b", "c", "" } },
	{ "a b_c d~e f", " _~",	6,	{ "a", "b", "c", "d", "e f" } },
	{ "~a b c d ",	" _~",	6,	{ "", "a", "b", "c", "d " } },

	/* four separators */
	{ "",		" _~-",	0,	{ "" } },
	{ " ",		" _~-",	2,	{ "", "" } },
	{ "x",		" _~-",	1,	{ "x" } },
	{ "x y",	" _~-",	2,	{ "x", "y" } },
	{ "ab _~- cd",	" _~-",	2,	{ "ab", "cd" } },
	{ " a_b c~",	" _~-",	5,	{ "", "a", "b", "c", "" } },
	{ "a b_c-d~e f", " _~-", 6,	{ "a", "b", "c", "d", "e f" } },
	{ "~a-b c d ",	" _~-",	6,	{ "", "a", "b", "c", "d " } },

	/*
	 * NOTE(review): as written this group repeats the single-blank
	 * separator of the first group; the separator (and some strings)
	 * may have lost whitespace in transit -- confirm against upstream.
	 * The entries are self-consistent as they stand.
	 */
	{ "",		" ",	0,	{ "" } },
	{ " ",		" ",	2,	{ "", "" } },
	{ "x",		" ",	1,	{ "x" } },
	{ "xy",		" ",	1,	{ "xy" } },
	{ "x y",	" ",	2,	{ "x", "y" } },
	{ "abc def g ",	" ",	4,	{ "abc", "def", "g", "" } },
	{ " a bcd",	" ",	3,	{ "", "a", "bcd" } },
	{ "a b c d e f", " ",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d ",	" ",	6,	{ "", "a", "b", "c", "d " } },

	/* empty separator: white-space mode, leading/trailing blanks ignored */
	{ "",		"",	0,	{ "" } },
	{ " ",		"",	0,	{ "" } },
	{ "x",		"",	1,	{ "x" } },
	{ "xy",		"",	1,	{ "xy" } },
	{ "x y",	"",	2,	{ "x", "y" } },
	{ "abc def g ",	"",	3,	{ "abc", "def", "g" } },
	{ "\t a bcd",	"",	2,	{ "a", "bcd" } },
	{ " a \tb\t c ", "",	3,	{ "a", "b", "c" } },
	{ "a b c d e ",	"",	5,	{ "a", "b", "c", "d", "e" } },
	{ "a b\tc d e f", "",	6,	{ "a", "b", "c", "d", "e f" } },
	{ " a b c d e f ", "",	6,	{ "a", "b", "c", "d", "e f " } },

	{ NULL,		NULL,	0,	{ NULL } },
};
|
||||
|
||||
regress()
|
||||
{
|
||||
char buf[512];
|
||||
register int n;
|
||||
char *fields[RNF+1];
|
||||
register int nf;
|
||||
register int i;
|
||||
register int printit;
|
||||
register char *f;
|
||||
|
||||
for (n = 0; tests[n].str != NULL; n++) {
|
||||
(void) strcpy(buf, tests[n].str);
|
||||
fields[RNF] = NULL;
|
||||
nf = split(buf, fields, RNF, tests[n].seps);
|
||||
printit = 0;
|
||||
if (nf != tests[n].nf) {
|
||||
printf("split `%s' by `%s' gave %d fields, not %d\n",
|
||||
tests[n].str, tests[n].seps, nf, tests[n].nf);
|
||||
printit = 1;
|
||||
} else if (fields[RNF] != NULL) {
|
||||
printf("split() went beyond array end\n");
|
||||
printit = 1;
|
||||
} else {
|
||||
for (i = 0; i < nf && i < RNF; i++) {
|
||||
f = fields[i];
|
||||
if (f == NULL)
|
||||
f = "(NULL)";
|
||||
if (strcmp(f, tests[n].fi[i]) != 0) {
|
||||
printf("split `%s' by `%s', field %d is `%s', not `%s'\n",
|
||||
tests[n].str, tests[n].seps,
|
||||
i, fields[i], tests[n].fi[i]);
|
||||
printit = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (printit)
|
||||
print(nf, RNF, fields);
|
||||
}
|
||||
}
|
||||
#endif
|
477
src/regex/tests
477
src/regex/tests
@@ -1,477 +0,0 @@
|
||||
# regular expression test set
|
||||
# Lines are at least three fields, separated by one or more tabs. "" stands
|
||||
# for an empty field. First field is an RE. Second field is flags. If
|
||||
# C flag given, regcomp() is expected to fail, and the third field is the
|
||||
# error name (minus the leading REG_).
|
||||
#
|
||||
# Otherwise it is expected to succeed, and the third field is the string to
|
||||
# try matching it against. If there is no fourth field, the match is
|
||||
# expected to fail. If there is a fourth field, it is the substring that
|
||||
# the RE is expected to match. If there is a fifth field, it is a comma-
|
||||
# separated list of what the subexpressions should match, with - indicating
|
||||
# no match for that one. In both the fourth and fifth fields, a (sub)field
|
||||
# starting with @ indicates that the (sub)expression is expected to match
|
||||
# a null string followed by the stuff after the @; this provides a way to
|
||||
# test where null strings match. The character `N' in REs and strings
|
||||
# is newline, `S' is space, `T' is tab, `Z' is NUL.
|
||||
#
|
||||
# The full list of flags:
|
||||
# - placeholder, does nothing
|
||||
# b RE is a BRE, not an ERE
|
||||
# & try it as both an ERE and a BRE
|
||||
# C regcomp() error expected, third field is error name
|
||||
# i REG_ICASE
|
||||
# m ("mundane") REG_NOSPEC
|
||||
# s REG_NOSUB (not really testable)
|
||||
# n REG_NEWLINE
|
||||
# ^ REG_NOTBOL
|
||||
# $ REG_NOTEOL
|
||||
# # REG_STARTEND (see below)
|
||||
# p REG_PEND
|
||||
#
|
||||
# For REG_STARTEND, the start/end offsets are those of the substring
|
||||
# enclosed in ().
|
||||
|
||||
# basics
|
||||
a & a a
|
||||
abc & abc abc
|
||||
abc|de - abc abc
|
||||
a|b|c - abc a
|
||||
|
||||
# parentheses and perversions thereof
|
||||
a(b)c - abc abc
|
||||
a\(b\)c b abc abc
|
||||
a( C EPAREN
|
||||
a( b a( a(
|
||||
a\( - a( a(
|
||||
a\( bC EPAREN
|
||||
a\(b bC EPAREN
|
||||
a(b C EPAREN
|
||||
a(b b a(b a(b
|
||||
# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
|
||||
a) - a) a)
|
||||
) - ) )
|
||||
# end gagging (in a just world, those *should* give EPAREN)
|
||||
a) b a) a)
|
||||
a\) bC EPAREN
|
||||
\) bC EPAREN
|
||||
a()b - ab ab
|
||||
a\(\)b b ab ab
|
||||
|
||||
# anchoring and REG_NEWLINE
|
||||
^abc$ & abc abc
|
||||
a^b - a^b
|
||||
a^b b a^b a^b
|
||||
a$b - a$b
|
||||
a$b b a$b a$b
|
||||
^ & abc @abc
|
||||
$ & abc @
|
||||
^$ & "" @
|
||||
$^ - "" @
|
||||
\($\)\(^\) b "" @
|
||||
# stop retching, those are legitimate (although disgusting)
|
||||
^^ - "" @
|
||||
$$ - "" @
|
||||
b$ & abNc
|
||||
b$ &n abNc b
|
||||
^b$ & aNbNc
|
||||
^b$ &n aNbNc b
|
||||
^$ &n aNNb @Nb
|
||||
^$ n abc
|
||||
^$ n abcN @
|
||||
$^ n aNNb @Nb
|
||||
\($\)\(^\) bn aNNb @Nb
|
||||
^^ n^ aNNb @Nb
|
||||
$$ n aNNb @NN
|
||||
^a ^ a
|
||||
a$ $ a
|
||||
^a ^n aNb
|
||||
^b ^n aNb b
|
||||
a$ $n bNa
|
||||
b$ $n bNa b
|
||||
a*(^b$)c* - b b
|
||||
a*\(^b$\)c* b b b
|
||||
|
||||
# certain syntax errors and non-errors
|
||||
| C EMPTY
|
||||
| b | |
|
||||
* C BADRPT
|
||||
* b * *
|
||||
+ C BADRPT
|
||||
? C BADRPT
|
||||
"" &C EMPTY
|
||||
() - abc @abc
|
||||
\(\) b abc @abc
|
||||
a||b C EMPTY
|
||||
|ab C EMPTY
|
||||
ab| C EMPTY
|
||||
(|a)b C EMPTY
|
||||
(a|)b C EMPTY
|
||||
(*a) C BADRPT
|
||||
(+a) C BADRPT
|
||||
(?a) C BADRPT
|
||||
({1}a) C BADRPT
|
||||
\(\{1\}a\) bC BADRPT
|
||||
(a|*b) C BADRPT
|
||||
(a|+b) C BADRPT
|
||||
(a|?b) C BADRPT
|
||||
(a|{1}b) C BADRPT
|
||||
^* C BADRPT
|
||||
^* b * *
|
||||
^+ C BADRPT
|
||||
^? C BADRPT
|
||||
^{1} C BADRPT
|
||||
^\{1\} bC BADRPT
|
||||
|
||||
# metacharacters, backslashes
|
||||
a.c & abc abc
|
||||
a[bc]d & abd abd
|
||||
a\*c & a*c a*c
|
||||
a\\b & a\b a\b
|
||||
a\\\*b & a\*b a\*b
|
||||
a\bc & abc abc
|
||||
a\ &C EESCAPE
|
||||
a\\bc & a\bc a\bc
|
||||
\{ bC BADRPT
|
||||
a\[b & a[b a[b
|
||||
a[b &C EBRACK
|
||||
# trailing $ is a peculiar special case for the BRE code
|
||||
a$ & a a
|
||||
a$ & a$
|
||||
a\$ & a
|
||||
a\$ & a$ a$
|
||||
a\\$ & a
|
||||
a\\$ & a$
|
||||
a\\$ & a\$
|
||||
a\\$ & a\ a\
|
||||
|
||||
# back references, ugh
|
||||
a\(b\)\2c bC ESUBREG
|
||||
a\(b\1\)c bC ESUBREG
|
||||
a\(b*\)c\1d b abbcbbd abbcbbd bb
|
||||
a\(b*\)c\1d b abbcbd
|
||||
a\(b*\)c\1d b abbcbbbd
|
||||
^\(.\)\1 b abc
|
||||
a\([bc]\)\1d b abcdabbd abbd b
|
||||
a\(\([bc]\)\2\)*d b abbccd abbccd
|
||||
a\(\([bc]\)\2\)*d b abbcbd
|
||||
# actually, this next one probably ought to fail, but the spec is unclear
|
||||
a\(\(b\)*\2\)*d b abbbd abbbd
|
||||
# here is a case that no NFA implementation does right
|
||||
\(ab*\)[ab]*\1 b ababaaa ababaaa a
|
||||
# check out normal matching in the presence of back refs
|
||||
\(a\)\1bcd b aabcd aabcd
|
||||
\(a\)\1bc*d b aabcd aabcd
|
||||
\(a\)\1bc*d b aabd aabd
|
||||
\(a\)\1bc*d b aabcccd aabcccd
|
||||
\(a\)\1bc*[ce]d b aabcccd aabcccd
|
||||
^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd
|
||||
|
||||
# ordinary repetitions
|
||||
ab*c & abc abc
|
||||
ab+c - abc abc
|
||||
ab?c - abc abc
|
||||
a\(*\)b b a*b a*b
|
||||
a\(**\)b b ab ab
|
||||
a\(***\)b bC BADRPT
|
||||
*a b *a *a
|
||||
**a b a a
|
||||
***a bC BADRPT
|
||||
|
||||
# the dreaded bounded repetitions
|
||||
{ & { {
|
||||
{abc & {abc {abc
|
||||
{1 C BADRPT
|
||||
{1} C BADRPT
|
||||
a{b & a{b a{b
|
||||
a{1}b - ab ab
|
||||
a\{1\}b b ab ab
|
||||
a{1,}b - ab ab
|
||||
a\{1,\}b b ab ab
|
||||
a{1,2}b - aab aab
|
||||
a\{1,2\}b b aab aab
|
||||
a{1 C EBRACE
|
||||
a\{1 bC EBRACE
|
||||
a{1a C EBRACE
|
||||
a\{1a bC EBRACE
|
||||
a{1a} C BADBR
|
||||
a\{1a\} bC BADBR
|
||||
a{,2} - a{,2} a{,2}
|
||||
a\{,2\} bC BADBR
|
||||
a{,} - a{,} a{,}
|
||||
a\{,\} bC BADBR
|
||||
a{1,x} C BADBR
|
||||
a\{1,x\} bC BADBR
|
||||
a{1,x C EBRACE
|
||||
a\{1,x bC EBRACE
|
||||
a{300} C BADBR
|
||||
a\{300\} bC BADBR
|
||||
a{1,0} C BADBR
|
||||
a\{1,0\} bC BADBR
|
||||
ab{0,0}c - abcac ac
|
||||
ab\{0,0\}c b abcac ac
|
||||
ab{0,1}c - abcac abc
|
||||
ab\{0,1\}c b abcac abc
|
||||
ab{0,3}c - abbcac abbc
|
||||
ab\{0,3\}c b abbcac abbc
|
||||
ab{1,1}c - acabc abc
|
||||
ab\{1,1\}c b acabc abc
|
||||
ab{1,3}c - acabc abc
|
||||
ab\{1,3\}c b acabc abc
|
||||
ab{2,2}c - abcabbc abbc
|
||||
ab\{2,2\}c b abcabbc abbc
|
||||
ab{2,4}c - abcabbc abbc
|
||||
ab\{2,4\}c b abcabbc abbc
|
||||
((a{1,10}){1,10}){1,10} - a a a,a
|
||||
|
||||
# multiple repetitions
|
||||
a** &C BADRPT
|
||||
a++ C BADRPT
|
||||
a?? C BADRPT
|
||||
a*+ C BADRPT
|
||||
a*? C BADRPT
|
||||
a+* C BADRPT
|
||||
a+? C BADRPT
|
||||
a?* C BADRPT
|
||||
a?+ C BADRPT
|
||||
a{1}{1} C BADRPT
|
||||
a*{1} C BADRPT
|
||||
a+{1} C BADRPT
|
||||
a?{1} C BADRPT
|
||||
a{1}* C BADRPT
|
||||
a{1}+ C BADRPT
|
||||
a{1}? C BADRPT
|
||||
a*{b} - a{b} a{b}
|
||||
a\{1\}\{1\} bC BADRPT
|
||||
a*\{1\} bC BADRPT
|
||||
a\{1\}* bC BADRPT
|
||||
|
||||
# brackets, and numerous perversions thereof
|
||||
a[b]c & abc abc
|
||||
a[ab]c & abc abc
|
||||
a[^ab]c & adc adc
|
||||
a[]b]c & a]c a]c
|
||||
a[[b]c & a[c a[c
|
||||
a[-b]c & a-c a-c
|
||||
a[^]b]c & adc adc
|
||||
a[^-b]c & adc adc
|
||||
a[b-]c & a-c a-c
|
||||
a[b &C EBRACK
|
||||
a[] &C EBRACK
|
||||
a[1-3]c & a2c a2c
|
||||
a[3-1]c &C ERANGE
|
||||
a[1-3-5]c &C ERANGE
|
||||
a[[.-.]--]c & a-c a-c
|
||||
a[1- &C ERANGE
|
||||
a[[. &C EBRACK
|
||||
a[[.x &C EBRACK
|
||||
a[[.x. &C EBRACK
|
||||
a[[.x.] &C EBRACK
|
||||
a[[.x.]] & ax ax
|
||||
a[[.x,.]] &C ECOLLATE
|
||||
a[[.one.]]b & a1b a1b
|
||||
a[[.notdef.]]b &C ECOLLATE
|
||||
a[[.].]]b & a]b a]b
|
||||
a[[:alpha:]]c & abc abc
|
||||
a[[:notdef:]]c &C ECTYPE
|
||||
a[[: &C EBRACK
|
||||
a[[:alpha &C EBRACK
|
||||
a[[:alpha:] &C EBRACK
|
||||
a[[:alpha,:] &C ECTYPE
|
||||
a[[:]:]]b &C ECTYPE
|
||||
a[[:-:]]b &C ECTYPE
|
||||
a[[:alph:]] &C ECTYPE
|
||||
a[[:alphabet:]] &C ECTYPE
|
||||
[[:alnum:]]+ - -%@a0X- a0X
|
||||
[[:alpha:]]+ - -%@aX0- aX
|
||||
[[:blank:]]+ - aSSTb SST
|
||||
[[:cntrl:]]+ - aNTb NT
|
||||
[[:digit:]]+ - a019b 019
|
||||
[[:graph:]]+ - Sa%bS a%b
|
||||
[[:lower:]]+ - AabC ab
|
||||
[[:print:]]+ - NaSbN aSb
|
||||
[[:punct:]]+ - S%-&T %-&
|
||||
[[:space:]]+ - aSNTb SNT
|
||||
[[:upper:]]+ - aBCd BC
|
||||
[[:xdigit:]]+ - p0f3Cq 0f3C
|
||||
a[[=b=]]c & abc abc
|
||||
a[[= &C EBRACK
|
||||
a[[=b &C EBRACK
|
||||
a[[=b= &C EBRACK
|
||||
a[[=b=] &C EBRACK
|
||||
a[[=b,=]] &C ECOLLATE
|
||||
a[[=one=]]b & a1b a1b
|
||||
|
||||
# complexities
|
||||
a(((b)))c - abc abc
|
||||
a(b|(c))d - abd abd
|
||||
a(b*|c)d - abbd abbd
|
||||
# just gotta have one DFA-buster, of course
|
||||
a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
|
||||
# and an inline expansion in case somebody gets tricky
|
||||
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
|
||||
# and in case somebody just slips in an NFA...
|
||||
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
|
||||
# fish for anomalies as the number of states passes 32
|
||||
12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
|
||||
123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
|
||||
1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
|
||||
12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
|
||||
123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
|
||||
# and one really big one, beyond any plausible word width
|
||||
1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
|
||||
# fish for problems as brackets go past 8
|
||||
[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
|
||||
[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
|
||||
[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
|
||||
[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
|
||||
|
||||
# subtleties of matching
|
||||
abc & xabcy abc
|
||||
a\(b\)?c\1d b acd
|
||||
aBc i Abc Abc
|
||||
a[Bc]*d i abBCcd abBCcd
|
||||
0[[:upper:]]1 &i 0a1 0a1
|
||||
0[[:lower:]]1 &i 0A1 0A1
|
||||
a[^b]c &i abc
|
||||
a[^b]c &i aBc
|
||||
a[^b]c &i adc adc
|
||||
[a]b[c] - abc abc
|
||||
[a]b[a] - aba aba
|
||||
[abc]b[abc] - abc abc
|
||||
[abc]b[abd] - abd abd
|
||||
a(b?c)+d - accd accd
|
||||
(wee|week)(knights|night) - weeknights weeknights
|
||||
(we|wee|week|frob)(knights|night|day) - weeknights weeknights
|
||||
a[bc]d - xyzaaabcaababdacd abd
|
||||
a[ab]c - aaabc abc
|
||||
abc s abc abc
|
||||
a* & b @b
|
||||
|
||||
# Let's have some fun -- try to match a C comment.
|
||||
# first the obvious, which looks okay at first glance...
|
||||
/\*.*\*/ - /*x*/ /*x*/
|
||||
# but...
|
||||
/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/
|
||||
# okay, we must not match */ inside; try to do that...
|
||||
/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/
|
||||
/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/
|
||||
# but...
|
||||
/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/
|
||||
# and a still fancier version, which does it right (I think)...
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/
|
||||
/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/
|
||||
|
||||
# subexpressions
|
||||
.* - abc abc -
|
||||
a(b)(c)d - abcd abcd b,c
|
||||
a(((b)))c - abc abc b,b,b
|
||||
a(b|(c))d - abd abd b,-
|
||||
a(b*|c|e)d - abbd abbd bb
|
||||
a(b*|c|e)d - acd acd c
|
||||
a(b*|c|e)d - ad ad @d
|
||||
a(b?)c - abc abc b
|
||||
a(b?)c - ac ac @c
|
||||
a(b+)c - abc abc b
|
||||
a(b+)c - abbbc abbbc bbb
|
||||
a(b*)c - ac ac @c
|
||||
(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
|
||||
# the regression tester only asks for 9 subexpressions
|
||||
a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
|
||||
a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
|
||||
a([bc]?)c - abc abc b
|
||||
a([bc]?)c - ac ac @c
|
||||
a([bc]+)c - abc abc b
|
||||
a([bc]+)c - abcc abcc bc
|
||||
a([bc]+)bc - abcbc abcbc bc
|
||||
a(bb+|b)b - abb abb b
|
||||
a(bbb+|bb+|b)b - abb abb b
|
||||
a(bbb+|bb+|b)b - abbb abbb bb
|
||||
a(bbb+|bb+|b)bb - abbb abbb b
|
||||
(.*).* - abcdef abcdef abcdef
|
||||
(a*)* - bc @b @b
|
||||
|
||||
# do we get the right subexpression when it is used more than once?
|
||||
a(b|c)*d - ad ad -
|
||||
a(b|c)*d - abcd abcd c
|
||||
a(b|c)+d - abd abd b
|
||||
a(b|c)+d - abcd abcd c
|
||||
a(b|c?)+d - ad ad @d
|
||||
a(b|c?)+d - abcd abcd @d
|
||||
a(b|c){0,0}d - ad ad -
|
||||
a(b|c){0,1}d - ad ad -
|
||||
a(b|c){0,1}d - abd abd b
|
||||
a(b|c){0,2}d - ad ad -
|
||||
a(b|c){0,2}d - abcd abcd c
|
||||
a(b|c){0,}d - ad ad -
|
||||
a(b|c){0,}d - abcd abcd c
|
||||
a(b|c){1,1}d - abd abd b
|
||||
a(b|c){1,1}d - acd acd c
|
||||
a(b|c){1,2}d - abd abd b
|
||||
a(b|c){1,2}d - abcd abcd c
|
||||
a(b|c){1,}d - abd abd b
|
||||
a(b|c){1,}d - abcd abcd c
|
||||
a(b|c){2,2}d - acbd acbd b
|
||||
a(b|c){2,2}d - abcd abcd c
|
||||
a(b|c){2,4}d - abcd abcd c
|
||||
a(b|c){2,4}d - abcbd abcbd b
|
||||
a(b|c){2,4}d - abcbcd abcbcd c
|
||||
a(b|c){2,}d - abcd abcd c
|
||||
a(b|c){2,}d - abcbd abcbd b
|
||||
a(b+|((c)*))+d - abd abd @d,@d,-
|
||||
a(b+|((c)*))+d - abcd abcd @d,@d,-
|
||||
|
||||
# check out the STARTEND option
|
||||
[abc] &# a(b)c b
|
||||
[abc] &# a(d)c
|
||||
[abc] &# a(bc)d b
|
||||
[abc] &# a(dc)d c
|
||||
. &# a()c
|
||||
b.*c &# b(bc)c bc
|
||||
b.* &# b(bc)c bc
|
||||
.*c &# b(bc)c bc
|
||||
|
||||
# plain strings, with the NOSPEC flag
|
||||
abc m abc abc
|
||||
abc m xabcy abc
|
||||
abc m xyz
|
||||
a*b m aba*b a*b
|
||||
a*b m ab
|
||||
"" mC EMPTY
|
||||
|
||||
# cases involving NULs
|
||||
aZb & a a
|
||||
aZb &p a
|
||||
aZb &p# (aZb) aZb
|
||||
aZ*b &p# (ab) ab
|
||||
a.b &# (aZb) aZb
|
||||
a.* &# (aZb)c aZb
|
||||
|
||||
# word boundaries (ick)
|
||||
[[:<:]]a & a a
|
||||
[[:<:]]a & ba
|
||||
[[:<:]]a & -a a
|
||||
a[[:>:]] & a a
|
||||
a[[:>:]] & ab
|
||||
a[[:>:]] & a- a
|
||||
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
|
||||
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
|
||||
[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
|
||||
[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
|
||||
[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
|
||||
[[:<:]]a_b[[:>:]] & x_a_b
|
||||
|
||||
# past problems, and suspected problems
|
||||
(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1
|
||||
abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
|
||||
abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
|
||||
(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11
|
||||
CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
|
||||
Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
|
||||
a?b - ab ab
|
||||
-\{0,1\}[0-9]*$ b -5 -5
|
||||
a*a*a*a*a*a*a* & aaaaaa aaaaaa
|
@@ -1,22 +0,0 @@
|
||||
/* utility definitions */
|
||||
#ifdef _POSIX2_RE_DUP_MAX /* use the system's limit when it provides one */
|
||||
#define DUPMAX _POSIX2_RE_DUP_MAX
|
||||
#else
|
||||
#define DUPMAX 255 /* fallback when _POSIX2_RE_DUP_MAX is not defined */
|
||||
#endif
|
||||
#define INFINITY (DUPMAX + 1) /* one more than DUPMAX; NOTE(review): <math.h> also defines INFINITY in C99 -- check before including both */
|
||||
#define NC (CHAR_MAX - CHAR_MIN + 1) /* number of distinct char values; needs <limits.h> */
|
||||
typedef unsigned char uch; /* shorthand used for byte arrays and for casting characters before indexing */
|
||||
|
||||
/* switch off assertions (if not already off) if no REDEBUG */
|
||||
#ifndef REDEBUG
|
||||
#ifndef NDEBUG
|
||||
#define NDEBUG /* no assertions please */
|
||||
#endif
|
||||
#endif
|
||||
#include <assert.h>
|
||||
|
||||
/* for old systems with bcopy() but no memmove() */
|
||||
#ifdef USEBCOPY
|
||||
#define memmove(d, s, c) bcopy(s, d, c)
|
||||
#endif
|
Reference in New Issue
Block a user