home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Frozen Fish 1: Amiga
/
FrozenFish-Apr94.iso
/
bbs
/
alib
/
d3xx
/
d352
/
mg.lha
/
MG
/
src.LZH
/
mg
/
regex.c
< prev
next >
Wrap
C/C++ Source or Header
|
1990-05-23
|
44KB
|
1,728 lines
/*
* Somewhere along the way, someone deleted the GNU copyright from this code,
* replacing it with "Imagine generic GNU copyright here". This is _not_
* acceptable behavior in any society, polite or otherwise.
*
* However, since I understand the need to delete this large chunk of text, I've
* followed the new GNU technic, and put it in a seperate file.
*/
/*
* Copyright (C) 1985 Richard M. Stallman
*
* This software may be redistributed under the terms described in the file
* LICENSE in this directory.
*/
/* modified by Robert Larson to make fewer assumptions about the compiler */
/*
* (Making it useful to mg) (#if not supported by osk, enum not supported by
* many)
*/
/*
* To test, compile with -Dtest. This Dtestable feature turns this into a
* self-contained program which reads a pattern, describes how it compiles,
* then reads a string and searches for it.
*/
#include "regexp.h"
#ifdef REGEXP
#include "def.h" /* defines VOID etc. for mg */
static VOID init_syntax_once
PROTO((VOID));
static VOID store_jump
PROTO((char *, int, char *));
static VOID insert_jump
PROTO((int, char *, char *, char *));
static int bcmp_translate
PROTO((char *, char *, int, char *));
#ifdef ANSI
#include <string.h>
#include <stdlib.h>
#endif
#ifdef emacs
/*
* The `emacs' switch turns on certain special matching commands that make
* sense only in emacs.
*/
#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
#else /* not emacs */
/*
* Define the syntax stuff, so we can do the \<...\> things.
*/
#ifndef Sword /* must be non-zero in some of the tests
* below... */
#define Sword 1
#endif
#define SYNTAX(c) re_syntax_table[c]
#ifdef SYNTAX_TABLE
char *re_syntax_table;
#else
static char re_syntax_table[256];
static VOID
init_syntax_once()
{
register int c;
static int done = 0;
if (done)
return;
bzero(re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
for (c = 'A'; c <= 'Z'; c++)
re_syntax_table[c] = Sword;
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
done = 1;
}
#endif /* SYNTAX_TABLE */
#endif /* not emacs */
#include "regex.h"
/*
* Number of failure points to allocate space for initially, when matching. f
* this number is exceeded, more space is allocated, so it is not a hard
* limit.
*/
#ifndef NFAILURES
#define NFAILURES 80
#endif /* NFAILURES */
/* width of a byte in bits */
#define BYTEWIDTH 8
/*
* These are the command codes that appear in compiled regular expressions,
* one per byte. Some command codes are followed by argument bytes. A command
* code can specify any interpretation whatever for its arguments. Zero-bytes
* may appear in the compiled regular expression.
*/
typedef char regexpcode;
#define unused 0
#define exactn 1 /* followed by one byte giving n, and then by
* n literal bytes */
#define begline 2 /* fails unless at beginning of line */
#define endline 3 /* fails unless at end of line */
#define jump 4 /* followed by two bytes giving relative
* address to jump to */
#define on_failure_jump 5 /* followed by two bytes giving relative
* address of place */
/* to resume at in case of failure. */
#define finalize_jump 6 /* Throw away latest failure point and then
* jump to address. */
#define maybe_finalize_jump 7 /* Like jump but finalize if safe to do so. */
/*
* This is used to jump back to the beginning of a repeat. If the command
* that follows this jump is clearly incompatible with the one at the
* beginning of the repeat, such that we can be sure that there is no use
* backtracking out of repetitions already completed, then we finalize.
*/
#define dummy_failure_jump 8 /* jump, and push a dummy failure point. */
/*
* This failure point will be thrown away if an attempt is made to use it for
* a failure. A + construct makes this before the first repeat.
*/
#define anychar 9 /* matches any one character */
#define charset 10 /* matches any one char belonging to
* specified set. */
/*
* First following byte is # bitmap bytes. Then come bytes for a bit-map
* saying which chars are in. Bits in each byte are ordered low-bit-first. A
* character is in the set if its bit is 1. A character too large to have a
* bit in the map is automatically not in the set
*/
#define charset_not 11 /* similar but match any character that is
* NOT one of those specified */
#define start_memory 12 /* starts remembering the text that is
* matched */
/*
* and stores it in a memory register. followed by one byte containing the
* register number. Register numbers must be in the range 0 through NREGS.
*/
#define stop_memory 13 /* stops remembering the text that is matched */
/*
* and stores it in a memory register. followed by one byte containing the
* register number. Register numbers must be in the range 0 through NREGS.
*/
#define duplicate 14 /* match a duplicate of something remembered. */
/* Followed by one byte containing the index of the memory register. */
#define before_dot 15 /* Succeeds if before dot */
#define at_dot 16 /* Succeeds if at dot */
#define after_dot 17 /* Succeeds if after dot */
#define begbuf 18 /* Succeeds if at beginning of buffer */
#define endbuf 19 /* Succeeds if at end of buffer */
#define wordchar 20 /* Matches any word-constituent character */
#define notwordchar 21 /* Matches any char that is not a
* word-constituent */
#define wordbeg 22 /* Succeeds if at word beginning */
#define wordend 23 /* Succeeds if at word end */
#define wordbound 24 /* Succeeds if at a word boundary */
#define notwordbound 25 /* Succeeds if not at a word boundary */
#define syntaxspec 26 /* Matches any character whose syntax is
* specified. */
/* followed by a byte which contains a syntax code, Sword or such like */
#define notsyntaxspec 27 /* Matches any character whose syntax differs
* from the specified. */
#ifndef SIGN_EXTEND_CHAR
#define SIGN_EXTEND_CHAR(x) (x)
#endif
/*
* compile_pattern takes a regular-expression descriptor string in the user's
* format and converts it into a buffer full of byte commands for matching.
*
* pattern is the address of the pattern string size is the length of
* it. bufp is a struct re_pattern_buffer * which points to the
* info on where to store the byte commands. This structure contains a char *
* which points to the actual space, which should have been obtained with
* malloc. compile_pattern may use realloc to grow the buffer space.
*
* The number of bytes of commands can be found out by looking in the struct
* re_pattern_buffer that bufp pointed to, after compile_pattern returns.
*/
#define PATPUSH(ch) (*b++ = (char) (ch))
#define PATFETCH(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; \
if (translate) c = translate[c]; }
#define PATFETCH_RAW(c) \
{if (p == pend) goto end_of_pattern; \
c = * (unsigned char *) p++; }
#define PATUNFETCH p--
#define EXTEND_BUFFER \
{ char *old_buffer = bufp->buffer; \
if (bufp->allocated == (1<<16)) goto too_big; \
bufp->allocated *= 2; \
if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
goto memory_exhausted; \
c = bufp->buffer - old_buffer; \
b += c; \
if (fixup_jump) \
fixup_jump += c; \
if (laststart) \
laststart += c; \
begalt += c; \
if (pending_exact) \
pending_exact += c; \
}
char *
re_compile_pattern(pattern, size, bufp)
char *pattern;
int size;
struct re_pattern_buffer *bufp;
{
register char *b = bufp->buffer;
register char *p = pattern;
char *pend = pattern + size;
register unsigned c, c1;
char *p1;
unsigned char *translate = (unsigned char *) bufp->translate;
/*
* address of the count-byte of the most recently inserted "exactn"
* command. This makes it possible to tell whether a new exact-match
* character can be added to that command or requires a new "exactn"
* command.
*/
char *pending_exact = 0;
/*
* address of the place where a forward-jump should go to the end of
* the containing expression. Each alternative of an "or", except the
* last, ends with a forward-jump of this sort.
*/
char *fixup_jump = 0;
/*
* address of start of the most recently finished expression. This
* tells postfix * where to find the start of its operand.
*/
char *laststart = 0;
/* In processing a repeat, 1 means zero matches is allowed */
char zero_times_ok;
/* In processing a repeat, 1 means many matches is allowed */
char many_times_ok;
/* address of beginning of regexp, or inside of last \( */
char *begalt = b;
/*
* Stack of information saved by \( and restored by \). Four stack
* elements are pushed by each \(: First, the value of b. Second, the
* value of fixup_jump. Third, the value of regnum. Fourth, the value
* of begalt.
*/
int stackb[40];
int *stackp = stackb;
int *stacke = stackb + 40;
int *stackt;
/*
* Counts \('s as they are encountered. Remembered for the matching
* \), where it becomes the "register number" to put in the
* stop_memory command
*/
int regnum = 1;
bufp->fastmap_accurate = 0;
#ifndef SYNTAX_TABLE
#ifndef emacs
/*
* Initialize the syntax table.
*/
init_syntax_once();
#endif
#endif
if (bufp->allocated == 0) {
bufp->allocated = 28;
if (bufp->buffer)
/* EXTEND_BUFFER loses when bufp->allocated is 0 */
bufp->buffer = (char *) realloc((char *) bufp->buffer, (unsigned) 28);
else
/* Caller did not allocate a buffer. Do it for him */
bufp->buffer = (char *) malloc((unsigned) 28);
if (!bufp->buffer)
goto memory_exhausted;
begalt = b = bufp->buffer;
}
while (p != pend) {
if (b - bufp->buffer > bufp->allocated - 10)
/* Note that EXTEND_BUFFER clobbers c */
EXTEND_BUFFER;
PATFETCH(c);
switch (c) {
case '$':
/*
* $ means succeed if at end of line, but only in
* special contexts. If randonly in the middle of a
* pattern, it is a normal character.
*/
if (p == pend || (*p == '\\' && (p[1] == ')' || p[1] == '|'))) {
PATPUSH(endline);
break;
}
goto normal_char;
case '^':
/*
* ^ means succeed if at beg of line, but only if no
* preceding pattern.
*/
if (laststart)
goto normal_char;
PATPUSH(begline);
break;
case '*':
case '+':
case '?':
/* If there is no previous pattern, char not special. */
if (!laststart)
goto normal_char;
/*
* If there is a sequence of repetition chars,
* collapse it down to equivalent to just one.
*/
zero_times_ok = 0;
many_times_ok = 0;
while (1) {
zero_times_ok |= c != '+';
many_times_ok |= c != '?';
if (p == pend)
break;
PATFETCH(c);
if (!(c == '*' || c == '+' || c == '?')) {
PATUNFETCH;
break;
}
}
/*
* Now we know whether 0 matches is allowed, and
* whether 2 or more matches is allowed.
*/
if (many_times_ok) {
/*
* If more than one repetition is allowed,
* put in a backward jump at the end.
*/
store_jump(b, maybe_finalize_jump, laststart - 3);
b += 3;
}
insert_jump(on_failure_jump, laststart, b + 3, b);
pending_exact = 0;
b += 3;
if (!zero_times_ok) {
/*
* At least one repetition required: insert
* before the loop a skip over the initial
* on-failure-jump instruction
*/
insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
b += 3;
}
break;
case '.':
laststart = b;
PATPUSH(anychar);
break;
case '[':
if (b - bufp->buffer
> bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
/* Note that EXTEND_BUFFER clobbers c */
EXTEND_BUFFER;
laststart = b;
if (*p == '^')
PATPUSH(charset_not), p++;
else
PATPUSH(charset);
p1 = p;
PATPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
/* Clear the whole map */
bzero(b, (1 << BYTEWIDTH) / BYTEWIDTH);
/* Read in characters and ranges, setting map bits */
while (1) {
PATFETCH(c);
if (c == ']' && p != p1 + 1)
break;
if (*p == '-') {
PATFETCH(c1);
PATFETCH(c1);
while (c <= c1)
b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
} else {
b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
}
}
/*
* Discard any bitmap bytes that are all 0 at the end
* of the map. Decrement the map-length byte too.
*/
while (b[-1] > 0 && b[b[-1] - 1] == 0)
b[-1]--;
b += b[-1];
break;
case '\\':
if (p == pend)
goto invalid_pattern;
PATFETCH_RAW(c);
switch (c) {
case '(':
if (stackp == stacke)
goto nesting_too_deep;
if (regnum < RE_NREGS) {
PATPUSH(start_memory);
PATPUSH(regnum);
}
*stackp++ = b - bufp->buffer;
*stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
*stackp++ = regnum++;
*stackp++ = begalt - bufp->buffer;
fixup_jump = 0;
laststart = 0;
begalt = b;
break;
case ')':
if (stackp == stackb)
goto unmatched_close;
begalt = *--stackp + bufp->buffer;
if (fixup_jump)
store_jump(fixup_jump, jump, b);
if (stackp[-1] < RE_NREGS) {
PATPUSH(stop_memory);
PATPUSH(stackp[-1]);
}
stackp -= 2;
fixup_jump = 0;
if (*stackp)
fixup_jump = *stackp + bufp->buffer - 1;
laststart = *--stackp + bufp->buffer;
break;
case '|':
insert_jump(on_failure_jump, begalt, b + 6, b);
pending_exact = 0;
b += 3;
if (fixup_jump)
store_jump(fixup_jump, jump, b);
fixup_jump = b;
b += 3;
laststart = 0;
begalt = b;
break;
#ifdef emacs
case '=':
PATPUSH(at_dot);
break;
case 's':
laststart = b;
PATPUSH(syntaxspec);
PATFETCH(c);
PATPUSH(syntax_spec_code[c]);
break;
case 'S':
laststart = b;
PATPUSH(notsyntaxspec);
PATFETCH(c);
PATPUSH(syntax_spec_code[c]);
break;
#endif /* emacs */
case 'w':
laststart = b;
PATPUSH(wordchar);
break;
case 'W':
laststart = b;
PATPUSH(notwordchar);
break;
case '<':
PATPUSH(wordbeg);
break;
case '>':
PATPUSH(wordend);
break;
case 'b':
PATPUSH(wordbound);
break;
case 'B':
PATPUSH(notwordbound);
break;
case '`':
PATPUSH(begbuf);
break;
case '\'':
PATPUSH(endbuf);
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
c1 = c - '0';
if (c1 >= regnum)
goto normal_char;
for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
if (*stackt == c1)
goto normal_char;
laststart = b;
PATPUSH(duplicate);
PATPUSH(c1);
break;
default:
goto normal_char;
}
break;
default:
normal_char:
if (!pending_exact || pending_exact + *pending_exact + 1 != b
|| *pending_exact == 0177 || *p == '*' || *p == '^'
|| *p == '+' || *p == '?') {
laststart = b;
PATPUSH(exactn);
pending_exact = b;
PATPUSH(0);
}
PATPUSH(c);
(*pending_exact)++;
}
}
if (fixup_jump)
store_jump(fixup_jump, jump, b);
if (stackp != stackb)
goto unmatched_open;
bufp->used = b - bufp->buffer;
return 0;
invalid_pattern:
return "Invalid regular expression";
unmatched_open:
return "Unmatched \\(";
unmatched_close:
return "Unmatched \\)";
end_of_pattern:
return "Premature end of regular expression";
nesting_too_deep:
return "Nesting too deep";
too_big:
return "Regular expression too big";
memory_exhausted:
return "Memory exhausted";
}
/*
* Store where `from' points a jump operation to jump to where `to' points.
* `opcode' is the opcode to store.
*/
static VOID
store_jump(from, opcode, to)
char *from, *to;
int opcode;
{
from[0] = (char) opcode;
from[1] = (to - (from + 3)) & 0377;
from[2] = (to - (from + 3)) >> 8;
}
/*
* Open up space at char FROM, and insert there a jump to TO. CURRENT_END
* gives te end of the storage no in use, so we know how much data to copy
* up. OP is the opcode of the jump to insert.
*
* If you call this function, you must zero out pending_exact.
*/
static VOID
insert_jump(op, from, to, current_end)
int op;
char *from, *to, *current_end;
{
register char *pto = current_end + 3;
register char *pfrom = current_end;
while (pfrom != from)
*--pto = *--pfrom;
store_jump(from, op, to);
}
/*
* Given a pattern, compute a fastmap from it. The fastmap records which of
* the (1 << BYTEWIDTH) possible characters can start a string that matches
* the pattern. This fastmap is used by re_search to skip quickly over
* totally implausible text.
*
* The caller must supply the address of a (1 << BYTEWIDTH)-byte data area as
* bufp->fastmap. The other components of bufp describe the pattern to be
* used.
*/
VOID
re_compile_fastmap(bufp)
struct re_pattern_buffer *bufp;
{
unsigned char *pattern = (unsigned char *) bufp->buffer;
int size = bufp->used;
register char *fastmap = bufp->fastmap;
register unsigned char *p = pattern;
register unsigned char *pend = pattern + size;
register int j;
#ifdef emacs
register int k;
#endif
unsigned char *translate = (unsigned char *) bufp->translate;
unsigned char *stackb[NFAILURES];
unsigned char **stackp = stackb;
bzero(fastmap, (1 << BYTEWIDTH));
bufp->fastmap_accurate = 1;
bufp->can_be_null = 0;
while (p) {
if (p == pend) {
bufp->can_be_null = 1;
break;
}
switch ((regexpcode) * p++) {
case exactn:
if (translate)
fastmap[translate[p[1]]] = 1;
else
fastmap[p[1]] = 1;
break;
case begline:
case before_dot:
case at_dot:
case after_dot:
case begbuf:
case endbuf:
case wordbound:
case notwordbound:
case wordbeg:
case wordend:
continue;
case endline:
if (translate)
fastmap[translate['\n']] = 1;
else
fastmap['\n'] = 1;
bufp->can_be_null = 1;
break;
case finalize_jump:
case maybe_finalize_jump:
case jump:
case dummy_failure_jump:
bufp->can_be_null = 1;
j = *p++ & 0377;
j += SIGN_EXTEND_CHAR(*(char *) p++) << 8;
p += j;
if (j > 0)
continue;
/*
* Jump backward reached implies we just went through
* the body of a loop and matched nothing. Opcode
* jumped to should be an on_failure_jump. Just treat
* it like an ordinary jump. For a * loop, it has
* pushed its failure point already; if so, discard
* that as redundant.
*/
if ((regexpcode) * p != on_failure_jump)
continue;
p++;
j = *p++ & 0377;
j += SIGN_EXTEND_CHAR(*(char *) p++) << 8;
p += j;
if (stackp != stackb && *stackp == p)
stackp--;
continue;
case on_failure_jump:
j = *p++ & 0377;
j += SIGN_EXTEND_CHAR(*(char *) p++) << 8;
*++stackp = p + j;
continue;
case start_memory:
case stop_memory:
p++;
continue;
case duplicate:
bufp->can_be_null = 1;
fastmap['\n'] = 1;
case anychar:
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (j != '\n')
fastmap[j] = 1;
if (bufp->can_be_null)
return;
/*
* Don't return; check the alternative paths so we
* can set can_be_null if appropriate.
*/
break;
case wordchar:
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (SYNTAX(j) == Sword)
fastmap[j] = 1;
break;
case notwordchar:
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (SYNTAX(j) != Sword)
fastmap[j] = 1;
break;
#ifdef emacs
case syntaxspec:
k = *p++;
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (SYNTAX(j) == (enum syntaxcode) k)
fastmap[j] = 1;
break;
case notsyntaxspec:
for (j = 0; j < (1 << BYTEWIDTH); j++)
if (SYNTAX(j) != (enum syntaxcode) k)
fastmap[j] = 1;
break;
#endif /* emacs */
case charset:
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) {
if (translate)
fastmap[translate[j]] = 1;
else
fastmap[j] = 1;
}
break;
case charset_not:
/* Chars beyond end of map must be allowed */
for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
if (translate)
fastmap[translate[j]] = 1;
else
fastmap[j] = 1;
for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) {
if (translate)
fastmap[translate[j]] = 1;
else
fastmap[j] = 1;
}
break;
}
/*
* Get here means we have successfully found the possible
* starting characters of one path of the pattern. We need
* not follow this path any farther. Instead, look at the
* next alternative remembered in the stack.
*/
if (stackp != stackb)
p = *stackp--;
else
break;
}
}
/* Like re_search_2, below, but only one string is specified. */
int
re_search(pbufp, string, size, startpos, range, regs)
struct re_pattern_buffer *pbufp;
char *string;
int size, startpos, range;
struct re_registers *regs;
{
return re_search_2(pbufp, (char *) 0, 0, string, size, startpos, range, regs, size);
}
/*
* Like re_match_2 but tries first a match starting at index `startpos', then
* at startpos + 1, and so on. `range' is the number of places to try before
* giving up. If `range' is negative, the starting positions tried are
* startpos, startpos - 1, etc. It is up to the caller to make sure that
* range is not so large as to take the starting position outside of the
* input strings.
*
* The value returned is the position at which the match was found, or -1 if no
* match was found.
*/
int
re_search_2(pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
struct re_pattern_buffer *pbufp;
char *string1, *string2;
int size1, size2;
int startpos;
register int range;
struct re_registers *regs;
int mstop;
{
register char *fastmap = pbufp->fastmap;
register char *translate = pbufp->translate;
int total = size1 + size2;
/* Update the fastmap now if not correct already */
if (fastmap && !pbufp->fastmap_accurate)
re_compile_fastmap(pbufp);
while (1) {
/*
* If a fastmap is supplied, skip quickly over characters
* that cannot possibly be the start of a match. Note,
* however, that if the pattern can possibly match the null
* string, we must test it at each starting point so that we
* take the first null string we get.
*/
if (fastmap && startpos < total && !pbufp->can_be_null) {
if (range > 0) {
register int lim = 0;
register char *p;
int irange = range;
if (startpos < size1 && startpos + range >= size1)
lim = range - (size1 - startpos);
p = &(startpos >= size1 ? string2 - size1 : string1)[startpos];
if (translate) {
while (range > lim && !fastmap[translate[*p++]])
range--;
} else {
while (range > lim && !fastmap[*p++])
range--;
}
startpos += irange - range;
} else {
register char c;
if (startpos >= size1)
c = string2[startpos - size1];
else
c = string1[startpos];
if (translate ? !fastmap[translate[c]] : !fastmap[c])
goto advance;
}
}
if (range >= 0 && startpos == total
&& fastmap && !pbufp->can_be_null)
return -1;
if (0 <= re_match_2(pbufp, string1, size1, string2, size2, startpos, regs, mstop))
return startpos;
#ifdef C_ALLOCA
alloca(0);
#endif /* C_ALLOCA */
advance:
if (!range)
break;
if (range > 0)
range--, startpos++;
else
range++, startpos--;
}
return -1;
}
#ifndef emacs /* emacs never uses this */
int
re_match(pbufp, string, size, pos, regs)
struct re_pattern_buffer *pbufp;
char *string;
int size, pos;
struct re_registers *regs;
{
return re_match_2(pbufp, 0, 0, string, size, pos, regs, size);
}
#endif /* emacs */
/*
* Match the pattern described by `pbufp' against data which is the virtual
* concatenation of `string1' and `string2'. `size1' and `size2' are the
* sizes of the two data strings. Start the match at position `pos'. Do not
* consider matching past the position `mstop'.
*
* If pbufp->fastmap is nonzero, then it had better be up to date.
*
* The reason that the data to match is specified as two components which are to
* be regarded as concatenated is so that this function can be used directly
* on the contents of an Emacs buffer.
*
* -1 is returned if there is no match. Otherwise the value is the length of
* the substring which was matched.
*/
int
re_match_2(pbufp, string1, size1, string2, size2, pos, regs, mstop)
struct re_pattern_buffer *pbufp;
char *string1, *string2;
int size1, size2;
int pos;
struct re_registers *regs;
int mstop;
{
register char *p = pbufp->buffer;
register char *pend = p + pbufp->used;
/* End of first string */
char *end1;
/* End of second string */
char *end2;
/* Pointer just past last char to consider matching */
char *end_match_1, *end_match_2;
register char *d, *dend;
register int mcnt;
char *translate = pbufp->translate;
/*
* Failure point stack. Each place that can handle a failure further
* down the line pushes a failure point on this stack. It consists
* of two char *'s. The first one pushed is where to resume scanning
* the pattern; the second pushed is where to resume scanning the
* strings. If the latter is zero, the failure point is a "dummy". If
* a failure happens and the innermost failure point is dormant, it
* discards that failure point and tries the next one.
*/
char **stackb = (char **) alloca(2 * NFAILURES * sizeof(char *));
char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
/*
* Information on the "contents" of registers. These are pointers
* into the input strings; they record just what was matched (on this
* attempt) by some part of the pattern. The start_memory command
* stores the start of a register's contents and the stop_memory
* command stores the end.
*
* At that point, regstart[regnum] points to the first character in the
* register, regend[regnum] points to the first character beyond the
* end of the register, and regstart_segend[regnum] is either the
* same as regend[regnum] or else points to the end of the input
* string into which regstart[regnum] points. The latter case happens
* when regstart[regnum] is in string1 and regend[regnum] is in
* string2.
*/
char *regstart[RE_NREGS];
char *regstart_segend[RE_NREGS];
char *regend[RE_NREGS];
/*
* Set up pointers to ends of strings. Don't allow the second string
* to be empty unless both are empty.
*/
if (!size2) {
string2 = string1;
size2 = size1;
string1 = 0;
size1 = 0;
}
end1 = string1 + size1;
end2 = string2 + size2;
/* Compute where to stop matching, within the two strings */
if (mstop <= size1) {
end_match_1 = string1 + mstop;
end_match_2 = string2;
} else {
end_match_1 = end1;
end_match_2 = string2 + mstop - size1;
}
/*
* Initialize \( and \) text positions to -1 to mark ones that no \(
* or \) has been seen for.
*/
for (mcnt = 0; mcnt < sizeof(regstart) / sizeof(*regstart); mcnt++)
regstart[mcnt] = (char *) -1;
/*
* `p' scans through the pattern as `d' scans through the data.
* `dend' is the end of the input string that `d' points within. `d'
* is advanced into the following input string whenever necessary,
* but this happens before fetching; therefore, at the beginning of
* the loop, `d' can be pointing at the end of a string, but it
* cannot equal string2.
*/
if (pos <= size1)
d = string1 + pos, dend = end_match_1;
else
d = string2 + pos - size1, dend = end_match_2;
/* Write PREFETCH; just before fetching a character with *d. */
#define PREFETCH \
while (d == dend) \
{ if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
d = string2; /* end of string1 => advance to string2. */ \
dend = end_match_2; }
/*
* This loop loops over pattern commands. It exits by returning from
* the function if match is complete, or it drops through if match
* fails at this starting point in the input data.
*/
while (1) {
if (p == pend)
/* End of pattern means we have succeeded! */
{
/*
* If caller wants register contents data back,
* convert it to indices
*/
if (regs) {
regend[0] = d;
regstart[0] = string1;
for (mcnt = 0; mcnt < RE_NREGS; mcnt++) {
if ((mcnt != 0) && regstart[mcnt] == (char *) -1) {
regs->start[mcnt] = -1;
regs->end[mcnt] = -1;
continue;
}
if (regstart[mcnt] - string1 < 0 ||
regstart[mcnt] - string1 > size1)
regs->start[mcnt] = regstart[mcnt] - string2 + size1;
else
regs->start[mcnt] = regstart[mcnt] - string1;
if (regend[mcnt] - string1 < 0 ||
regend[mcnt] - string1 > size1)
regs->end[mcnt] = regend[mcnt] - string2 + size1;
else
regs->end[mcnt] = regend[mcnt] - string1;
}
regs->start[0] = pos;
}
if (d - string1 >= 0 && d - string1 <= size1)
return d - string1 - pos;
else
return d - string2 + size1 - pos;
}
/* Otherwise match next pattern command */
switch ((regexpcode) * p++) {
/*
* \( is represented by a start_memory, \) by a
* stop_memory. Both of those commands contain a
* "register number" argument. The text matched
* within the \( and \) is recorded under that
* number. Then, \<digit> turns into a `duplicate'
* command which is followed by the numeric value of
* <digit> as the register number.
*/
case start_memory:
regstart[*p] = d;
regstart_segend[*p++] = dend;
break;
case stop_memory:
regend[*p] = d;
if (regstart_segend[*p] == dend)
regstart_segend[*p] = d;
p++;
break;
case duplicate:
{
int regno = *p++; /* Get which register to
* match against */
register char *d2, *dend2;
d2 = regstart[regno];
dend2 = regstart_segend[regno];
while (1) {
/*
* Advance to next segment in
* register contents, if necessary
*/
while (d2 == dend2) {
if (dend2 == end_match_2)
break;
if (dend2 == regend[regno])
break;
d2 = string2, dend2 = regend[regno]; /* end of string1 =>
* advance to string2. */
}
/*
* At end of register contents =>
* success
*/
if (d2 == dend2)
break;
/*
* Advance to next segment in data
* being matched, if necessary
*/
PREFETCH;
/*
* mcnt gets # consecutive chars to
* compare
*/
mcnt = dend - d;
if (mcnt > dend2 - d2)
mcnt = dend2 - d2;
/*
* Compare that many; failure if
* mismatch, else skip them.
*/
if (translate ? bcmp_translate(d, d2, mcnt, translate) : bcmp(d, d2, mcnt))
goto fail;
d += mcnt, d2 += mcnt;
}
}
break;
case anychar:
/* fetch a data character */
PREFETCH;
/* Match anything but a newline. */
if ((translate ? translate[*d++] : *d++) == '\n')
goto fail;
break;
case charset:
case charset_not:
{
/* Nonzero for charset_not */
int not = 0;
register int c;
if (*(p - 1) == (char) charset_not)
not = 1;
/* fetch a data character */
PREFETCH;
if (translate)
c = translate[*(unsigned char *) d];
else
c = *(unsigned char *) d;
if (c < *p * BYTEWIDTH
&& p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
p += 1 + *p;
if (!not)
goto fail;
d++;
break;
}
case begline:
if (d == string1 || d[-1] == '\n')
break;
goto fail;
case endline:
if (d == end2
|| (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
break;
goto fail;
/*
* "or" constructs ("|") are handled by starting each
* alternative with an on_failure_jump that points to
* the start of the next alternative. Each
* alternative except the last ends with a jump to
* the joining point. (Actually, each jump except for
* the last one really jumps to the following jump,
* because tensioning the jumps is a hassle.)
*/
/*
* The start of a stupid repeat has an
* on_failure_jump that points past the end of the
* repeat text. This makes a failure point so that,
* on failure to match a repetition, matching
* restarts past as many repetitions have been found
* with no way to fail and look for another one.
*/
/*
* A smart repeat is similar but loops back to the
* on_failure_jump so that each repetition makes
* another failure point.
*/
case on_failure_jump:
if (stackp == stacke) {
char **stackx = (char **) alloca(2 * (stacke - stackb) * sizeof(char *));
bcopy((char *) stackb, (char *) stackx, (stacke - stackb) * sizeof(char *));
stackp += stackx - stackb;
stacke = stackx + 2 * (stacke - stackb);
stackb = stackx;
}
mcnt = *p++ & 0377;
mcnt += SIGN_EXTEND_CHAR(*p++) << 8;
*stackp++ = mcnt + p;
*stackp++ = d;
break;
/*
* The end of a smart repeat has an
* maybe_finalize_jump back. Change it either to a
* finalize_jump or an ordinary jump.
*/
case maybe_finalize_jump:
mcnt = *p++ & 0377;
mcnt += SIGN_EXTEND_CHAR(*p++) << 8;
/*
* Compare what follows with the begining of the
* repeat. If we can establish that there is nothing
* that they would both match, we can change to
* finalize_jump
*/
if (p == pend)
p[-3] = (char) finalize_jump;
else if (*p == (char) exactn || *p == (char) endline) {
register int c = *p == (char) endline ? '\n' : p[2];
register char *p1 = p + mcnt;
/*
* p1[0] ... p1[2] are an on_failure_jump.
* Examine what follows that
*/
if (p1[3] == (char) exactn && p1[5] != c)
p[-3] = (char) finalize_jump;
else if (p1[3] == (char) charset || p1[3] == (char) charset_not) {
int not = p1[3] == (char) charset_not;
if (c < p1[4] * BYTEWIDTH
&& p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
not = !not;
/* not is 1 if c would match */
/*
* That means it is not safe to
* finalize
*/
if (!not)
p[-3] = (char) finalize_jump;
}
}
p -= 2;
if (p[-1] != (char) finalize_jump) {
p[-1] = (char) jump;
goto nofinalize;
}
/*
* The end of a stupid repeat has a finalize-jump
* back to the start, where another failure point
* will be made which will point after all the
* repetitions found so far.
*/
case finalize_jump:
stackp -= 2;
case jump:
nofinalize:
mcnt = *p++ & 0377;
mcnt += SIGN_EXTEND_CHAR(*p++) << 8;
p += mcnt;
break;
case dummy_failure_jump:
if (stackp == stacke) {
char **stackx = (char **) alloca(2 * (stacke - stackb) * sizeof(char *));
bcopy((char *) stackb, (char *) stackx, (stacke - stackb) * sizeof(char *));
stackp += stackx - stackb;
stacke = stackx + 2 * (stacke - stackb);
stackb = stackx;
}
*stackp++ = 0;
*stackp++ = 0;
goto nofinalize;
case wordbound:
if (d == string1 /* Points to first char */
|| d == end2 /* Points to end */
|| (d == end1 && size2 == 0)) /* Points to end */
break;
if ((SYNTAX(((unsigned char *) d)[-1]) == Sword)
!= (SYNTAX(d == end1 ? *(unsigned char *) string2 : *(unsigned char *) d) == Sword))
break;
goto fail;
case notwordbound:
if (d == string1 /* Points to first char */
|| d == end2 /* Points to end */
|| (d == end1 && size2 == 0)) /* Points to end */
goto fail;
if ((SYNTAX(((unsigned char *) d)[-1]) == Sword)
!= (SYNTAX(d == end1 ? *(unsigned char *) string2 : *(unsigned char *) d) == Sword))
goto fail;
break;
case wordbeg:
if (d == end2 /* Points to end */
|| (d == end1 && size2 == 0) /* Points to end */
||SYNTAX(*(unsigned char *) (d == end1 ? string2 : d)) != Sword) /* Next char not a
* letter */
goto fail;
if (d == string1 /* Points to first char */
|| SYNTAX(((unsigned char *) d)[-1]) != Sword) /* prev char not letter */
break;
goto fail;
case wordend:
if (d == string1 /* Points to first char */
|| SYNTAX(((unsigned char *) d)[-1]) != Sword) /* prev char not letter */
goto fail;
if (d == end2 /* Points to end */
|| (d == end1 && size2 == 0) /* Points to end */
||SYNTAX(d == end1 ? *(unsigned char *) string2 : *(unsigned char *) d) != Sword) /* Next char not a
* letter */
break;
goto fail;
#ifdef emacs
case before_dot:
if (((d - string2 <= (unsigned) size2)
? d - (char *) bf_p2 : d - (char *) bf_p1)
<= point)
goto fail;
break;
case at_dot:
if (((d - string2 <= (unsigned) size2)
? d - (char *) bf_p2 : d - (char *) bf_p1)
== point)
goto fail;
break;
case after_dot:
if (((d - string2 <= (unsigned) size2)
? d - (char *) bf_p2 : d - (char *) bf_p1)
>= point)
goto fail;
break;
case wordchar:
mcnt = (int) Sword;
goto matchsyntax;
case syntaxspec:
mcnt = *p++;
matchsyntax:
PREFETCH;
if (SYNTAX(*(unsigned char *) d++) != (enum syntaxcode) mcnt)
goto fail;
break;
case notwordchar:
mcnt = (int) Sword;
goto matchnotsyntax;
case notsyntaxspec:
mcnt = *p++;
matchnotsyntax:
PREFETCH;
if (SYNTAX(*(unsigned char *) d++) == (enum syntaxcode) mcnt)
goto fail;
break;
#else
case wordchar:
PREFETCH;
if (SYNTAX(*(unsigned char *) d++) == 0)
goto fail;
break;
case notwordchar:
PREFETCH;
if (SYNTAX(*(unsigned char *) d++) != 0)
goto fail;
break;
#endif /* not emacs */
case begbuf:
if (d == string1) /* Note, d cannot equal
* string2 */
break; /* unless string1 == string2. */
goto fail;
case endbuf:
if (d == end2 || (d == end1 && size2 == 0))
break;
goto fail;
case exactn:
/*
* Match the next few pattern characters exactly.
* mcnt is how many characters to match.
*/
mcnt = *p++;
if (translate) {
do {
PREFETCH;
if (translate[*(unsigned char *) d++] != *p++)
goto fail;
}
while (--mcnt);
} else {
do {
PREFETCH;
if (*d++ != *p++)
goto fail;
}
while (--mcnt);
}
break;
}
continue; /* Successfully matched one pattern command;
* keep matching */
/* Jump here if any matching operation fails. */
fail:
if (stackp != stackb)
/*
* A restart point is known. Restart there and pop
* it.
*/
{
if (!stackp[-2]) { /* If innermost failure point
* is dormant, flush it and
* keep looking */
stackp -= 2;
goto fail;
}
d = *--stackp;
p = *--stackp;
if (d >= string1 && d <= end1)
dend = end_match_1;
} else
break; /* Matching at this starting point really
* fails! */
}
return -1; /* Failure to match */
}
static int
bcmp_translate(s1, s2, len, translate)
char *s1, *s2;
register int len;
char *translate;
{
register char *p1 = s1, *p2 = s2;
while (len) {
if (translate[*p1++] != translate[*p2++])
return 1;
len--;
}
return 0;
}
/* Entry points compatible with bsd4.2 regex library */
#ifndef emacs
static struct re_pattern_buffer re_comp_buf;
char *
re_comp(s)
char *s;
{
if (!s) {
if (!re_comp_buf.buffer)
return "No previous regular expression";
return 0;
}
if (!re_comp_buf.buffer) {
if (!(re_comp_buf.buffer = (char *) malloc(200)))
return "Memory exhausted";
re_comp_buf.allocated = 200;
if (!(re_comp_buf.fastmap = (char *) malloc(1 << BYTEWIDTH)))
return "Memory exhausted";
}
return re_compile_pattern(s, strlen(s), &re_comp_buf);
}
int
re_exec(s)
char *s;
{
int len = strlen(s);
return 0 <= re_search(&re_comp_buf, s, len, 0, len, 0);
}
#endif /* emacs */
#ifdef test
#include <stdio.h>
/* Indexed by a character, gives the upper case equivalent of the character */
static char upcase[0400] =
{000, 001, 002, 003, 004, 005, 006, 007,
010, 011, 012, 013, 014, 015, 016, 017,
020, 021, 022, 023, 024, 025, 026, 027,
030, 031, 032, 033, 034, 035, 036, 037,
040, 041, 042, 043, 044, 045, 046, 047,
050, 051, 052, 053, 054, 055, 056, 057,
060, 061, 062, 063, 064, 065, 066, 067,
070, 071, 072, 073, 074, 075, 076, 077,
0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
};
main()
{
char pat[80];
struct re_pattern_buffer buf;
struct re_registers regs;
int i;
char c;
char fastmap[(1 << BYTEWIDTH)];
buf.allocated = 40;
buf.buffer = (char *) malloc(buf.allocated);
buf.fastmap = fastmap;
buf.translate = upcase;
while (1) {
printf("Enter pattern\n");
gets(pat);
if (*pat) {
re_compile_pattern(pat, strlen(pat), &buf);
for (i = 0; i < buf.used; i++)
printchar(buf.buffer[i]);
putchar('\n');
printf("%d allocated, %d used.\n", buf.allocated, buf.used);
re_compile_fastmap(&buf);
printf("Allowed by fastmap: ");
for (i = 0; i < (1 << BYTEWIDTH); i++)
if (fastmap[i])
printchar(i);
putchar('\n');
}
printf("enter string to search\n");
gets(pat); /* Now read the string to match against */
/* i = re_match (&buf, pat, strlen (pat), 0, 0); */
i = re_search(&buf, pat, strlen(pat), 0, strlen(pat), ®s);
printf("Match value %d.\n", i);
for (i = 0; i < RE_NREGS; i++)
printf("%2d start %2d end %2d\n", i, regs.start[i], regs.end[i]);
}
}
#ifdef NOTDEF
print_buf(bufp)
struct re_pattern_buffer *bufp;
{
int i;
printf("buf is :\n----------------\n");
for (i = 0; i < bufp->used; i++)
printchar(bufp->buffer[i]);
printf("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
printf("Allowed by fastmap: ");
for (i = 0; i < (1 << BYTEWIDTH); i++)
if (bufp->fastmap[i])
printchar(i);
printf("\nAllowed by translate: ");
if (bufp->translate)
for (i = 0; i < (1 << BYTEWIDTH); i++)
if (bufp->translate[i])
printchar(i);
printf("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
printf("can %s be null\n----------", bufp->can_be_null ? "" : "not");
}
#endif
printchar(c)
char c;
{
if (c < 041 || c >= 0177) {
putchar('\\');
putchar(((c >> 6) & 3) + '0');
putchar(((c >> 3) & 7) + '0');
putchar((c & 7) + '0');
} else
putchar(c);
}
error(string)
char *string;
{
puts(string);
exit(1);
}
#endif /* test */
#else
#include "nullfile.h"
#endif