home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Dream 45
/
Amiga_Dream_45.iso
/
Amiga
/
Magazine
/
Dossier-LaTeX
/
lgrind-amiga.lha
/
lgrind
/
src
/
regexp.c
< prev
next >
Wrap
C/C++ Source or Header
|
1996-08-04
|
15KB
|
624 lines
#ifndef lint
static char *sccsid="@(#)regexp.c 1.2 (LBL) 4/12/85";
static char Version[] =
"$Id: regexp.c,v 1.2 91/10/01 00:30:20 gvr Exp $";
#endif
/*
* Regular expression matching routines for lgrind/tgrind/tfontedpr.
*
* These routines were written by Dave Presotto (I think) for vgrind.
* Minor mods & attempts to improve performance by Van Jacobson
* (van@@lbl-rtsg) and Chris Torek (chris@@maryland).
*
* Modifications.
* --------------
* Sep 91 George V Reilly Fixed up some bogus uses of @NIL@ and @NULL@.
* 30 Mar 85 Van & Chris Changed @expmatch()@ to return pointer to
* start of what was matched in addition to
* pointer to match end. Several changes to
* improve performance (too numerous to mention).
* 11 Dec 84 Dave Presotto Written.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include "lgrind.h"
#include "regexp.h"
int makelower(int c) {
switch(c) {
case 0xc5: return 0xe5;
case 0xc4: return 0xe4;
case 0xf6: return 0xd6;
default: return (isupper((c)) ? tolower((c)) : (c));
}
}
int (*re_strncmp)(const char *s1, const char *s2, size_t len);
/* function used by @expmatch()@ to compare
* strings. The caller should make it point to
* @strncmp()@ if case is significant &
* @lc_strncmp()@ otherwise.
*/
/* @lc_strncmp()@ --- like @strncmp()@ except that we convert the
* first string to lower case before comparing.
*/
int
lc_strncmp(const char *s1, const char *s2, size_t len)
/* register unsigned char *s1,*s2;
register int len;*/
{
while (len-- > 0)
if (*s2 - makelower(*s1))
return 1;
else
s2++, s1++;
return 0;
}
/* The following routine converts an irregular expression to
* internal format.
*
* Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings
* or operations (alternation or parenthesizing) can be specified.
* Each starts with a descriptor byte. The descriptor byte
* has @STR@ set for strings, @META@ set for meta symbols,
* and @OPER@ set for operations. The descriptor byte can also have
* the @OPT@ bit set if the object defined is optional. Also @ALT@
* can be set to indicate an alternation.
*
* For metasymbols, the byte following the descriptor byte identities
* the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@,
* or @'('@). For strings, the byte after the descriptor is a
* character count for the string:
*@
* meta symbols := descriptor
* symbol
*
* strings := descriptor
* character count
* the string
*
* operations := descriptor
* symbol
* character count@
*/
/*
* handy macros for accessing parts of match blocks
*/
#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
#define MNEXT(A) (A+2) /* character following a metasymbol block */
#define OSYM(A) (*(A+1)) /* symbol in an operation block */
#define OCNT(A) (*(A+2)) /* character count */
#define ONEXT(A) (A+3) /* next character after the operation */
#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
#define SCNT(A) (*(A+1)) /* byte count of a string */
#define SSTR(A) (A+2) /* address of the string */
#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
/*
* bit flags in the descriptor
*/
#define OPT 1
#define STR 2
#define META 4
#define ALT 8
#define OPER 16
unsigned char *ure; /* pointer current position in unconverted exp */
unsigned char *ccre; /* pointer to current position in converted exp */
/*
* Convert an irregular expression to the internal form
*/
unsigned char *
convexp(re)
unsigned char *re; /* unconverted irregular expression */
{
register unsigned char *cre; /* pointer to converted regular expression */
/* allocate room for the converted expression */
if (re == NULL)
return NULL;
if (*re == '\0')
return NULL;
cre = malloc(4 * strlen(re) + 3);
ccre = cre;
ure = re;
/* start the conversion with a %|\a|% */
*cre = META | OPT;
MSYM(cre) = 'a';
ccre = MNEXT(cre);
/* start the conversion (it's recursive) */
expconv();
*ccre = 0;
return cre;
}
/*
* Actually do the conversion
*/
void
expconv(void)
{
register unsigned char *cs; /* pointer to current symbol in converted exp */
register unsigned char c; /* character being processed */
register unsigned char *acs; /* pointer to last alternate */
register int temp;
/* let the conversion begin */
acs = NULL;
cs = NULL;
while (*ure != '\0') {
switch (c = *ure++) {
case '\\':
switch (c = *ure++) {
/* escaped characters are just characters */
default:
if (cs == NULL || (*cs & STR) == 0) {
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre += 2;
} else
SCNT(cs)++;
*ccre++ = c;
break;
/* normal(?) metacharacters */
case 'a':
case 'd':
case 'e':
case 'p':
if (acs != NULL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NULL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
}
break;
/* just put the symbol in */
case '^':
case '$':
if (acs != NULL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NULL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
/* mark the last match sequence as optional */
case '?':
if (cs)
*cs = *cs | OPT;
break;
/* recurse and define a subexpression */
case '(':
if (acs != NULL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NULL;
}
cs = ccre;
*cs = OPER;
OSYM(cs) = '(';
ccre = ONEXT(cs);
expconv();
OCNT(cs) = ccre - cs; /* offset to next symbol */
break;
/* return from a recursion */
case ')':
if (acs != NULL) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NULL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
return;
/* Mark the last match sequence as having an alternate. */
/* The third byte will contain an offset to jump over the */
/* alternate match in case the first did not fail */
case '|':
if (acs != NULL && acs != cs)
OCNT(ccre) = ccre - acs; /* make a back pointer */
else
OCNT(ccre) = 0;
*cs |= ALT;
cs = ccre;
*cs = OPER;
OSYM(cs) = '|';
ccre = ONEXT(cs);
acs = cs; /* remember that the pointer is to be filled */
break;
/* if it's not a metasymbol, just build a character string */
default:
if (cs == NULL || (*cs & STR) == 0) {
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre = SSTR(cs);
} else
SCNT(cs)++;
*ccre++ = c;
break;
}
}
if (acs != NULL) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NULL;
}
return;
} /* end of @expconv()@ */
/*
* The following routine recognises an irregular expression
* with the following special characters:
*
* %|\?|% means last match was optional
* %|\a|% matches any number of characters
* %|\d|% matches any number of spaces and tabs
* %|\p|% matches any number of alphanumeric characters.
* The characters matched will be copied into
* the area pointed to by @mstring@.
* %|\||% alternation
* %|\( \)|% grouping used mostly for alternation and
* optionality
*
* The irregular expression must be translated to internal form
* prior to calling this routine
*
* The value returned is the pointer to the last character matched.
* If @strtptr@ is non-@NULL@, a pointer to the start of the string matched
* (excluding %|\a|% matches) will be returned at @*strtptr@.
* If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied
* into @mstring@.
*/
boolean _escaped; /* true if we are currently @_escaped@ */
unsigned char *_sstart; /* start of string. Set in @putScp()@. */
unsigned char *
expmatch(register unsigned char *s, register unsigned char *re, unsigned char **strtptr, unsigned char *mstring)
/* s - string to check for a match in */
/* re - a converted irregular expression */
/* strtptr - where to put ptr to start of match */
/* mstring - where to put whatever matches a %|\p|% */
{
register unsigned char *cs; /* the current symbol */
register unsigned char *ptr, *s1; /* temporary pointer */
register unsigned char c; /* temporary */
boolean matched; /* a temporary @boolean@ */
/* initial conditions */
if (strtptr)
*strtptr = '\0';
if (re == NULL)
return NULL;
cs = re;
matched = FALSE;
/* loop till expression string is exhausted (or at least pretty tired) */
while (*cs) {
switch (*cs & (OPER | STR | META)) {
/* try to match a string */
case STR:
matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
if (matched) {
/* hoorah: it matches */
s += SCNT(cs);
cs = SNEXT(cs);
} else if (*cs & ALT) {
/* alternation: skip to next expression */
cs = SNEXT(cs);
} else if (*cs & OPT) {
/* the match is optional */
cs = SNEXT(cs);
matched = 1; /* indicate a successful match */
} else {
/* no match: error return */
return NULL;
}
break;
/* an operator: do something fancy */
case OPER:
switch (OSYM(cs)) {
/* this is an alternation */
case '|':
if (matched)
/* last thing in the alternation was a match: skip ahead */
cs = OPTR(cs);
else
/* no match: keep trying */
cs = ONEXT(cs);
break;
/* this is a grouping: recurse */
case '(':
ptr = expmatch(s, ONEXT(cs), strtptr, mstring);
if (ptr != NULL) {
/* the subexpression matched */
matched = 1;
s = ptr;
} else if (*cs & ALT) {
/* alternation: skip to next expression */
matched = 0;
} else if (*cs & OPT) {
/* the match is optional */
matched = 1; /* indicate a successful match */
} else {
/* no match: error return */
return NULL;
}
cs = OPTR(cs);
break;
}
break;
/* try to match a metasymbol */
case META:
switch (MSYM(cs)) {
/* try to match anything and remember what was matched */
case 'p':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
if (ptr != NULL && s1 != s) {
/* we have a match: remember the match */
if (mstring) {
strncpy(mstring, s, s1 - s);
mstring[s1 - s] = '\0';
}
return ptr;
} else if (ptr != NULL && (*cs & OPT)) {
/* %|\p|% was optional, so no match is ok */
return ptr;
} else if (ptr != NULL) {
/* not optional and we still matched */
return NULL;
}
if (!(isalnum(*s1)) && !strchr(l_id, *s1))
return NULL;
if (*s1 == '\\')
_escaped = _escaped ? FALSE : TRUE;
else
_escaped = FALSE;
} while (*s1++);
return NULL;
/* try to match anything */
case 'a':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
/*
* Hack for an important special case: if the next thing
* in the pattern is a string, just gobble characters until
* we find something that matches that string (this saves
* the cost of a recursive call on @expmatch()@ while scanning
* for the start of comments or strings). Since many
* patterns end with a string, we also treat that as a
* special case.
*/
if(*(ptr = MNEXT(cs)) == STR) {
c = *SSTR(ptr);
while(*s1 && *s1 != c)
s1++;
if (*s1 == 0)
return NULL;
if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
/* next item is a string, it's the last item and
* the %|\a|% match is ok --- just loop to try & match
* the string.
*/
if (strtptr)
*strtptr = s1;
cs = ptr;
s = s1;
break;
}
}
ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
if (ptr != NULL && (s1 != s || *cs & OPT)) {
/* we have a match */
if (strtptr)
*strtptr = s1;
return ptr;
} else if (ptr != NULL) {
/* not optional and we still matched */
return NULL;
}
if (*s1 == '\\')
_escaped = _escaped ? FALSE : TRUE;
else
_escaped = FALSE;
} while (*s1++);
return NULL;
/* fail if we are currently @_escaped@ */
case 'e':
if (_escaped)
return NULL;
cs = MNEXT(cs);
break;
/* match any number of tabs and spaces */
case 'd':
ptr = s;
while (*s == ' ' || *s == '\t')
s++;
if (s != ptr || s == _sstart) {
/* match: be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*s == '\n' || *s == '\0') {
/* match: be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match: error return */
return NULL;
break;
/* check for end of line */
case '$':
if (*s == '\0' || *s == '\n') {
/* match: be happy */
s++;
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match: error return */
return NULL;
break;
/* check for start of line */
case '^':
if (s == _sstart) {
/* match: be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match: error return */
return NULL;
break;
/* end of a subexpression: return success */
case ')':
return s;
}
break;
}
}
return s;
}