home *** CD-ROM | disk | FTP | other *** search
- /******************************************************************************
- * Module : Regular Expression Compiling.
- *
- * Author : John W. M. Stevens.
- *
- * Notes : Use UNIX style regular expressions. Grammar is:
- *
- * REG_EXPR ::= ANCHOR
- * | ANCHOR '|' REG_EXPR
- *
- * ANCHOR ::= CATENATION
- * | '^' CATENATION
- * | CATENATION '$'
- * | '^' CATENATION '$'
- *
- * CATENATION ::= PHRASE
- * | PHRASE CATENATION
- *
- * PHRASE ::= UNARY
- * | UNARY ENUM_OP
- *
- * ENUM_OP ::= '*'
- * | '+'
- * | '?'
- * | '{' SPAN_RNG '}'
- *
- * SPAN_RNG ::= NUMBER
- * | NUMBER ',' NUMBER
- *
- * UNARY ::= '(' REG_EXPR ')'
- * | CHAR_STR
- * | '[' SET ']'
- * | '[' '^' SET ']'
- * | '.'
- *
- * SET ::= CHAR_STR
- * | CHAR_STR SET
- * | CHAR '-' CHAR
- * | CHAR '-' CHAR SET
- *
- * CHAR_STR ::= CHARACTER
- * | CHARACTER CHARACTER_STR
- *
- * CHARACTER ::= ' ' - '~' except for []()|{}+*? which are special without
- * being escaped.
- ******************************************************************************/
-
- #include "compiler.h"
-
- #include "unpost.h"
- #include "sets.h"
- #include "regexp.h"
- #include "utils.h"
-
- /* Character Sets. */
- static int ReInitFlag = 0;
- static char *SpecStr = "[().|$";
- static SET SpecSet;
- static char *PostFixStr = "{*?+";
- static SET PostSet;
- static char *DecStr = "0-9";
- static SET DecSet;
-
- static UINT SubExprNo;
-
- static REG_EXP_NODE *Alternation(char **);
-
- #if defined( RE_TEST )
-
- /*-----------------------------------------------------------------------------
- | Routine : PrtReExpr() --- Print a regular expression for debugging
- | purposes. (This may be used in the configuration helper
- | in a later release, so make it pretty).
- |
- | Inputs : Node - Pointer to current level regular expression node.
- | Level - Current level in tree.
- -----------------------------------------------------------------------------*/
-
- static
- void PrtReExpr(REG_EXP_NODE *Node,
- int Level)
- {
- register int i;
- register int j;
-
- /* Traverse right to left so that the tree can be printed
- * on a screen or page rotated 90 degrees to the left.
- *
- * (Right to left == top to bottom of screen or page)
- */
- if ( Node->Right )
- PrtReExpr(Node->Right, Level + 1);
-
- /* Print indentation. */
- for (i = 0; i < Level; i++)
- printf(" ");
-
- /* Print node type and information. */
- switch ( Node->NodeType )
- {
- case OP_L_PAREN:
- printf("(\n");
- break;
- case DATA_LEFT_ANCHOR:
- printf("^\n");
- break;
- case DATA_RIGHT_ANCHOR:
- printf("$\n");
- break;
- case OP_ENUM:
- printf("Enum {%d, %u}\n",
- Node->MinSpan,
- Node->MaxSpan);
- break;
- case OP_OR:
- printf("|\n");
- break;
- case OP_AND:
- printf("&\n");
- break;
- case DATA_ANY:
- printf(".\n");
- break;
- case DATA_SPAN:
- printf("Span {%d, %u}\n",
- Node->MinSpan,
- Node->MaxSpan);
- break;
- case DATA_STRING:
- printf("'%s'\n",
- Node->data.MatchStr,
- Node->SubExprNo);
- break;
- case DATA_SET:
- printf("[");
- for (j = 0; j < 128; j++)
- if ( InSet(Node->data.CSet, (char) j) )
- {
- if (j < 32 || j > 127)
- printf("\\x%02x");
- else
- putchar( (char) j );
- }
- printf("] %d\n", Node->SubExprNo);
- break;
- case NODE_TYPE_NOT_SET:
- default:
- printf(">>>> ERROR !, no node type set.\n");
- break;
- }
-
- /* Now print left child. */
- if ( Node->Left )
- PrtReExpr(Node->Left, Level + 1);
- }
-
- #endif
-
- /*-----------------------------------------------------------------------------
- | Routine : AllocRegExpNode() --- Allocate a regular expression node.
- |
- | Returns : Returns a pointer to the newly allocated regular expression
- | node.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *AllocRegExpNode(void)
- {
- auto REG_EXP_NODE *New;
- extern FILE *ErrFile;
-
- /* Allocate the node. */
- if ((New = (REG_EXP_NODE *) calloc(1, sizeof( REG_EXP_NODE ))) == NULL)
- {
- fprintf(ErrFile,
- "%s %d : Out of memory.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- return( New );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : GetInt() --- Get an integer number for the case of
- | regular expression enumeration.
- |
- | Inputs : Str - Pointer to string to get number from.
- | Outputs : Str - Pointer to first character in string after number.
- |
- | Returns : Returns the integer read from the string.
- -----------------------------------------------------------------------------*/
-
- static
- int GetInt(char **Str)
- {
- auto int i;
-
- /* Strip white space. */
- while (**Str == ' ' || **Str == '\t' || **Str == '\n')
- (*Str)++;
-
- /* Get number. */
- i = 0;
- while ( InSet(DecSet, **Str) )
- i = i * 10 + (*(*Str)++ - '0');
-
- /* Strip trailing white space. */
- while (**Str == ' ' || **Str == '\t' || **Str == '\n')
- (*Str)++;
-
- /* Return number. */
- return( i );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : EnumOp() --- Get the enumeration modifer.
- | node.
- |
- | Inputs : Str - Pointer to enumeration operator.
- | Root - Pointer to regular expression node.
- | Outputs : Str - Pointer after enumeration operator.
- |
- | Note : A value of zero for maximum span value indicates ANY
- | number.
- -----------------------------------------------------------------------------*/
-
- static
- void EnumOp(char **Str,
- REG_EXP_NODE *Root)
- {
- extern FILE *ErrFile;
-
- /* Set up the operator and enumerator values. */
- switch ( *(*Str)++ )
- {
- case '+':
- Root->MinSpan = 1;
- Root->MaxSpan = ~0;
- break;
- case '*':
- Root->MinSpan = 0;
- Root->MaxSpan = ~0;
- break;
- case '?':
- Root->MinSpan = 0;
- Root->MaxSpan = 1;
- break;
- case '{':
- /* Get specifically enumerated span. */
- Root->MinSpan = GetInt( Str );
- Root->MaxSpan = Root->MinSpan;
-
- /* Is there a maximum number of characters? */
- if (**Str == ',')
- {
- /* Get maximum span. */
- (*Str)++;
-
- /* Strip white space. */
- while (**Str == ' ' || **Str == '\t' || **Str == '\n')
- (*Str)++;
-
- /* Set to maximum, or get maximum. */
- if (**Str == '}')
- Root->MaxSpan = ~0;
- else if ( InSet(DecSet, **Str) )
- Root->MaxSpan = GetInt( Str );
- }
-
- /* Check for end brace. */
- if (**Str != '}')
- {
- fprintf(ErrFile,
- "%s %d : Error - missing '}' in regular expression.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- (*Str)++;
- break;
- }
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : Unary() --- Unary Operations.
- |
- | Inputs : Str - Pointer to current character in source RE string.
- |
- | Returns : Pointer to regular expression node.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *Unary(char **Str)
- {
- auto REG_EXP_NODE *Node;
- auto char Buffer[256];
- auto char *Tmp;
- extern FILE *ErrFile;
-
- /* Get the regular expression atoms. */
- switch ( **Str )
- {
- case '.':
- /* Allocate a regular expression node. */
- (*Str)++;
- Node = AllocRegExpNode();
- Node->NodeType = DATA_ANY;
- break;
- case '[':
- /* Allocate a regular expression node. */
- Node = AllocRegExpNode();
-
- /* Allocate a set. */
- if ((Node->data.CSet = (SET_TYPE *) calloc(1, SET_SIZE *
- sizeof( SET_TYPE ))) == NULL)
- {
- fprintf(ErrFile,
- "%s %d : Out of memory.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
-
- /* Create the set. */
- (*Str)++;
- CrtSet(Str, Node->data.CSet);
- Node->NodeType = DATA_SET;
- break;
- case '(':
- /* Skip parentheses. */
- (*Str)++;
-
- /* Allocate a regular expression node. */
- Node = AllocRegExpNode();
- Node->NodeType = OP_L_PAREN;
-
- /* Save the sub expression number. */
- if (SubExprNo > MAX_SUB_EXPRS)
- {
- fprintf(ErrFile,
- "%s %d : Error - To many sub-expressions.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- Node->SubExprNo = SubExprNo++;
-
- /* Get sub expression. */
- Node->Right = Alternation( Str );
-
- /* Check for end parentheses. */
- if (**Str != ')')
- {
- fprintf(ErrFile,
- "%s %d : Error - missing ')' in regular expression.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- (*Str)++;
- break;
- default:
- /* Check for badly formed regular expression. */
- if (InSet(SpecSet, **Str) || InSet(PostSet, **Str))
- {
- fprintf(ErrFile,
- "%s %d : Error - badly formed regular expression.\n",
- __FILE__,
- __LINE__);
- fprintf(ErrFile,
- "\tUnexpected character '%c'\n",
- **Str);
- exit( 1 );
- }
-
- /* Allocate a regular expression node. */
- Node = AllocRegExpNode();
-
- /* Get characters while they are not in the set of special
- * characters.
- */
- for (Tmp = Buffer; **Str; )
- if (**Str == '\\' && (*Str)[1])
- {
- *Tmp++ = *++*Str;
- ++*Str;
- }
- else if (InSet(SpecSet, **Str) || InSet(PostSet, **Str))
- break;
- else
- *Tmp++ = *(*Str)++;
- *Tmp = '\0';
- Node->NodeType = DATA_STRING;
-
- /* Duplicate the string and add to the node. */
- if ((Node->data.MatchStr = StrDup( Buffer )) == NULL)
- {
- fprintf(ErrFile,
- "%s %d : Out of memory.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- break;
- }
-
- /* Return a pointer to the new node. */
- return( Node );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : Enumerate() --- Enumerate a regular expression.
- |
- | Inputs : Str - Pointer to current character in source RE string.
- |
- | Returns : Pointer to regular expression node.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *Enumerate(char **Str)
- {
- auto REG_EXP_NODE *Node;
- auto REG_EXP_NODE *New;
- extern FILE *ErrFile;
-
- /* Get the regular expression. */
- Node = Unary( Str );
-
- /* Test for enumeration. */
- if ( InSet(PostSet, **Str) )
- {
- /* Check for the special case of enumerating a '.' */
- if (Node->NodeType == DATA_ANY)
- {
- /* Modify it to be a DATA_SPAN type. */
- Node->NodeType = DATA_SPAN;
- }
- else if (Node->NodeType == OP_L_PAREN)
- {
- fprintf(ErrFile,
- "%s %d : Error, can not enumerate a sub expression.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
- else
- {
- /* Allocate an enumeration node. */
- New = AllocRegExpNode();
- New->Right = Node;
- New->NodeType = OP_ENUM;
- Node = New;
- }
-
- /* Determine enumeration value. */
- EnumOp(Str, Node);
- }
- return( Node );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : Catenation() --- Concatenate regular expressions.
- |
- | Inputs : Str - Pointer to current character in source RE string.
- |
- | Returns : Pointer to regular expression node.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *Catenation(char **Str)
- {
- auto REG_EXP_NODE *Root;
- auto REG_EXP_NODE *SubExpr;
- auto REG_EXP_NODE *Prev;
- auto REG_EXP_NODE *Next;
-
- /* Loop, getting concatenations (and operation) of regular
- * expressions seperated by OR's.
- */
- Prev = Root = NULL;
- for ( ; ; )
- {
- /* Get next sub expression. */
- SubExpr = Enumerate( Str );
-
- /* Determine type of action to take. */
- if (**Str && **Str != '|' && **Str != ')' && **Str != '$')
- {
- /* Create new node. */
- Next = AllocRegExpNode();
- Next->Left = SubExpr;
- Next->NodeType = OP_AND;
-
- /* Link to old node. */
- if ( Prev )
- Prev->Right = Next;
- else
- Root = Next;
- Prev = Next;
- }
- else
- {
- /* Determine final link type. */
- if ( Prev )
- Prev->Right = SubExpr;
- else
- Root = SubExpr;
-
- /* End loop. */
- break;
- }
- }
-
- /* Return a pointer to the root node. */
- return( Root );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : Anchor() --- Parse the two anchoring unary operators.
- |
- | Inputs : Str - Regular expression source string.
- |
- | Returns : Returns a pointer to the compiled regular expression.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *Anchor(char **Str)
- {
- auto REG_EXP_NODE *Root;
- auto REG_EXP_NODE *Node;
-
- /* Check for begining of line anchor. */
- if (**Str == '^')
- {
- /* Next character. */
- (*Str)++;
-
- /* Allocate a node. */
- Root = AllocRegExpNode();
- Root->NodeType = DATA_LEFT_ANCHOR;
-
- /* Get expression. */
- Root->Right = Catenation( Str );
- }
- else
- Root = Catenation( Str );
-
- /* Check for end of line anchor. */
- if (**Str == '$')
- {
- /* Next character. */
- (*Str)++;
-
- /* Allocate a node. */
- Node = AllocRegExpNode();
- Node->NodeType = DATA_RIGHT_ANCHOR;
- Node->Right = Root;
- Root = Node;
- }
-
- /* Return regular expression. */
- return( Root );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : Alternation() --- Parse an alternation expression.
- |
- | Inputs : Str - Pointer to current character in source RE string.
- |
- | Returns : Pointer to regular expression node.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *Alternation(char **Str)
- {
- auto REG_EXP_NODE *Root;
- auto REG_EXP_NODE *New;
-
- /* Get a concatenation of regular expressions. */
- Root = Anchor( Str );
-
- /* Loop, getting concatenations (and operation) of regular
- * expressions seperated by OR's.
- */
- while (**Str == '|')
- {
- /* Next character. */
- (*Str)++;
-
- /* Allocate a node. */
- New = AllocRegExpNode();
- New->Left = Root;
- New->NodeType = OP_OR;
-
- /* Get right hand of expression. */
- New->Right = Anchor( Str );
- Root = New;
- }
-
- /* Return root of regular expression tree. */
- return( Root );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : ReGraph() --- Convert a regular expression tree into a graph.
- |
- | Inputs : Node - Regular expression tree node.
- |
- | Returns : Returns a pointer to the compiled regular expression.
- -----------------------------------------------------------------------------*/
-
- static
- REG_EXP_NODE *ReGraph(REG_EXP_NODE *Node,
- REG_EXP_NODE *Link)
- {
- auto REG_EXP_NODE *RetLink;
- auto REG_EXP_NODE *New;
-
- /* Determine operation type. */
- switch ( Node->NodeType )
- {
- case OP_L_PAREN:
- /* Allocate an end of parentheses node. */
- New = AllocRegExpNode();
- New->NodeType = OP_R_PAREN;
- New->SubExprNo = Node->SubExprNo;
- New->Right = Link;
-
- /* Continue link. */
- Node->Right = ReGraph(Node->Right, New);
- RetLink = Node;
- break;
- case OP_ENUM:
- /* Continue link. */
- Node->Left = Node->Right;
- Node->Right = Link;
- RetLink = Node;
- break;
- case OP_AND:
- /* Traverse right, returning a pointer that can be used to
- * link the tree into a graph.
- */
- RetLink = ReGraph(Node->Right, Link);
-
- /* Go down left and link together. */
- RetLink = ReGraph(Node->Left, RetLink);
-
- /* Free AND node and return the link. */
- free( Node );
- break;
- case OP_OR:
- /* Allocate an end of or node. */
- New = AllocRegExpNode();
- New->NodeType = END_OR;
- New->Right = Link;
-
- /* Process both. */
- Node->Right = ReGraph(Node->Right, New);
- Node->Left = ReGraph(Node->Left, New);
-
- /* Return pointer to OR. */
- RetLink = Node;
- break;
- case DATA_LEFT_ANCHOR:
- Node->Right = ReGraph(Node->Right, Link);
- RetLink = Node;
- break;
- case DATA_RIGHT_ANCHOR:
- /* Allocate an end of parentheses node. */
- New = Node->Right;
- Node->Right = Link;
- RetLink = ReGraph(New, Node);
- break;
- case DATA_ANY:
- case DATA_SPAN:
- case DATA_STRING:
- case DATA_SET:
- Node->Right = Link;
- RetLink = Node;
- }
-
- /* Return a pointer to the tail end of the graph so far. */
- return( RetLink );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : ReCompile() --- Compile a regular expression.
- |
- | Inputs : Str - Regular expression source string.
- |
- | Returns : Returns a pointer to the compiled regular expression.
- -----------------------------------------------------------------------------*/
-
- REG_EXP_NODE *ReCompile(char *Str)
- {
- auto REG_EXP_NODE *Root;
-
- /* Construct sets, if not already constructed. */
- if (ReInitFlag == 0)
- {
- auto char **Str;
-
- /* Create the sets. */
- Str = &SpecStr;
- CrtSet(Str, SpecSet);
- Str = &PostFixStr;
- CrtSet(Str, PostSet);
- Str = &DecStr;
- CrtSet(Str, DecSet);
-
- /* We have initialized, so set flag saying so. */
- ReInitFlag = 1;
- }
-
- /* Check for leading anchor. */
- SubExprNo = 1;
- Root = Alternation( &Str );
-
- /* Print the regular expression tree. */
- #if defined( RE_TEST )
- PrtReExpr(Root, 0);
- #endif
-
- /* Convert to a graph. */
- Root = ReGraph(Root, NULL);
-
- /* Return pointer to regular expression. */
- return( Root );
- }
-
- /*-----------------------------------------------------------------------------
- | Routine : FreeReExpr() --- Free the memory of a regular expression
- | graph memory.
- |
- | Inputs : ReExpr - Regular expression digraph root.
- -----------------------------------------------------------------------------*/
-
- REG_EXP_NODE *FreeReExpr(REG_EXP_NODE *ReExpr)
- {
- auto REG_EXP_NODE *Node;
- auto REG_EXP_NODE *EndOr;
- extern FILE *ErrFile;
-
- /* Free the different node types. */
- while (ReExpr && ReExpr->NodeType != END_OR)
- {
- /* Get pointer to next node. */
- Node = ReExpr->Right;
-
- /* Select operation on node type. */
- switch ( ReExpr->NodeType )
- {
- case OP_ENUM:
- (void) FreeReExpr( ReExpr->Left );
- break;
- case OP_OR:
- /* Free to end of OR branch. */
- (void) FreeReExpr( ReExpr->Right );
-
- /* Free to end of OR branch, and get pointer to next
- * node.
- */
- EndOr = FreeReExpr( ReExpr->Left );
- Node = EndOr->Right;
- free( EndOr );
- break;
- case DATA_STRING:
- free( ReExpr->data.MatchStr );
- break;
- case DATA_SET:
- free( ReExpr->data.CSet );
- break;
- case OP_AND:
- case OP_L_PAREN:
- case OP_R_PAREN:
- case DATA_LEFT_ANCHOR:
- case DATA_RIGHT_ANCHOR:
- case DATA_ANY:
- case DATA_SPAN:
- break;
- default:
- fprintf(ErrFile,
- "%s %d : Error - illegal regular expression node type.\n",
- __FILE__,
- __LINE__);
- exit( 1 );
- }
-
- /* Move along. */
- free( ReExpr );
- ReExpr = Node;
- }
-
- /* Return pointer to end of chain. */
- return( ReExpr );
- }
-