home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Interactive Guide
/
c-cplusplus-interactive-guide.iso
/
c_ref
/
csource3
/
167_01
/
fgrep.c
< prev
next >
Wrap
Text File
|
1985-11-15
|
23KB
|
925 lines
/*
HEADER: CUG
TITLE: FGREP.C - Search File(s) for Fixed Pattern(s);
VERSION: 1.06;
DATE: 09/13/86;
DESCRIPTION: "A full implementation of the UNIX 'fgrep'
utility. The algorithm used in this program
constructs a deterministic finite state automaton
(FSA) for pattern matching from the substrings,
then uses the FSA to process the text string in
one pass. The time taken to construct the FSA is
proportional to the sum of the lengths of the
substrings. The number of state transitions made
by the FSA in processing the text string is
independent of the number of substrings.";
KEYWORDS: fgrep, grep, filter, UNIX, pattern-matching;
SYSTEM: ANY;
FILENAME: FGREP.C;
WARNINGS: "The '-s' option may not be consistently
supported by various non-Unix operating systems
and compilers. Also, the Unix-specific '-b'
option of 'fgrep' is not supported. Finally,
non-Unix operating systems may not accept
lowercase character strings on the command line,
although these can be entered through files.";
CRC: xxxx
SEE-ALSO: FGREP.DOC;
AUTHORS: Ian Ashdown - byHeart Software;
COMPILERS: ANY;
REFERENCES: AUTHORS: Bell Telephone Laboratories;
TITLE: UNIX Programmer's Manual Vol. 1, p. 166;
AUTHORS: A.V. Aho & M.J. Corasick;
TITLE: 'Efficient String Matching: An Aid to
Bibliographic Search'
Communications of the ACM
pp. 333 - 340, Vol. 18 No. 6 (June '75);
ENDREF
*/
/*-------------------------------------------------------------*/
/* FGREP.C - Search File(s) For Fixed Pattern(s)
*
* Version 1.06 September 13th, 1986
*
* Modifications:
*
* V1.00 (84/12/01) - beta test release
* V1.01 (85/01/01) - added -P option
* - improved command line validation
* V1.02 (85/01/06) - modified "run_fsa()" and "bd_move()"
* V1.03 (85/02/11) - added -S option
* V1.04 (85/09/08) - corrected "bd_go()" for UNIX
* V1.05 (85/09/20) - (Lloyd Rice, Computalker Consultants)
* - added definitions for Lattice C
* - linecount complemented for -CV option
* - buffer added for "stoupper()"
* - allowed for signed char representations
* V1.06 (86/09/13) - corrected -P option bug
*
* Copyright 1985,1986: Ian Ashdown
* byHeart Software
* 620 Ballantree Road
* West Vancouver, B.C.
* Canada V7S 1W3
*
* This program may be copied for personal, non-commercial use
* only, provided that the above copyright notice is included in
* all copies of the source code. Copying for any other use
* without previously obtaining the written permission of the
* author is prohibited.
*
* Notes:
*
* The algorithm used in this program constructs a deterministic
* finite state automaton (FSA) for pattern matching from the sub-
* strings, then uses the FSA to process the text string in one
* pass. The time taken to construct the FSA is proportional to
* the sum of the lengths of the the substrings. The number of
* state transitions made by the FSA in processing the text
* string is independent of the number of substrings.
*
* Algorithm Source:
*
* "Efficient String Matching: An Aid to Bibliographic Search"
* Alfred V. Aho & Margaret J. Corasick
* Communications of the ACM
* pp. 333 - 340, Vol. 18 No. 6 (June '75)
*
* USAGE: fgrep [-vclnhyefxps] [strings] <files>
*
* where:
*
* -v All lines but those matching are printed.
* -c Only a count of the matching lines is printed.
* -l The names of the files with matching lines are
* listed (once), separated by newlines.
* -n Each line is preceded by its line number in the
* file.
* -h Do not print filename headers with output lines.
* -y All characters in the file are mapped to upper
* case before matching. (This is the default if the
* string is given in the command line under CP/M,
* as CP/M maps everything on the command line to
* upper case. Use the -f option if you need both
* lower and upper case.) Not a true UNIX "fgrep"
* option (normally available under "grep" only),
* but too useful to leave out.
* -e <string>. Same as a string argument, but useful
* when the string begins with a '-'.
* -f <file>. The strings (separated by newlines) are
* taken from a file. If several strings are listed
* in the file, then a match is flagged if any of
* the strings are matched. If -f is given, any
* following argument on the command line is taken
* to be a filename.
* -x Only lines matched in their entirety are printed.
* -p Each matched line is preceded by the matching
* substring(s). Not a UNIX "fgrep" option, but too
* useful to leave out.
* -s No output is produced, only status. Used when
* when "fgrep" is run as a process that returns a
* status value to its parent process. Under CP/M, a
* non-zero value returned by "exit()" may terminate
* a submit file that initiated the program,
* although this is implementation-dependent.
*
* DIAGNOSTICS:
*
* Exit status is 0 if any matches are found, 1 if none, 2 for
* error condition.
*
* BUGS:
*
* The following UNIX-specific option is not supported:
*
* -b Each line is preceded by the block number in
* which it was found
*
* Lines are limited to 256 characters.
*
*/
/*** Definitions ***/
#define TRUE -1
#define FALSE 0
#define MAX_LINE 257 /* Maximum number of characters */
/* per line plus NULL delimiter */
#define CP/M /* Comment out for compilation */
/* under UNIX or MS-DOS */
/* The following definition applies to the Lattice C compiler
* (Version 2.15) for MS-DOS. The Lattice library does not have
* the Unix function "index()". However, it does have an
* equivalent function called "stpchr()".
*/
/* #define index stpchr */ /* Remove comment delimiters for */
/* Lattice C version 2.15 */
#define CMD_ERR 0 /* Error codes */
#define OPT_ERR 1
#define INP_ERR 2
#define STR_ERR 3
#define MEM_ERR 4
/*** Typedefs ***/
typedef int BOOL; /* Boolean flag */
/* Some C compilers, including Lattice C, do not support the
* "void" data type. Remove the following comment delimiters for
* such compilers.
*/
/* typedef int void; */
/*** Data Structures ***/
/* Queue element */
typedef struct queue
{
struct state_el *st_ptr;
struct queue *next_el;
} QUEUE;
/* Transition element */
typedef struct transition
{
char lchar; /* Transition character */
struct state_el *nextst_ptr; /* Transition state pointer */
struct transition *next_el;
} TRANSITION;
/* FSA state element */
typedef struct state_el
{
TRANSITION *go_ls; /* Pointer to head of "go" list */
TRANSITION *mv_ls; /* Pointer to head of "move" list */
struct state_el *fail_state; /* "failure" transition state */
char *out_str; /* Terminal state message (if any) */
} FSA;
/*** Global Variables and Structures ***/
/* Dummy "failure" state */
FSA FAIL_STATE;
/* Define a separate data structure for State 0 of the FSA to
* speed processing of the input while the FSA is in that state.
* Since the Aho-Corasick algorithm only defines "go" transitions
* for this state (one for each valid input character) and no
* "failure" transitions or output messages, only an array of
* "go" transition state numbers is needed. The array is accessed
* directly, using the input character as the index.
*/
FSA *MZ[128]; /* State 0 of FSA */
BOOL vflag = FALSE, /* Command-line option flags */
cflag = FALSE,
lflag = FALSE,
nflag = FALSE,
hflag = FALSE,
yflag = FALSE,
eflag = FALSE,
fflag = FALSE,
xflag = FALSE,
pflag = FALSE,
sflag = FALSE;
/*** Include Files ***/
#include <stdio.h>
#include <ctype.h>
/*** Main Body of Program ***/
int main(argc,argv)
int argc;
char **argv;
{
char *temp;
BOOL match_flag = FALSE,
proc_file();
void bd_go(),
bd_move(),
error();
/* Check for minimum number of command line arguments */
if(argc < 2)
error(CMD_ERR,N