home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Acorn User 11
/
AUCD11B.iso
/
LANGUAGES
/
WraithSet
/
AwkStuff
/
MawkSrc
/
c
/
scan
< prev
next >
Wrap
Text File
|
1996-07-28
|
21KB
|
1,081 lines
/********************************************
scan.c
copyright 1991, Michael D. Brennan
This is a source file for mawk, an implementation of
the AWK programming language.
Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/
/* $Log: scan.c,v $
* Revision 1.8 1996/07/28 21:47:05 mike
* gnuish patch
*
* Revision 1.7 1995/06/18 19:42:24 mike
* Remove some redundant declarations and add some prototypes
*
* Revision 1.6 1995/06/10 16:57:52 mike
* silently exit(0) if no program
* always add a '\n' on eof in scan_fillbuff()
*
* Revision 1.5 1995/06/06 00:18:33 mike
* change mawk_exit(1) to mawk_exit(2)
*
* Revision 1.4 1994/09/23 00:20:04 mike
* minor bug fix: handle \ in eat_nl()
*
* Revision 1.3 1993/07/17 00:45:21 mike
* indent
*
* Revision 1.2 1993/07/04 12:52:09 mike
* start on autoconfig changes
*
* Revision 1.1.1.1 1993/07/03 18:58:20 mike
* move source to cvs
*
* Revision 5.6 1993/02/13 21:57:33 mike
* merge patch3
*
* Revision 5.5 1993/01/01 21:30:48 mike
* split new_STRING() into new_STRING and new_STRING0
*
* Revision 5.4.1.1 1993/01/15 03:33:50 mike
* patch3: safer double to int conversion
*
* Revision 5.4 1992/11/29 18:57:50 mike
* field expressions convert to long so 16 bit and 32 bit
* systems behave the same
*
* Revision 5.3 1992/07/08 15:43:41 brennan
* patch2: length returns. I am a wimp
*
* Revision 5.2 1992/02/21 14:16:53 brennan
* fix: getline <=
*
* Revision 5.1 91/12/05 07:56:27 brennan
* 1.1 pre-release
*
*/
#include "mawk.h"
#include "scan.h"
#include "memory.h"
#include "field.h"
#include "init.h"
#include "fin.h"
#include "repl.h"
#include "code.h"
#ifndef NO_FCNTL_H
#include <fcntl.h>
#endif
#include "files.h"
/* static functions */
/* Forward declarations of the file-local helpers defined further down.
The argument lists are wrapped in the PROTO() macro, which is not
defined in this file. */
static void PROTO(scan_fillbuff, (void)) ;
static void PROTO(scan_open, (void)) ;
static int PROTO(slow_next, (void)) ;
static void PROTO(eat_comment, (void)) ;
static void PROTO(eat_semi_colon, (void)) ;
static double PROTO(collect_decimal, (int, int *)) ;
static int PROTO(collect_string, (void)) ;
static int PROTO(collect_RE, (void)) ;
/*-----------------------------
program file management
*----------------------------*/
char *pfile_name ; /* program file to open; "-" selects descriptor 0 (see scan_open) */
STRING *program_string ; /* copy of a command line program, built in scan_init() */
PFILE *pfile_list ; /* further program files; slow_next() pops the head at end of file */
static unsigned char *buffer ; /* read buffer of BUFFSZ + 1 bytes, used for file input only */
static unsigned char *buffp ; /* next character to be scanned */
/* unsigned so it works with 8 bit chars */
static int program_fd ; /* -1 for a command line program, otherwise the input descriptor */
static int eof_flag ; /* nonzero once the current source has been read to its end */
/* Prepare the scanner's input and position it on the first token.

   cmdline_program : program text taken from the command line, or a
                     null pointer when the program is to be read from
                     the file named by pfile_name.

   A command line program is copied into program_string with a '\n'
   appended, so it ends the same way a file does.  If the program
   holds no token at all, the process exits through mawk_exit(0). */
void
scan_init(cmdline_program)
   char *cmdline_program ;
{
   if (!cmdline_program)
   {
      /* program from file[s] */
      scan_open() ;
      buffer = (unsigned char *) zmalloc(BUFFSZ + 1) ;
      buffp = buffer ;
      scan_fillbuff() ;
   }
   else
   {
      program_fd = -1 ;  /* command line program */
      program_string = new_STRING0(strlen(cmdline_program) + 1) ;
      strcpy(program_string->str, cmdline_program) ;
      /* simulate file termination: the copied '\0' becomes '\n' */
      program_string->str[program_string->len - 1] = '\n' ;
      buffp = (unsigned char *) program_string->str ;
      eof_flag = 1 ;  /* nothing further to read */
   }

#ifdef OS2  /* OS/2 "extproc" is similar to #! */
   if (strnicmp(buffp, "extproc ", 8) == 0)
      eat_comment();
#endif

   eat_nl() ;  /* scan to first token */

   /* peek at the first character: zero means there is no program */
   if (next() == 0)
      mawk_exit(0) ;

   un_next() ;
}
/* Open the program file named by pfile_name and store its descriptor
   in program_fd.  The name "-" selects descriptor 0 without calling
   open().  On failure an error is reported and mawk_exit(2) is
   called. */
static void
scan_open()
{
   int use_fd_zero = pfile_name[0] == '-' && pfile_name[1] == 0 ;

   if (use_fd_zero)
   {
      program_fd = 0 ;
      return ;
   }

   program_fd = open(pfile_name, O_RDONLY, 0) ;
   if (program_fd == -1)
   {
      errmsg(errno, "cannot open %s", pfile_name) ;
      mawk_exit(2) ;
   }
}
/* Release the scanner's input resources and re-classify the
   whitespace characters in scan_code[].

   File input frees the read buffer and closes the descriptor unless
   it is descriptor 0; command line input frees program_string. */
void
scan_cleanup()
{
   if (program_fd < 0)
   {
      free_STRING(program_string) ;
   }
   else
   {
      zfree(buffer, BUFFSZ + 1) ;
      if (program_fd > 0)  close(program_fd) ;
   }

   /* redefine SPACE as [ \t\n] */

   if (posix_space_flag && rs_shadow.type != SEP_MLR)
      scan_code['\n'] = SC_UNEXPECTED ;
   else
      scan_code['\n'] = SC_SPACE ;

   scan_code['\f'] = SC_UNEXPECTED ;  /* value doesn't matter */
   scan_code['\013'] = SC_UNEXPECTED ;  /* \v not space */
   scan_code['\r'] = SC_UNEXPECTED ;
}
/*--------------------------------
global variables shared by yyparse() and yylex()
and used for error messages too
*-------------------------------*/
int current_token = -1 ; /* token most recently produced by yylex(); -1 until then */
unsigned token_lineno ; /* copied from lineno each time yylex() starts a token */
unsigned compile_error_count ; /* eat_nl() exits once this reaches MAX_COMPILE_ERRORS */
int NR_flag ; /* are we tracking NR */
int paren_cnt ; /* '(' seen minus ')' seen; yylex() rejects an extra ')' */
int brace_cnt ; /* '{' seen minus '}' seen; yylex() rejects an extra '}' */
int print_flag ; /* changes meaning of '>' */
int getline_flag ; /* changes meaning of '<' */
/*----------------------------------------
file reading functions
next() and un_next(c) are macros in scan.h
*---------------------*/
static unsigned lineno = 1 ; /* line currently being scanned; reset to 1 for each new program file */
/* Refill the read buffer from program_fd.  A read shorter than
   BUFFSZ is taken as end of file: eof_flag is set and the data is
   terminated with '\n' followed by a zero byte. */
static void
scan_fillbuff()
{
   unsigned nread = fillbuff(program_fd, (char *) buffer, BUFFSZ) ;

   if (nread >= BUFFSZ)  return ;  /* full buffer, more may follow */

   eof_flag = 1 ;
   /* make sure eof is terminated */
   buffer[nread] = '\n' ;
   buffer[nread + 1] = 0 ;
}
/* read one character -- slowly */
static int
slow_next()
{
while (*buffp == 0)
{
if (!eof_flag)
{
buffp = buffer ;
scan_fillbuff() ;
}
else if (pfile_list /* open another program file */ )
{
PFILE *q ;
if (program_fd > 0) close(program_fd) ;
eof_flag = 0 ;
pfile_name = pfile_list->fname ;
q = pfile_list ;
pfile_list = pfile_list->link ;
ZFREE(q) ;
scan_open() ;
token_lineno = lineno = 1 ;
}
else break /* real eof */ ;
}
return *buffp++ ; /* note can un_next() , eof which is zero */
}
/* Skip the rest of a comment.  Reading stops at a newline or at a
   character whose scan code is zero; that character is pushed back
   so the caller sees it next. */
static void
eat_comment()
{
   register int ch ;

   do
   {
      ch = next() ;
   }
   while (ch != '\n' && scan_code[ch] != 0) ;

   un_next() ;
}
/* Extra semi-colons are now allowed to separate pattern-action
   blocks.  They add nothing to the language, so the scanner simply
   discards them.

   eat_semi_colon() skips SC_SPACE characters and then consumes at
   most one ';'.  Any other character is pushed back. */
static void
eat_semi_colon()
{
   register int ch ;

   do
   {
      ch = next() ;
   }
   while (scan_code[ch] == SC_SPACE) ;

   if (ch == ';')  return ;  /* swallowed */

   un_next() ;
}
/* eat all space including newlines

   Comments are skipped as well, and lineno is advanced for every
   newline consumed.  A backslash is accepted only as a line
   continuation (optionally followed by SC_SPACE characters before the
   newline); anything else after it is reported through
   unexpected_char().  The first character that is not skipped is
   pushed back for the caller. */
void
eat_nl()
{
   for (;;)
   {
      unsigned ch ;

      switch (scan_code[next()])
      {
         case SC_COMMENT:
            eat_comment() ;
            continue ;

         case SC_NL:
            lineno++ ;
            continue ;

         case SC_SPACE:
            continue ;

         case SC_ESCAPE:
            break ;  /* dealt with after the switch */

         default:
            un_next() ;
            return ;
      }

      /* A backslash was read.  (Bug fix from the original author:
         some users really do put blanks between the backslash and
         the newline.) */
      do
      {
         ch = next() ;
      }
      while (scan_code[ch] == SC_SPACE) ;

      if (ch == '\n')
      {
         token_lineno = ++lineno ;
         continue ;
      }

      un_next() ;
      if (ch == 0)  return ;  /* end of input after the backslash */

      /* error: can't un_next() twice so deal with it here */
      yylval.ival = '\\' ;
      unexpected_char() ;
      if (++compile_error_count == MAX_COMPILE_ERRORS)
         mawk_exit(2) ;
      return ;
   }
}
/* The lexical scanner.

   Reads characters with next(), classifies the first one through
   scan_code[] and returns the code of the token found.  The code is
   also left in current_token, which later calls consult to resolve
   context dependent characters ('/', '}' and identifiers after '$').
   Semantic values are passed through yylval.  For several operator
   tokens the token text is written to string_buff as a zero
   terminated string (used when building error messages).

   token_lineno is set from lineno on entry; lineno is advanced as
   newlines are consumed.

   ct_ret() and test1_ret() are macros that are not defined in this
   file; from their use here, ct_ret(x) returns token x and
   test1_ret(ch, a, b) returns a if the next character is ch and b
   otherwise -- confirm against scan.h. */
int
yylex()
{
   register int c ;

   token_lineno = lineno ;

reswitch:

   switch (scan_code[c = next()])
   {
      case 0:
         ct_ret(EOF) ;

      case SC_SPACE:
         goto reswitch ;

      case SC_COMMENT:
         eat_comment() ;
         goto reswitch ;

      case SC_NL:
         lineno++ ;
         eat_nl() ;  /* a run of blank lines yields a single NL */
         ct_ret(NL) ;

      case SC_ESCAPE:
         /* a backslash is only legal as a line continuation,
            possibly with blanks before the newline */
         while (scan_code[c = next()] == SC_SPACE) ;
         if (c == '\n')
         {
            token_lineno = ++lineno ;
            goto reswitch ;
         }
         if (c == 0)  ct_ret(EOF) ;
         un_next() ;
         yylval.ival = '\\' ;
         ct_ret(UNEXPECTED) ;

      case SC_SEMI_COLON:
         eat_nl() ;
         ct_ret(SEMI_COLON) ;

      case SC_LBRACE:
         eat_nl() ;
         brace_cnt++ ;
         ct_ret(LBRACE) ;

      case SC_PLUS:
         switch (next())
         {
            case '+':
               yylval.ival = '+' ;
               string_buff[0] =
                  string_buff[1] = '+' ;
               string_buff[2] = 0 ;
               ct_ret(INC_or_DEC) ;

            case '=':
               ct_ret(ADD_ASG) ;

            default:
               un_next() ;
               ct_ret(PLUS) ;
         }

      case SC_MINUS:
         switch (next())
         {
            case '-':
               yylval.ival = '-' ;
               string_buff[0] =
                  string_buff[1] = '-' ;
               string_buff[2] = 0 ;
               ct_ret(INC_or_DEC) ;

            case '=':
               ct_ret(SUB_ASG) ;

            default:
               un_next() ;
               ct_ret(MINUS) ;
         }

      case SC_COMMA:
         eat_nl() ;
         ct_ret(COMMA) ;

      case SC_MUL:
         test1_ret('=', MUL_ASG, MUL) ;

      case SC_DIV:
         {
            /* '/' is a division operator only when the previous
               token could end an operand; otherwise it starts a
               regular expression */
            static int can_precede_div[] =
            {DOUBLE, STRING_, RPAREN, ID, D_ID, RE, RBOX, FIELD,
             GETLINE, INC_or_DEC, -1} ;

            int *p = can_precede_div ;

            do
            {
               if (*p == current_token)
               {
                  if (*p != INC_or_DEC)  test1_ret('=', DIV_ASG, DIV) ;

                  /* previous token was ++ or -- : "/=" is taken as
                     the start of a regular expression.
                     NOTE(review): when the character read here is
                     not '=', it is not pushed back before
                     collect_RE() runs below -- confirm that this is
                     intended. */
                  if (next() == '=')
                  {
                     un_next() ;
                     ct_ret(collect_RE()) ;
                  }
               }
            }
            while (*++p != -1) ;

            ct_ret(collect_RE()) ;
         }

      case SC_MOD:
         test1_ret('=', MOD_ASG, MOD) ;

      case SC_POW:
         test1_ret('=', POW_ASG, POW) ;

      case SC_LPAREN:
         paren_cnt++ ;
         ct_ret(LPAREN) ;

      case SC_RPAREN:
         if (--paren_cnt < 0)
         {
            /* report it, drop the character and keep scanning */
            compile_error("extra ')'") ;
            paren_cnt = 0 ;
            goto reswitch ;
         }

         ct_ret(RPAREN) ;

      case SC_LBOX:
         ct_ret(LBOX) ;

      case SC_RBOX:
         ct_ret(RBOX) ;

      case SC_MATCH:
         /* token text is "~"; the terminator goes in slot 1 so that
            it does not overwrite the character just stored */
         string_buff[0] = '~' ;
         string_buff[1] = 0 ;
         yylval.ival = 1 ;
         ct_ret(MATCH) ;

      case SC_EQUAL:
         test1_ret('=', EQ, ASSIGN) ;

      case SC_NOT:  /* ! */
         if ((c = next()) == '~')
         {
            string_buff[0] = '!' ;
            string_buff[1] = '~' ;
            string_buff[2] = 0 ;
            yylval.ival = 0 ;
            ct_ret(MATCH) ;
         }
         else if (c == '=')  ct_ret(NEQ) ;

         un_next() ;
         ct_ret(NOT) ;

      case SC_LT:  /* '<' */
         if (next() == '=')  ct_ret(LTE) ;
         else  un_next() ;

         if (getline_flag)
         {
            /* '<' is input redirection here; the flag is one-shot */
            getline_flag = 0 ;
            ct_ret(IO_IN) ;
         }
         else  ct_ret(LT) ;

      case SC_GT:  /* '>' */
         if (print_flag && paren_cnt == 0)
         {
            print_flag = 0 ;
            /* there are 3 types of IO_OUT
               -- build the error string in string_buff */
            string_buff[0] = '>' ;
            if (next() == '>')
            {
               yylval.ival = F_APPEND ;
               string_buff[1] = '>' ;
               string_buff[2] = 0 ;
            }
            else
            {
               un_next() ;
               yylval.ival = F_TRUNC ;
               string_buff[1] = 0 ;
            }
            return current_token = IO_OUT ;
         }

         test1_ret('=', GTE, GT) ;

      case SC_OR:
         if (next() == '|')
         {
            eat_nl() ;
            ct_ret(OR) ;
         }
         else
         {
            un_next() ;

            if (print_flag && paren_cnt == 0)
            {
               /* output to a pipe, the third type of IO_OUT */
               print_flag = 0 ;
               yylval.ival = PIPE_OUT ;
               string_buff[0] = '|' ;
               string_buff[1] = 0 ;
               ct_ret(IO_OUT) ;
            }
            else  ct_ret(PIPE) ;
         }

      case SC_AND:
         if (next() == '&')
         {
            eat_nl() ;
            ct_ret(AND) ;
         }
         else
         {
            /* a single '&' is not a token */
            un_next() ;
            yylval.ival = '&' ;
            ct_ret(UNEXPECTED) ;
         }

      case SC_QMARK:
         ct_ret(QMARK) ;

      case SC_COLON:
         ct_ret(COLON) ;

      case SC_RBRACE:
         if (--brace_cnt < 0)
         {
            compile_error("extra '}'") ;
            eat_semi_colon() ;
            brace_cnt = 0 ;
            goto reswitch ;
         }

         if ((c = current_token) == NL || c == SEMI_COLON
             || c == SC_FAKE_SEMI_COLON || c == RBRACE)
         {
            /* if the brace_cnt is zero , we've completed
               a pattern action block. If the user insists
               on adding a semi-colon on the same line
               we will eat it.  Note what we do below:
               physical law -- conservation of semi-colons */

            if (brace_cnt == 0)  eat_semi_colon() ;
            eat_nl() ;
            ct_ret(RBRACE) ;
         }

         /* supply missing semi-colon to statement that
            precedes a '}' : undo the decrement, push the '}' back
            so it is scanned again, and remember that the
            semi-colon was a fake one */
         brace_cnt++ ;
         un_next() ;
         current_token = SC_FAKE_SEMI_COLON ;
         return SEMI_COLON ;

      case SC_DIGIT:
      case SC_DOT:
         {
            double d;
            int flag ;
            /* shared storage for the two most common constants,
               so they need no allocation */
            static double double_zero = 0.0 ;
            static double double_one = 1.0 ;

            if ((d = collect_decimal(c, &flag)) == 0.0)
            {
               if (flag)  ct_ret(flag) ;  /* an error token */
               else  yylval.ptr = (PTR) & double_zero ;
            }
            else if (d == 1.0)
            {
               yylval.ptr = (PTR) & double_one ;
            }
            else
            {
               yylval.ptr = (PTR) ZMALLOC(double) ;
               *(double *) yylval.ptr = d ;
            }
            ct_ret(DOUBLE) ;
         }

      case SC_DOLLAR:  /* '$' */
         {
            double d;
            int flag ;

            while (scan_code[c = next()] == SC_SPACE) ;
            if (scan_code[c] != SC_DIGIT &&
                scan_code[c] != SC_DOT)
            {
               /* not a constant field number */
               un_next() ;
               ct_ret(DOLLAR) ;
            }

            /* compute field address at compile time */
            if ((d = collect_decimal(c, &flag)) == 0.0)
            {
               if (flag)  ct_ret(flag) ;  /* an error */
               else  yylval.cp = &field[0] ;
            }
            else
            {
               if (d > MAX_FIELD)
               {
                  compile_error(
                     "$%g exceeds maximum field(%d)", d, MAX_FIELD) ;
                  d = MAX_FIELD ;
               }
               yylval.cp = field_ptr((int) d) ;
            }

            ct_ret(FIELD) ;
         }

      case SC_DQUOTE:
         return current_token = collect_string() ;

      case SC_IDCHAR:  /* collect an identifier */
         {
            unsigned char *p =
               (unsigned char *) string_buff + 1 ;
            SYMTAB *stp ;

            string_buff[0] = c ;

            /* from here on c holds a scan code, not a character */
            while (
                  (c = scan_code[*p++ = next()]) == SC_IDCHAR ||
                  c == SC_DIGIT) ;

            un_next() ;
            *--p = 0 ;

            switch ((stp = find(string_buff))->type)
            {
               case ST_NONE:
                  /* check for function call before defined */
                  if (next() == '(')
                  {
                     stp->type = ST_FUNCT ;
                     stp->stval.fbp = (FBLOCK *)
                        zmalloc(sizeof(FBLOCK)) ;
                     stp->stval.fbp->name = stp->name ;
                     stp->stval.fbp->code = (INST *) 0 ;
                     yylval.fbp = stp->stval.fbp ;
                     current_token = FUNCT_ID ;
                  }
                  else
                  {
                     yylval.stp = stp ;
                     current_token =
                        current_token == DOLLAR ? D_ID : ID ;
                  }
                  un_next() ;
                  break ;

               case ST_NR:
                  NR_flag = 1 ;
                  stp->type = ST_VAR ;
                  /* fall thru */

               case ST_VAR:
               case ST_ARRAY:
               case ST_LOCAL_NONE:
               case ST_LOCAL_VAR:
               case ST_LOCAL_ARRAY:

                  yylval.stp = stp ;
                  current_token =
                     current_token == DOLLAR ? D_ID : ID ;
                  break ;

               case ST_ENV:
                  /* first reference: turn the symbol into an array
                     and fill it with load_environ() */
                  stp->type = ST_ARRAY ;
                  stp->stval.array = new_ARRAY() ;
                  load_environ(stp->stval.array) ;
                  yylval.stp = stp ;
                  current_token =
                     current_token == DOLLAR ? D_ID : ID ;
                  break ;

               case ST_FUNCT:
                  yylval.fbp = stp->stval.fbp ;
                  current_token = FUNCT_ID ;
                  break ;

               case ST_KEYWORD:
                  current_token = stp->stval.kw ;
                  break ;

               case ST_BUILTIN:
                  yylval.bip = stp->stval.bip ;
                  current_token = BUILTIN ;
                  break ;

               case ST_LENGTH:

                  yylval.bip = stp->stval.bip ;

                  /* check for length alone, this is an ugly
                     hack */
                  while (scan_code[c = next()] == SC_SPACE) ;
                  un_next() ;

                  current_token = c == '(' ? BUILTIN : LENGTH ;
                  break ;

               case ST_FIELD:
                  yylval.cp = stp->stval.cp ;
                  current_token = FIELD ;
                  break ;

               default:
                  bozo("find returned bad st type") ;
            }
            return current_token ;
         }

      case SC_UNEXPECTED:
         yylval.ival = c & 0xff ;
         ct_ret(UNEXPECTED) ;
   }
   return 0 ;  /* never get here make lint happy */
}
/* collect a decimal constant in string_buff.
Return the value and error conditions by reference

c    : first character of the constant, already read by the caller;
       yylex() only passes a digit or '.'
flag : receives 0 on success.  It receives UNEXPECTED when a leading
       '.' is not followed by a digit (yylval.ival is then '.'), and
       BAD_DECIMAL when the exponent has no digit or sign after
       'e'/'E' or when strtod() stops before the end of the
       collected text.

Returns the converted value; 0.0 is returned whenever *flag is
nonzero.  Overflow and underflow reported by strtod() through errno
are passed to compile_error() but do not set *flag.

NOTE(review): nothing in this function limits how many characters
are written to string_buff -- confirm that its size is guaranteed
elsewhere. */
static double
collect_decimal(c, flag)
int c ;
int *flag ;
{
/* p always points one past the last character stored */
register unsigned char *p = (unsigned char *) string_buff + 1 ;
unsigned char *endp ;
double d;
*flag = 0 ;
string_buff[0] = c ;
if (c == '.')
{
/* a leading '.' must be followed by at least one digit */
if (scan_code[*p++ = next()] != SC_DIGIT)
{
*flag = UNEXPECTED ;
yylval.ival = '.' ;
return 0.0 ;
}
}
else
{
/* digits of the integer part; the character that ends the run is
stored as well */
while (scan_code[*p++ = next()] == SC_DIGIT) ;
if (p[-1] != '.')
{
/* no fraction: give the terminator back, the loop below reads
it again */
un_next() ;
p-- ;
}
}
/* get rest of digits after decimal point */
while (scan_code[*p++ = next()] == SC_DIGIT) ;
/* check for exponent */
if (p[-1] != 'e' && p[-1] != 'E')
{
/* no exponent: push the terminator back and end the string */
un_next() ;
*--p = 0 ;
}
else /* get the exponent */
{
if (scan_code[*p = next()] != SC_DIGIT &&
*p != '-' && *p != '+')
{
/* 'e' followed by something that cannot start an exponent */
*++p = 0 ;
*flag = BAD_DECIMAL ;
return 0.0 ;
}
else /* get the rest of the exponent */
{
p++ ;
while (scan_code[*p++ = next()] == SC_DIGIT) ;
un_next() ;
*--p = 0 ;
}
}
errno = 0 ; /* check for overflow/underflow */
d = strtod(string_buff, (char **) &endp) ;
#ifndef STRTOD_UNDERFLOW_ON_ZERO_BUG
if (errno) compile_error("%s : decimal %sflow", string_buff,
d == 0.0 ? "under" : "over") ;
#else /* ! sun4 bug */
if (errno && d != 0.0)
compile_error("%s : decimal overflow", string_buff) ;
#endif
/* p is at the terminating zero; strtod() must have used everything
before it (an exponent of just "e+" fails here) */
if (endp < p)
{
*flag = BAD_DECIMAL ;
return 0.0 ;
}
return d ;
}
/*---------- process escape characters ---------------*/
/* hex_val[ch - 'A'] is the value of the hex digit ch.  The first six
slots serve 'A'..'F' and the last six serve 'a'..'f'; the 26 slots in
between are 0, which ishex() and hex() read as "not a hex digit".
The layout relies on 'a' - 'A' being 32. */
static char hex_val['f' - 'A' + 1] =
{
10, 11, 12, 13, 14, 15, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
10, 11, 12, 13, 14, 15} ;
/* true for the characters '0' through '7' */
#define isoctal(x) ((x)>='0'&&(x)<='7')
/* plain table lookup: x must lie in 'A'..'f' */
#define hex_value(x) hex_val[(x)-'A']
/* true when x is classified SC_DIGIT or is a letter with a nonzero
hex_val entry; x is evaluated more than once and indexes scan_code[],
so callers pass an unsigned char value */
#define ishex(x) (scan_code[x] == SC_DIGIT ||\
'A' <= (x) && (x) <= 'f' && hex_value(x))
static int PROTO(octal, (char **)) ;
static int PROTO(hex, (char **)) ;
/* Convert one, two or three octal digits to a number.

   start_p : address of a pointer to the first digit, which the
             caller has already checked to be octal.  On return the
             pointer has been moved past the digits that were used.

   Returns the value reduced to its low eight bits. */
static int
octal(start_p)
   char **start_p ;
{
   register char *cur = *start_p ;
   register unsigned val ;
   int extra ;  /* further digits that may still be taken */

   val = *cur++ - '0' ;

   for (extra = 2 ; extra > 0 && isoctal(*cur) ; extra--)
      val = (val << 3) + *cur++ - '0' ;

   *start_p = cur ;
   return val & 0xff ;
}
/* Convert one or two hex digits to a number.

   start_p : address of a pointer to the first digit.  The only
             caller in this file, rm_escape(), checks it with
             ishex() beforehand.  On return the pointer has been
             moved past the digits that were used.

   Returns the value of the digits. */
static int
hex(start_p)
   char **start_p ;
{
   register unsigned char *cur = (unsigned char *) *start_p ;
   register unsigned val ;
   unsigned low ;
   unsigned char ch ;

   /* first digit, always consumed */
   ch = *cur++ ;
   if (scan_code[ch] == SC_DIGIT)
      val = ch - '0' ;
   else
      val = hex_value(ch) ;

   /* second digit, consumed only if it is a hex digit */
   ch = *cur ;
   if (scan_code[ch] == SC_DIGIT)
   {
      val = (val << 4) + ch - '0' ;
      cur++ ;
   }
   else if ('A' <= ch && ch <= 'f')
   {
      low = hex_value(ch) ;
      if (low)
      {
         val = (val << 4) + low ;
         cur++ ;
      }
   }

   *start_p = (char *) cur ;
   return val ;
}
/* index of the extra, last slot of escape_test[] */
#define ET_END 9
/* Single character escapes: `in' is the character that follows the
backslash, `out' is the character it stands for.  Slot ET_END is not
a real entry; rm_escape() stores the character it is looking for
there as a sentinel, so its search always terminates. */
static struct
{
char in, out ;
}
escape_test[ET_END + 1] =
{
'n', '\n',
't', '\t',
'f', '\f',
'b', '\b',
'r', '\r',
'a', '\07',
'v', '\013',
'\\', '\\',
'\"', '\"',
0, 0
} ;
/* Process the escape sequences in a string, in place.

   s : zero terminated string; it is overwritten with the result,
       which is never longer than the input.

   Recognised after a backslash: the characters listed in
   escape_test[], one to three octal digits, and 'x' followed by one
   or two hex digits.  Any other sequence is copied unchanged,
   backslash included.

   Returns s. */
char *
rm_escape(s)
   char *s ;
{
   register char *src = s ;  /* next character to read */
   register char *dst = s ;  /* next position to write */
   char *after ;
   int idx ;

   while (*src)
   {
      if (*src != '\\')
      {
         *dst++ = *src++ ;
         continue ;
      }

      src++ ;  /* now at the character after the backslash */

      /* linear search with a sentinel in the extra slot */
      escape_test[ET_END].in = *src ;
      for (idx = 0 ; escape_test[idx].in != *src ; idx++) ;

      if (idx != ET_END)  /* in table */
      {
         src++ ;
         *dst++ = escape_test[idx].out ;
      }
      else if (isoctal(*src))
      {
         after = src ;
         *dst++ = octal(&after) ;
         src = after ;
      }
      else if (*src == 'x' && ishex(*(unsigned char *) (src + 1)))
      {
         after = src + 1 ;
         *dst++ = hex(&after) ;
         src = after ;
      }
      else if (*src == 0)  /* can only happen with command line assign */
      {
         *dst++ = '\\' ;
      }
      else  /* not an escape sequence */
      {
         *dst++ = '\\' ;
         *dst++ = *src++ ;
      }
   }

   *dst = 0 ;
   return s ;
}
/* Collect a string constant into string_buff.  yylex() calls this
   after it has read the opening double quote.

   Characters are copied up to the closing quote.  A backslash
   followed by a newline is dropped (the string continues on the next
   line and lineno is advanced); any other backslash pair is stored
   as is and translated by rm_escape() at the end.  A bare newline or
   end of input inside the string is a fatal error: it is reported
   and mawk_exit(2) is called.

   Returns STRING_ with yylval.ptr set to a new STRING holding the
   text.

   NOTE(review): nothing here limits how many characters are written
   to string_buff -- confirm that its size is guaranteed elsewhere. */
static int
collect_string()
{
   register unsigned char *p = (unsigned char *) string_buff ;
   int c ;
   int e_flag = 0 ;  /* on if have an escape char */

   while (1)
      switch (scan_code[*p++ = next()])
      {
         case SC_DQUOTE:  /* done */
            *--p = 0 ;
            goto out ;

         case SC_NL:
            p[-1] = 0 ;  /* keep the newline out of the message */
            /* fall thru */

         case 0:  /* unterminated string */
            /* the format has a single conversion, so only the text
               is passed */
            compile_error(
               "runaway string constant \"%.10s ...",
               string_buff) ;
            mawk_exit(2) ;

         case SC_ESCAPE:
            if ((c = next()) == '\n')
            {
               /* line continuation: discard the stored backslash */
               p-- ;
               lineno++ ;
            }
            else if (c == 0)  un_next() ;  /* let the loop see the eof */
            else
            {
               *p++ = c ;
               e_flag = 1 ;
            }

            break ;

         default:
            break ;
      }

out:
   yylval.ptr = (PTR) new_STRING(
                        e_flag ? rm_escape(string_buff)
                               : string_buff) ;
   return STRING_ ;
}
static int
collect_RE()
{
register unsigned char *p = (unsigned char *) string_buff ;
int c ;
STRING *sval ;
while (1)
switch (scan_code[*p++ = next()])
{
case SC_DIV: /* done */
*--p = 0 ;
goto out ;
case SC_NL:
p[-1] = 0 ;
/* fall thru */
case 0: /* unterminated re */
compile_error(
"runaway regular expression /%.10s ...",
string_buff, token_lineno) ;
mawk_exit(2) ;
case SC_ESCAPE:
switch (c = next())
{
case '/':
p[-1] = '/' ;
break ;
case '\n':
p-- ;
break ;
case 0:
un_next() ;
break ;
default:
*p++ = c ;
break ;
}
break ;
}
out:
/* now we've got the RE, so compile it */
sval = new_STRING(string_buff) ;
yylval.ptr = re_compile(sval) ;
free_STRING(sval) ;
return RE ;
}