home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Simtel MSDOS 1992 September
/
Simtel20_Sept92.cdr
/
msdos
/
ddjmag
/
ddj8905.arc
/
TAWK.ASC
< prev
next >
Wrap
Text File
|
1989-05-12
|
22KB
|
712 lines
_TAWK, A Simple Interpreter in C++_
by Bruce Eckel
[LISTING ONE]
// FIELD.HXX: used by csascii class to build a single field.
// Fields are collected by csascii to create a record.
// by Bruce Eckel,
#include <stream.hxx>
// One field of a comma-separated ASCII record.
// Constructing a field reads one field's worth of characters from the
// given stream; the text is then available through string() or operator<<.
class field {
    istream * input;     // where to get the data
    char * data;         // the field text; 0 until getfield() allocates it
    int length, fsize;   // length: working index used while scanning;
                         // fsize: character count recorded by getfield()
    int end_of_file;     // flag to indicate the end of file happened
    void getfield();     // recursive function to read in a field;
                         // treats data, length & input as globals
    int infield;         // flag used by getfield() to determine whether
                         // it's inside a quoted field
    // The destructor frees "data", so a memberwise copy would free the
    // same buffer twice. Copying is disabled: these two are declared
    // private and intentionally never defined.
    field(const field &);
    field & operator=(const field &);
public:
    field(istream & instream);   // read one field from instream
    ~field();                    // free the field text
    // Write the field text to an output stream.
    friend ostream& operator<<(ostream &s, field & f) {
        s << f.data;
        return s;
    }
    int eof() { return end_of_file; }     // to check for end
    int size() { return fsize; }          // value of fsize set by getfield()
    int last_length() { return length; }  // current value of the working index
    char * string() { return data; }      // the field text itself
};
[LISTING TWO]
// FIELD.CXX: definitions for class field
// A "recursive descent" scanning scheme is used because field
// length is always unknown.
// by Bruce Eckel
#include "field.hxx"
field::field(istream & instream) {
    // Start with no buffer and every counter/flag cleared.
    data = (char *)0;                    // no memory has been allocated
    length = end_of_file = infield = 0;  // nothing read, not at the end,
                                         // not inside a quoted field
    input = &instream;
    // Read the field; getfield() calls itself once per character.
    getfield();
}
// Release the field text.
field::~field() {
    // data comes from "new char[...]" in getfield(), so the array form of
    // delete is required. If no memory has been allocated,
    // data = (char *)0 and this has no effect.
    delete [] data;
}
// A Comma-separated ASCII field is contained in quotes to allow
// commas within the field; these quotes must be stripped out
// Reads one field from "input" into a freshly allocated "data" buffer.
// Each call consumes one character and recurses, so the field length is
// known by the time the terminator is reached; the buffer is allocated
// there and the characters are stored, last to first, as the calls return.
// Quotes and carriage returns are consumed but not stored.
void field::getfield() {
    char c;
    // This happens when DEscending:
    int at_eof = (input->get(c)).eof();
    if (at_eof)
        end_of_file++; // just say we reached the end...
    // The field ends at an unquoted comma, at a newline, or at end of file.
    // End of file must allocate the buffer as well: otherwise the calls
    // still waiting on the stack would store their characters through a
    // null "data" pointer, and fsize would never be set.
    // (c is only examined when a character was actually read.)
    if (at_eof || (c == '\n') || ((c == ',') && !infield)) {
        // This happens once, when the terminator is found:
        fsize = length;               // remember how long the string is
        data = new char[length + 1];  // space for null terminator
        data[length] = '\0';          // highest index is "length"
        length--;                     // notice we don't insert the delimiter
        return;                       // start rising back up
    }
    // watch out for the Unix vs. DOS LF/CR problem here:
    if ((c == '"') || (c == '\r')) {
        if (c == '"')
            infield = !infield; // a quote toggles between being inside
                                // and outside a quoted field
        c = 0; // a quote or CR; mark it so it isn't included
    } else {
        length++; // no quote or CR -- count this character
    }
    getfield(); // recursively get the rest of the field
    // This happens when Ascending:
    if ( c ) // if it wasn't a quote or CR,
        data[length--] = c; // put chars in as we rise back up...
}
[LISTING THREE]
// CSASCII.HXX: class to manipulate comma-separated ASCII
// database files.
//by Bruce Eckel
#include <stream.hxx>
#include "field.hxx"
// Manipulates comma-separated ascii files, generated by most database
// management systems (generated and used by the BASIC programming
// language). Each field is separated by a comma; records are separated
// by newlines. One record is held at a time.
class csascii {
    int fieldcount;       // number of fields per record
    field ** data;        // an array to hold the entire record
    istream * datafile;   // file with comma separated ASCII input
    int readrecord();     // private function to read a record
    // The destructor deletes "data" and "datafile", so a memberwise copy
    // would delete both twice. Copying is disabled: these two are declared
    // private and intentionally never defined.
    csascii(const csascii &);
    csascii & operator=(const csascii &);
public:
    csascii( char * filename );     // Open file, get first record
    ~csascii();                     // destructor
    int next();                     // get next record, return 0 when EOF
    field & operator[](int index);  // select a field
    int number_of_fields() { return fieldcount; }
};
[LISTING FOUR]
// CSASCII.CXX: function definitions for comma-separated
// ascii database manipulation class
// by Bruce Eckel,
#include "csascii.hxx"
int csascii::readrecord() {
for (int fieldnum = 0; fieldnum < fieldcount; fieldnum++ ) {
data[fieldnum] = new field(*datafile);
if (data[fieldnum]->eof()) return 0;
}
return 1;
}
// Open "filename", count the fields on its first line, allocate the
// record array, then re-open the file and read the first record.
csascii::csascii( char * filename ) {
    char c;
    fieldcount = 0;
    int quote = 0;
    // first, determine the number of fields in a record:
    {
        // See text for dangers of opening files this way:
        istream infile(new filebuf->open(filename, input));
        // Stop at the first newline, or at end of file: without the eof()
        // test a file whose first line has no newline would loop forever.
        while (!(infile.get(c)).eof() && c != '\n') {
            // keep track of being inside a quoted string:
            if (c == '"') quote = !quote;
            // fields are delimited by unquoted commas:
            if ( c == ',' && !quote)
                fieldcount++;
        }
    } // infile goes out of scope; file closed
    fieldcount++; // last field terminated by newline, not comma
    // an array of field pointers:
    data = new field * [ fieldcount ];
    // readrecord() stops early at end of file; give every slot a known
    // value so the ones it never reaches do not hold garbage.
    for (int slot = 0; slot < fieldcount; slot++)
        data[slot] = 0;
    // re-open at start; dynamically allocate so it isn't scoped:
    datafile = new istream(new filebuf->open(filename, input));
    readrecord();
}
// Release the record array and the input stream.
// Note: only the array of pointers is freed here, not the field objects
// the pointers refer to.
csascii::~csascii() {
    delete [] data;   // allocated with "new field * [...]", so the array
                      // form of delete is required
    delete datafile;  // calls istream destructor to close file
}
// Discard the current record and read the next one.
// Returns what readrecord() returns: 0 when end of file.
int csascii::next() {
    for (int i = 0; i < fieldcount; i++ ) {
        delete data[i]; // free all the data storage
        // readrecord() stops at the first field that hits end of file and
        // leaves the later slots alone. Clearing each slot keeps those
        // from pointing at deleted fields, so calling next() again after
        // it has returned 0 does not delete them a second time.
        data[i] = 0;
    }
    return readrecord(); // 0 when end of file
}
// Select field number "index" (counting from 0) of the current record.
// An index outside 0 .. fieldcount-1 prints a message and exits.
field & csascii::operator[](int index) {
    if (index < 0) {
        // a negative index would read in front of the record array
        cerr << "negative index for field in record\n";
        exit(1);
    }
    if (index >= fieldcount) {
        cerr << "index too large for number of fields in record\n";
        exit(1);
    }
    return *(data[index]);
}
[LISTING FIVE]
// LOOKUP.CXX: simple use of csascii to find name in a database
// by Bruce Eckel,
#include "csascii.hxx"
#include <string.h>
// lookup: print every record of \ppquick.asc whose first field equals
// the name given on the command line.
main(int argc, char ** argv) {
    if (argc < 2) {
        cerr << "usage: lookup lastname\n";
        exit(1);
    }
    // This puts the database file in the root directory:
    csascii file("\\ppquick.asc"); // create object & open file
    int found = 0; // indicates one record was found
    do {
        int same_name = (strcmp(file[0].string(), argv[1]) == 0);
        if (!same_name) {
            // File is sorted, so once the matches stop there is no point
            // in reading further.
            if (found) exit(0);
            continue; // goes on to the file.next() test below
        }
        found++;
        cout << chr(27) << "[2J"; // ANSI clear screen
        int total = file.number_of_fields();
        for (int n = 0; n < total; n++)
            cout << file[n] << "\n";
        // prompt in reverse video, then wait for a key
        cout << chr(27) << "[7m" << "press any key" <<
            chr(27) << "[0m";
        if (getch() == 27) // key code 27 stops the search
            break;
    } while (file.next());
}
[LISTING SIX]
// PARSE.HXX: class to parse a tawk script file. Creates
// a structure which can be used at run-time to "execute"
// the tawk script.
// by Bruce Eckel,
#include <stream.hxx>
// types of tokens the scanner can find:
enum tokentype {
// fieldnumber: an "@(n)" field reference (also used for the "@end" token)
// string: literal text
// if_, else_, endif_: the "@?n@:", "@~" and "@." conditional markers
// phase_change: an "@m..." or "@c..." line (e.g. @main, @conclusion)
fieldnumber, string, if_, else_, endif_, phase_change
};
// preamble and conclusion of the tawk script are only executed
// once, while main is executed once for every data record
enum phase { preamble, tmain, conclusion};
// One token of a tawk script. Constructing a token scans it from the
// given stream; which union member is valid depends on ttype.
class token {
    tokentype ttype;
    union { // an "anonymous union"
        int fieldnum;            // if type is a fieldnumber (or an if_)
        unsigned char * literal; // if type is a string
    };
    int if_level; // nesting level, if this is an if_, else_, or endif_
    // private functions:
    void get_token(); // recursive descent scanner
    // Functions to help in scanning:
    void getnext(char & c); // used by get_token();
    unsigned char get_value(char delimiter, char * msg);
    void dumpline(); // for @! comments
    void error(char * msg = "", char * msg2 = "");
    // The destructor frees "literal" for string tokens, so a memberwise
    // copy would free the same text twice. Copying is disabled: these two
    // are declared private and intentionally never defined.
    token(const token &);
    token & operator=(const token &);
public:
    token(istream & input);  // scan one token from input
    ~token();
    friend ostream & operator<<(ostream &s, token &t);
    int field_number() { return fieldnum; }
    int token_type() { return ttype; }
    int nesting_level() { return if_level;}
};
// The following is called a "container class," since its sole
// purpose is to hold a list of objects (tokens, in this case):
// Holds the tokens of one section of a tawk script. Constructing a
// parse_array reads tokens from the stream until a phase change or the
// end of the script is reached.
class parse_array {
    token ** tokenarray;    // an array of token pointers
    istream * parse_stream; // where the tokens are read from
    int token_count;        // working counter used by build_array()
    int end;                // the size of the array
    phase p_section;        // of the program (preamble, etc.)
    void build_array();     // another recursive function
    // The destructor deletes every token and the array itself, so a
    // memberwise copy would delete them twice. Copying is disabled: these
    // two are declared private and intentionally never defined.
    parse_array(const parse_array &);
    parse_array & operator=(const parse_array &);
public:
    parse_array(istream & input);
    ~parse_array();
    int size() { return end; }     // how big is it?
    token & operator[](int index); // select a token
    phase section() { return p_section; }
};
[LISTING SEVEN]
// PARSE.CXX: class parse function definitions
// by Bruce Eckel,
#include "csascii.hxx"
#include "parse.hxx"
#include <ctype.h>
#include <stdlib.h>
// The following are "file static," which means no one outside
// this file can know about them. This is the meaning when a
// global variable is declared "static."
// Stream the token being scanned reads from; set by the token constructor.
static istream * tokenstream;
// Reset to 0 by the token constructor; get_token() counts the characters
// of a string token in it and then uses it as the index while storing them.
static int length; // to remember size of string
// Incremented by getnext() on every newline; printed by token::error().
static int line_number = 1; // line counting for errors
// Raised by "@?" and lowered by "@."; must be zero when "@end" is scanned.
static int if_counter = 0; // monitors "if" statement nesting
static phase program_section = preamble; // ... until @main
// Set when "@end" is scanned; getnext() reports an error if asked to read
// after that.
static int end_of_file = 0; // zero means not end of file
// Scan one token from "input".
token::token(istream & input) {
    // Reset the shared scanner state, then start the descent.
    length = 0;             // no characters collected yet
    tokenstream = &input;   // the helpers read from this stream
    get_token(); // recursively get characters to end of token
}
// Free the text of a string token; other token types own no heap memory.
token::~token() {
    if (ttype == string)
        delete [] literal;  // allocated with "new unsigned char[...]",
                            // so the array form of delete is required
}
void token::error(char * msg, char * msg2) {
cerr << "token error on line " << line_number << ": " <<
msg << " " << msg2 << "\n";
exit(1);
}
// Write a token to an output stream: string tokens print their text,
// fieldnumber tokens print a diagnostic line, other types print nothing.
ostream & operator<<(ostream &s, token &t) {
    if (t.ttype == string) {
        s << (char *)t.literal;
    } else if (t.ttype == fieldnumber) {
        // only for testing
        s << " fieldnumber: " << t.fieldnum << "\n";
    }
    return s;
}
// Get a character from the tokenstream, checking for
// end-of-file and newlines
// Read the next script character into c.
// It is an error to read once "@end" has been seen, or to run out of
// input before it. Newlines are counted for error messages.
void token::getnext(char & c) {
    if (end_of_file != 0) {
        error("attempt to read after @end statement\n",
              "missing @conclusion ?");
    }
    int ran_out = (tokenstream->get(c)).eof();
    if (ran_out) {
        error("@end statement missing");
    }
    if (c == '\n') {
        line_number++; // keep track of the line count
    }
}
// See text for description of tokens
// Scan one token. Each call consumes one character (or one '@' command)
// and, for string tokens, recurses so that the string length is known
// when the terminating '@' is found; the buffer is allocated there and
// filled, last character first, as the calls return.
void token::get_token() {
    char c;
    // This happens when DEscending:
    getnext(c);
    if ( c == '@') {
        if (length == 0) { // length 0 means start of token
            getnext(c);
            switch(c) {
            case '!': // comment line
                dumpline();  // dump the comment
                get_token(); // get a real token
                // The nested call has built the whole token, and this
                // call contributed no character to it. Return here:
                // falling through to the tail below would store a stray
                // byte in front of the string, or through a "literal"
                // pointer that does not exist for non-string tokens.
                return;
            case 'p' : case 'P' : // preamble statement
                if ( program_section != preamble )
                    error("only one preamble allowed");
                dumpline();  // just for looks, ignore it
                get_token(); // get a real token
                return;      // same reason as for '!' above
            case 'm' : case 'M' : // start of main loop
                dumpline(); // toss rest of line
                program_section = tmain;
                ttype = phase_change;
                return; // very simple token
            case 'c' : case 'C' : // start conclusion
                dumpline();
                program_section = conclusion;
                ttype = phase_change;
                return; // very simple token
            case 'e' : case 'E': // end statement
                end_of_file++; // set flag
                ttype = fieldnumber; // so destructor doesn't
                // delete free store for this token.
                if (if_counter)
                    error("unclosed 'if' statement(s)");
                return;
            case '(' :
                if ( program_section == preamble ||
                     program_section == conclusion )
                    error("@() not allowed in preamble or conclusion");
                fieldnum = get_value(')',"@()");
                ttype = fieldnumber;
                // This is a complete token, so quit
                return;
            case '<' :
                // the number becomes one character of the string
                c = get_value('>',"@<>");
                length++;
                get_token(); // get more...
                break;       // c is stored by the tail below
            case '?' : // beginning of an "if" statement
                if ( program_section == preamble ||
                     program_section == conclusion )
                    error("@? not allowed in preamble or conclusion");
                fieldnum = get_value('@',"@?@");
                ttype = if_;
                getnext(c); // just eat the colon
                if(c != ':')
                    error("@? must be followed by @: (then)");
                if_level = ++if_counter; // for nesting
                return;
            case '~' : // the "else" part of an "if" statement
                ttype = else_;
                if_level = if_counter;
                return;
            case '.' : // "endif" terminator of an "if" statement
                ttype = endif_;
                if_level = if_counter--;
                if(if_counter < 0)
                    error("incorrect nesting of if-then-else clauses");
                return;
            case '@' : // two '@' in a row mean print an '@'
                length++; // just leave '@' as the value of c
                get_token();
                break;    // c is stored by the tail below
            default:
                error("'@' must be followed by:",
                      "'(', '<', '?',':','~','.','p','m','c' or '@'");
            }
        } else { // an '@' in the middle of a string; terminate
            // the string. Putback() is part of the stream class.
            // It is only safe to put one character back on the input
            tokenstream->putback(c); // to be used by the next token
            // allocate space, put the null in and return up the stack
            literal = new unsigned char[length + 1]; // space for '\0'
            literal[length--] = '\0'; // string delimiter
            ttype = string; // what kind of token this is
            return; // back up the stack
        }
    } else { // not an '@', must be plain text
        length++;
        get_token();
    }
    // This occurs on the "tail" of the recursion:
    literal[length--] = c; // put chars in as we rise back up...
}
// This function is used by get_token when it encounters a @(
// or a @< to get a number until it finds "delimiter."
// If an error occurs, msg is used to notify the user what
// kind of statement it is.
unsigned char token::get_value(char delimiter, char * msg) {
char c;
char buf[5];
int i = 0;
while(getnext(c), c != delimiter) {
if (!isdigit(c))
error("must use only digits inside", msg);
buf[i++] = c;
}
buf[i] = 0;
return atoi(buf);
}
// Discard the rest of the current script line, newline included.
// Called for '@!' comments and for the '@p', '@m' and '@c' statements.
void token::dumpline() {
    char skipped;
    do {
        getnext(skipped); // just eat characters until newline
    } while (skipped != '\n');
}
// Since there's no way to know how big a parse_array is
// going to be until the entire tawkfile has been tokenized,
// the recursive approach is again used:
// Build the token array for the next section of the script on "input".
parse_array::parse_array(istream & input) {
    // Record which section is current before any token is scanned,
    // so we know at run-time which one this array belongs to.
    p_section = program_section;
    token_count = 0;
    parse_stream = &input;
    build_array();
}
void parse_array::build_array() {
token * tk = new token(*parse_stream);
if( ! end_of_file && tk->token_type() != phase_change) {
// normal token, not end of file or phase change:
token_count++;
// recursively get tokens until eof or phase change:
build_array();
} else { // end of file or phase change
// only done once per object:
// allocate memory and return up the stack
tokenarray = new token * [end = token_count];
if(token_count) token_count--; // only if non-zero
return;
}
tokenarray[token_count--] = tk; // performed on the "tail"
}
// Delete every token held in the array, then the array itself.
parse_array::~parse_array() {
    for (int i = 0; i < end; i++)
        delete tokenarray[i];
    delete [] tokenarray;  // allocated with "new token * [...]", so the
                           // array form of delete is required
}
// Select token number "index" (counting from 0).
// An index outside 0 .. size()-1 prints a message and exits.
token & parse_array::operator[](int index) {
    // a negative index is as far out of bounds as one that is too large
    if ( index < 0 || index >= end ) {
        cerr << "parse_array error: index " << index
             << " out of bounds\n";
        exit(1);
    }
    return *tokenarray[index];
}
[LISTING EIGHT]
// TAWK.CXX: parses a tawk script and reads an ascii file;
// generates results according to the tawk script.
// by Bruce Eckel,
#include "csascii.hxx"
#include "parse.hxx"
// tawk: parse the script named by argv[1], then run it over the
// comma-separated ASCII file named by argv[2]. The preamble is printed
// once, the main section once per record, the conclusion once.
// A trailing "-s" argument pages the output on the screen.
main (int argc, char * argv[]) {
    int screen = 0; // flag set true if screen output desired
    if (argc < 3) {
        cerr << "usage: tawk tawkfile datafile\n" <<
            "trailing -s pages output to screen";
        exit(1);
    }
    if (argc == 4) {
        if (argv[3][0] != '-') {
            cerr << "must use '-' before trailing flag\n";
            exit(1);
        } else
        if (argv[3][1] != 's') {
            cerr << "'s' is only trailing flag allowed";
            exit(1);
        } else
            screen++; // set screen output flag true
    }
    istream tawkfile(new filebuf->open(argv[1], input));
    parse_array Apreamble(tawkfile);   // the @preamble
    parse_array Amain(tawkfile);       // the @main section
    parse_array Aconclusion(tawkfile); // the @conclusion
    csascii datafile(argv[2]); // make a comma-separated ASCII
                               // object from the second arg
    // Declared once, up front, for the preamble and conclusion loops.
    // The conclusion loop used to rely on a variable declared in an
    // earlier for statement still being in scope afterwards.
    int i;
    // ------ @preamble ------
    for (i = 0; i < Apreamble.size(); i++)
        cout << Apreamble[i]; // preamble can only contain strings
    if(screen) {
        // ANSI reverse video sequence:
        cout << chr(27) << "[7m" << "press any key" <<
            chr(27) << "[0m";
        getch();
    }
    // ------ The Central Loop (@main) -------
    do { // for each record in the data file
        if(screen) cout << chr(27) << "[2J"; // ANSI clear screen
        for(int t = 0; t < Amain.size(); t++) {
            switch(Amain[t].token_type()) {
            case fieldnumber:
                cout << datafile[Amain[t].field_number()];
                break;
            case string:
                cout << Amain[t];
                break;
            // The if_ and else_ cases get their own blocks so that no
            // case label jumps past an initialized declaration.
            case if_: {
                int fn = Amain[t].field_number();
                if (datafile[fn].size() == 0) { // conditional false
                    int level = Amain[t].nesting_level();
                    // find the "else" statement on the same level:
                    while ( !(Amain[t].token_type() == else_
                              && Amain[t].nesting_level() == level))
                        t++;
                } // conditional true -- just continue
                break;
            }
            case else_: { // an "if" conditional was true so skip
                          // all the statements in the "else" clause
                int level = Amain[t].nesting_level();
                // find the "endif" statement on the same level:
                while ( !(Amain[t].token_type() == endif_
                          && Amain[t].nesting_level() == level))
                    t++;
                break;
            }
            case endif_: // after performing the "else" clause
                break;   // ignore it; only used to find the end
                         // of the conditional when "if" is true.
            default: // should never happen (caught in parsing)
                cerr << "unknown statement encountered at run-time\n";
                exit(1);
            }
        }
        if(screen) {
            cout << chr(27) << "[7m" <<
                "press a key (ESC quits)" << chr(27) << "[0m";
            if( getch() == 27) break;
        }
    } while (datafile.next()); // matches do { ...
    // ------ @conclusion ------
    for ( i = 0; i < Aconclusion.size(); i++)
        cout << Aconclusion[i]; //conclusion contains only strings
}
[LISTING NINE]
# makefile for tawk.exe & lookup.exe
# Zortech C++:
CPP = ztc
# Glockenspiel C++ w/ MSC 4:
#CPP = ccxx !4
all: tawk.exe lookup.exe
tawk.exe : tawk.obj parse.obj csascii.obj field.obj
	$(CPP) tawk.obj parse.obj csascii.obj field.obj
# lookup.cxx includes csascii.hxx, which in turn includes field.hxx,
# so a change to either header must rebuild lookup.exe
lookup.exe : lookup.cxx csascii.hxx field.hxx csascii.obj field.obj
	$(CPP) lookup.cxx csascii.obj field.obj
tawk.obj : tawk.cxx parse.hxx csascii.hxx field.hxx
	$(CPP) -c tawk.cxx
# parse.cxx includes csascii.hxx (and through it field.hxx) as well as
# parse.hxx
parse.obj : parse.cxx parse.hxx csascii.hxx field.hxx
	$(CPP) -c parse.cxx
csascii.obj : csascii.cxx csascii.hxx field.hxx
	$(CPP) -c csascii.cxx
field.obj : field.cxx field.hxx
	$(CPP) -c field.cxx
[LISTING TEN]
@! REFORM.TWK
@! A tawk script to reformat a comma-separated ASCII file
@! with 6 fields. This creates a new CS-ASCII file with
@! fields 4 and 5 combined.
@! The conditional in the last output field puts a blank between
@! fields 4 and 5 only when field 4 is not empty.
@main
"@(0)","@(1)","@(2)","@(3)","@(4)@?4@: @~@.@(5)"
@conclusion
@end
[LISTING ELEVEN]
@! WALLET.TWK
@! Tawkfile to create a tiny phone listing for a wallet
@! on a Hewlett-Packard Laserjet-compatible printer
@! From a comma-separated ASCII file generated by a DBMS
@preamble
@<27>&l5C@! approximately 10 lines per inch
@<27>(s16.66H@! small typeface, built into Laserjet
@main
@! last, first, (area code) phone1
@(0),@(1)(@(2))@?3@:@(3)
@! phone2, if it exists
@?4@:@(4)
@~@.@~@?4@:@(4)
@~
@.@.@conclusion
@<27>E @! Reset the Laserjet
@end
[EXAMPLE 1]
// Minimal class example: one private int, a constructor that sets it,
// and a member function that prints it.
class tiny {
    // private stuff here (this is a comment)
    int i;
public: // public stuff here:
    // Declared void: the function prints and has nothing to return.
    void print() { // an "in-line" function
        printf("i = %d\n",i);
    }
    tiny(int j); // constructors have the class name
    ~tiny() {}   // destructors use a tilde
}; // classes end with a brace and a semicolon
tiny::tiny(int j) { // non inline definition
    i = j; // remember the value for print()
}
main() {
tiny A(2); // implicit constructor call
// A.i = 30; // error! private member
A.print(); // calling a member function
// implicit destructor call at end of scope
}
[EXAMPLE 2]
#include <stream.hxx> // cout automatically defined
// Print two lines using the predefined cout stream, mixing string and
// integer output.
main() {
    // the blank before "today" keeps the number from running into the word
    cout << "Hello, world!\n" << "I am "
         << 6 << " today!\n";
}
[EXAMPLE 3]
// Open the file named by argv[1] through a separate filebuf so that a
// failed open can be detected before the stream is built on top of it.
filebuf f1;
if (!f1.open(argv[1], input)) {
    // open() gave back 0: report the file name and give up
    cout << "cannot open " << argv[1] << "\n";
    exit(1);
}
istream infile(&f1);
[EXAMPLE 4]
"Ball","Mike","Oregon Software C++ Compiler"
"Bright","Walter","Zortech C++ Compiler"
"Carolan","John","Glockenspiel C++ Translator"
"Stroustrup","Bjarne","AT&T, C++ Creator"
"Tiemann","Michael","Free Software Foundation C++ Compiler"