HTML - Publishing on the Internet

home *** CD-ROM | disk | FTP | other *** search

/ HTML - Publishing on the Internet / html_cdrom.iso / tools / html / linux / check / dehtml.awk < prev next >

Wrap

Text File | 1995-01-21 | 5KB | 108 lines

#dehtml.awk: Removes all HTML tags from file, preliminary to spell check; common
# ampersand "&entities;" are also resolved into single characters.
#
# Typical use:
#
#   awk -f dehtml.awk infile.html > outfile.txt
#
#   This program is written in the ``awk'' programming language (on Sun systems
# and some others, non-archaic ``awk'' is called ``nawk'', so that ``nawk''
# should be used instead of ``awk'').  Also, a freely-redistributable ``awk''
# interpreter called ``gawk'', which is free of the bugs that some of the
# vendor-supplied ``awk''/``nawk'' programs suffer from, is available for most
# platforms, and as source from the FSF GNU project.
#
#   This program processes all files on the command line to STDOUT; to process a
# number of files individually, use the iteration mechanism of your shell; for
# example:
#
#   for a in *.html ; do awk -f dehtml.awk $a > otherdir/$a ; done
#
# in Unix sh, or:
#
#   for %a in (*.htm) do call dehtml %a otherdir\%a
#
# in MS-DOS, where dehtml.bat is the following one-line batch file:
#
#   gawk -f dehtml.awk %1 > %2
#
# Copyright H. Churchyard 1994, 1995 -- freely redistributable.
#
#  Version 1.0 11/27/94 -- Included in htmlchek 3.0 release.
#  Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
#   string such as "&amp;eacute;" to be reduced into a single character;
#   added "&shy;".  Included in htmlchek 3.01 release.
#  Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix.  Included
#   in htmlchek 4.0 release.
#
# Entity table.  Each index of amp[] is used by the per-line rule as a
# *dynamic regular expression* (first argument of gsub), and the element
# value is the replacement text.  Consequences:
#   - Named entities are spelled out literally ("&lt;", "&eacute;", ...).
#   - Numeric entities write the `#' as "[\043]" (octal 043 is `#'); this is
#     presumably so that no literal `#' appears in the program text.
#   - No index may be the empty string: an empty regex matches at every
#     position and gsub would insert the replacement between all characters.
#   - "&amp;" / "&#38;" are deliberately absent; the per-line rule resolves
#     them after this table has been applied, so that "&amp;eacute;" ends up
#     as the text "&eacute;" rather than as a single character.
#
#This will test the 8-bit-cleanliness of your awk:
BEGIN {
    # Space, quote and angle brackets (numeric and named forms).
    amp["&[\043]32;"] = "\040"
    amp["&nbsp;"]     = "\040"
    amp["&[\043]34;"] = "\042"
    amp["&quot;"]     = "\042"
    amp["&[\043]60;"] = "\074"
    amp["&lt;"]       = "\074"
    amp["&[\043]62;"] = "\076"
    amp["&gt;"]       = "\076"

    # ISO Latin-1 upper-case letters, 0300-0337 (0327 is not in the table).
    amp["&Agrave;"] = "\300"
    amp["&Aacute;"] = "\301"
    amp["&Acirc;"]  = "\302"
    amp["&Atilde;"] = "\303"
    amp["&Auml;"]   = "\304"
    amp["&Aring;"]  = "\305"
    amp["&AElig;"]  = "\306"
    amp["&Ccedil;"] = "\307"
    amp["&Egrave;"] = "\310"
    amp["&Eacute;"] = "\311"
    amp["&Ecirc;"]  = "\312"
    amp["&Euml;"]   = "\313"
    amp["&Igrave;"] = "\314"
    amp["&Iacute;"] = "\315"
    amp["&Icirc;"]  = "\316"
    amp["&Iuml;"]   = "\317"
    amp["&ETH;"]    = "\320"
    amp["&Ntilde;"] = "\321"
    amp["&Ograve;"] = "\322"
    amp["&Oacute;"] = "\323"
    amp["&Ocirc;"]  = "\324"
    amp["&Otilde;"] = "\325"
    amp["&Ouml;"]   = "\326"
    amp["&Oslash;"] = "\330"
    amp["&Ugrave;"] = "\331"
    amp["&Uacute;"] = "\332"
    amp["&Ucirc;"]  = "\333"
    amp["&Uuml;"]   = "\334"
    amp["&Yacute;"] = "\335"
    amp["&THORN;"]  = "\336"
    amp["&szlig;"]  = "\337"

    # ISO Latin-1 lower-case letters, 0340-0377 (0367 is not in the table).
    amp["&agrave;"] = "\340"
    amp["&aacute;"] = "\341"
    amp["&acirc;"]  = "\342"
    amp["&atilde;"] = "\343"
    amp["&auml;"]   = "\344"
    amp["&aring;"]  = "\345"
    amp["&aelig;"]  = "\346"
    amp["&ccedil;"] = "\347"
    amp["&egrave;"] = "\350"
    amp["&eacute;"] = "\351"
    amp["&ecirc;"]  = "\352"
    amp["&euml;"]   = "\353"
    amp["&igrave;"] = "\354"
    amp["&iacute;"] = "\355"
    amp["&icirc;"]  = "\356"
    amp["&iuml;"]   = "\357"
    amp["&eth;"]    = "\360"
    amp["&ntilde;"] = "\361"
    amp["&ograve;"] = "\362"
    amp["&oacute;"] = "\363"
    amp["&ocirc;"]  = "\364"
    amp["&otilde;"] = "\365"
    amp["&ouml;"]   = "\366"
    amp["&oslash;"] = "\370"
    amp["&ugrave;"] = "\371"
    amp["&uacute;"] = "\372"
    amp["&ucirc;"]  = "\373"
    amp["&uuml;"]   = "\374"
    amp["&yacute;"] = "\375"
    amp["&thorn;"]  = "\376"
    amp["&yuml;"]   = "\377"

    # Symbols.  The soft hyphen is rendered as an ordinary hyphen.
    amp["&reg;"]       = "\256"
    amp["&copy;"]      = "\251"
    amp["&[\043]163;"] = "\243"
    amp["&shy;"]       = "-"
}
#
# Main
#
# Variable ``state'' is one if unresolved `<', zero otherwise.
#
# Per-record rule: strip tags from the current input line.
#
# ``state'' is the only value carried from one line to the next (a tag may
# span lines); everything else is reset for each record:
#   line      text collected from this record with the tags removed
#   errstr    diagnostics gathered for this record (each kind at most once,
#             guarded by erra / errb)
#   currsrch  index in $0 just past the most recently found `<' or `>'
#   txtbeg    index in $0 where the current run of plain text begins
{
    line = ""
    errstr = ""
    erra = 0
    errb = 0
    currsrch = 1
    txtbeg = 1

    while (match(substr($0, currsrch), /[<>]/) != 0) {
        # RSTART is relative to the substring, so this lands one past the
        # bracket that was just found.
        currsrch = currsrch + RSTART
        bracket = substr($0, currsrch - 1, 1)

        if (bracket == "<") {
            if (state) {
                # Already inside a tag: report once per line, leave state as is.
                if (!erra) {
                    errstr = errstr "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n"
                    erra = 1
                }
            } else if ((currsrch > length($0)) || (substr($0, currsrch, 1) ~ /^[ \t]$/)) {
                # `<' at end of line or followed by blank/tab: not a tag
                # opener, so it stays part of the text.
                if (!errb) {
                    errstr = errstr "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n"
                    errb = 1
                }
            } else {
                # A tag opens here: keep the text in front of it, if any.
                if (currsrch > (txtbeg + 1)) {
                    line = line substr($0, txtbeg, currsrch - (txtbeg + 1))
                }
                state = 1
            }
        } else if (bracket == ">") {
            # A `>' outside any tag is ordinary text and needs no action.
            if (state != 0) {
                txtbeg = currsrch
                state = 0
            }
        } else {
            print "Internal error, ignore"
        }
    }

    # At end of line: keep the trailing text unless a tag is still open.
    if ((!state) && (txtbeg <= length($0))) {
        line = line substr($0, txtbeg)
    }

    # Resolve entities.  Each index of amp[] is applied as a regex; the loop
    # stops early once no `&' is left.  "&amp;" / "&#38;" are handled last.
    if (line ~ /&[\043]?[-0-9a-zA-Z.]*;/) {
        for (x in amp) {
            gsub(x, amp[x], line)
            if (line !~ /&/) {
                break
            }
        }
        gsub(/&([\043]38|amp);/, "\\&", line)
    }

    # Emit the text.  The newline is omitted only when a tag is still open,
    # nothing was reported, and the text does not end in a blank or tab.
    # Empty input lines outside a tag are passed through as empty lines.
    if ((line) || ((!state) && ($0 ~ /^$/))) {
        if ((!state) || (errstr) || (line ~ /[ \t]$/)) {
            print line
        } else {
            printf "%s", line
        }
    }
    if (errstr) {
        printf "%s", errstr
    }
}
#
#Minor bug: &g<X>t; will translate to a `>' character!
#
#
# After the last record: complain if a tag was never closed.
END {
    if (state) {
        print "&&^Was awaiting a `>' ERROR! at END^&&"
    }
}
##EOF