HTML - Publishing on the Internet

home *** CD-ROM | disk | FTP | other *** search

/ HTML - Publishing on the Internet / html_cdrom.iso / tools / html / linux / check / dehtml.pl < prev next >

Wrap

Perl Script | 1995-01-18 | 6KB | 130 lines

#!/usr/local/bin/perl #dehtml.pl: Removes all HTML tags from file, preliminary to spell check; common # ampersand "&entities;" are also resolved into single characters. # # Typical use: # # perl dehtml.pl infile.html > outfile.txt # # This program processes all files on the command line to STDOUT; to process a # number of files individually, use the iteration mechanism of your shell; for # example: # # for a in *.html ; do perl dehtml.pl $a > otherdir/$a ; done # # in Unix sh, or: # # for %a in (*.htm) do call dehtml %a otherdir\%a # # in MS-DOS, where dehtml.bat is the following one-line batch file: # # perl dehtml.pl %1 > %2 # # Copyright H. Churchyard 1994 -- freely redistributable. # # Version 1.0 11/27/94 -- Tested with 4.03[56] on SunOS and DEC Alpha OSF/1, # and MacPerl 4.13. Included in htmlchek 3.0 release. # Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a # string such as "é" to be reduced into a single character; # added "". Included in htmlchek 3.01 release. # Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix. Included # in htmlchek 4.0 release. # # This program is a port to perl of the original dehtml.awk (the port was # fairly mechanical, so programming style and efficency may not be high). # eval "exec /usr/local/bin/perl -S $0 $*" if $running_under_some_shell; # this emulates #! processing on NIH machines. $[ = 1; # set array base to 1 $, = ' '; # set output field separator $\ = "\n"; # set output record separator # $amp{' '} = "\040"; $amp{' '}="\040"; $amp{'"'} = "\042"; $amp{'"'}="\042"; $amp{'<'} = "\074"; $amp{'<'}="\074"; $amp{'>'} = "\076"; $amp{'>'}="\076"; $amp{'À'}="\300"; $amp{'Á'}="\301"; $amp{'Â'}="\302"; $amp{'Ã'}="\303"; $amp{'Ä'}="\304"; $amp{'Å'}="\305"; $amp{'Æ'}="\306"; $amp{'Ç'}="\307"; $amp{'È'}="\310"; $amp{'É'}="\311"; $amp{'Ê'}="\312"; $amp{'Ë'}="\313"; $amp{'Ì'}="\314"; $amp{'Í'}="\315"; $amp{'Î'}="\316"; $amp{'Ï'}="\317"; $amp{'Ð'}="\320"; $amp{'Ñ'}="\321"; $amp{'Ò'}="\322"; $amp{'Ó'}="\323"; $amp{'Ô'}="\324"; $amp{'Õ'}="\325"; $amp{'Ö'}="\326"; $amp{'Ø'}="\330"; $amp{'Ù'}="\331"; $amp{'Ú'}="\332"; $amp{'Û'}="\333"; $amp{'Ü'}="\334"; $amp{'Ý'}="\335"; $amp{'Þ'}="\336"; $amp{'ß'}="\337"; $amp{'à'}="\340"; $amp{'á'}="\341"; $amp{'â'}="\342"; $amp{'ã'}="\343"; $amp{'ä'}="\344"; $amp{'å'}="\345"; $amp{'æ'}="\346"; $amp{'ç'}="\347"; $amp{'è'}="\350"; $amp{'é'}="\351"; $amp{'ê'}="\352"; $amp{'ë'}="\353"; $amp{'ì'}="\354"; $amp{'í'}="\355"; $amp{'î'}="\356"; $amp{'ï'}="\357"; $amp{'ð'}="\360"; $amp{'ñ'}="\361"; $amp{'ò'}="\362"; $amp{'ó'}="\363"; $amp{'ô'}="\364"; $amp{'õ'}="\365"; $amp{'ö'}="\366"; $amp{'ø'}="\370"; $amp{'ù'}="\371"; $amp{'ú'}="\372"; $amp{'û'}="\373"; $amp{'ü'}="\374"; $amp{'ý'}="\375"; $amp{'þ'}="\376"; $amp{'ÿ'}="\377"; $amp{'®'}="\256"; $amp{'©'}="\251"; $amp{'£'} = "\243"; $amp{''}="-"; # # Main # # Variable ``$state'' is one if unresolved `<', zero otherwise. # $stuperlRS = $/; while (<>) { if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to chop;} # be unterminated. $line = ''; $errstr = ''; $erra = 0; $errb = 0; $currsrch = 1; $txtbeg = 1; while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) && ($RSTART = length($`)+1)) != 0) { $currsrch = ($currsrch + $RSTART); if (substr($_, ($currsrch - 1), 1) eq '<') { if ($state) { if (!$erra) { $errstr = ($errstr . "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n"); $erra = 1;}} else { if (($currsrch > length($_)) || (substr($_, $currsrch, 1) =~ /^[ \t]$/)) { if (!$errb) { $errstr = ($errstr . "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n"); $errb = 1;}} else { if ($currsrch > ($txtbeg + 1)) { $line = ($line . substr($_, $txtbeg, ($currsrch - ($txtbeg + 1))));} $state = 1;}}} else { if (substr($_, ($currsrch - 1), 1) eq '>') { if ($state == 0) { next;} else {$txtbeg = $currsrch; $state = 0;}} else {print 'Internal error, ignore';}}} #At EOL: if ((!$state) && ($txtbeg <= length($_))) { $line = ($line . substr($_, $txtbeg));} if ($line =~ /&#?[-0-9a-zA-Z.]*;/) { foreach $X (keys %amp) { $s_ = $amp{$X}; $line =~ s/$X/$s_/g; if ($line !~ /&/) { last;}} $line =~ s/&(#38|amp);/&/g;} if (($line) || ((!$state) && ($_ =~ /^$/))) { if ((!$state) || ($errstr) || ($line =~ /[ \t]$/)) {print $line;} else {printf "%s", $line;}} if ($errstr) { printf '%s', $errstr;}} # #Minor bug: &g<X>t; will translate to a `>' character! # #END routine: # if ($state) { print "&&^Was awaiting a `>' ERROR! at END^&&";} ##EOF