home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
HTML - Publishing on the Internet
/
html_cdrom.iso
/
tools
/
html
/
linux
/
check
/
dehtml.pl
< prev
next >
Wrap
Perl Script
|
1995-01-18
|
6KB
|
130 lines
#!/usr/local/bin/perl
#dehtml.pl: Removes all HTML tags from file, preliminary to spell check; common
# ampersand "&entities;" are also resolved into single characters.
#
# Typical use:
#
# perl dehtml.pl infile.html > outfile.txt
#
# This program processes all files on the command line to STDOUT; to process a
# number of files individually, use the iteration mechanism of your shell; for
# example:
#
# for a in *.html ; do perl dehtml.pl $a > otherdir/$a ; done
#
# in Unix sh, or:
#
# for %a in (*.htm) do call dehtml %a otherdir\%a
#
# in MS-DOS, where dehtml.bat is the following one-line batch file:
#
# perl dehtml.pl %1 > %2
#
# Copyright H. Churchyard 1994 -- freely redistributable.
#
# Version 1.0 11/27/94 -- Tested with 4.03[56] on SunOS and DEC Alpha OSF/1,
# and MacPerl 4.13. Included in htmlchek 3.0 release.
# Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
# string such as "é" to be reduced into a single character;
# added "". Included in htmlchek 3.01 release.
# Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix. Included
# in htmlchek 4.0 release.
#
# This program is a port to perl of the original dehtml.awk (the port was
# fairly mechanical, so programming style and efficency may not be high).
#
eval "exec /usr/local/bin/perl -S $0 $*"
if $running_under_some_shell;
# this emulates #! processing on NIH machines.
$[ = 1; # set array base to 1
$, = ' '; # set output field separator
$\ = "\n"; # set output record separator
#
$amp{' '} = "\040"; $amp{' '}="\040";
$amp{'"'} = "\042"; $amp{'"'}="\042";
$amp{'<'} = "\074"; $amp{'<'}="\074"; $amp{'>'} = "\076";
$amp{'>'}="\076"; $amp{'À'}="\300"; $amp{'Á'}="\301";
$amp{'Â'}="\302"; $amp{'Ã'}="\303"; $amp{'Ä'}="\304";
$amp{'Å'}="\305"; $amp{'Æ'}="\306"; $amp{'Ç'}="\307";
$amp{'È'}="\310"; $amp{'É'}="\311"; $amp{'Ê'}="\312";
$amp{'Ë'}="\313"; $amp{'Ì'}="\314"; $amp{'Í'}="\315";
$amp{'Î'}="\316"; $amp{'Ï'}="\317"; $amp{'Ð'}="\320";
$amp{'Ñ'}="\321"; $amp{'Ò'}="\322"; $amp{'Ó'}="\323";
$amp{'Ô'}="\324"; $amp{'Õ'}="\325"; $amp{'Ö'}="\326";
$amp{'Ø'}="\330"; $amp{'Ù'}="\331"; $amp{'Ú'}="\332";
$amp{'Û'}="\333"; $amp{'Ü'}="\334"; $amp{'Ý'}="\335";
$amp{'Þ'}="\336"; $amp{'ß'}="\337"; $amp{'à'}="\340";
$amp{'á'}="\341"; $amp{'â'}="\342"; $amp{'ã'}="\343";
$amp{'ä'}="\344"; $amp{'å'}="\345"; $amp{'æ'}="\346";
$amp{'ç'}="\347"; $amp{'è'}="\350"; $amp{'é'}="\351";
$amp{'ê'}="\352"; $amp{'ë'}="\353"; $amp{'ì'}="\354";
$amp{'í'}="\355"; $amp{'î'}="\356"; $amp{'ï'}="\357";
$amp{'ð'}="\360"; $amp{'ñ'}="\361"; $amp{'ò'}="\362";
$amp{'ó'}="\363"; $amp{'ô'}="\364"; $amp{'õ'}="\365";
$amp{'ö'}="\366"; $amp{'ø'}="\370"; $amp{'ù'}="\371";
$amp{'ú'}="\372"; $amp{'û'}="\373"; $amp{'ü'}="\374";
$amp{'ý'}="\375"; $amp{'þ'}="\376"; $amp{'ÿ'}="\377";
$amp{'®'}="\256"; $amp{'©'}="\251"; $amp{'£'} = "\243";
$amp{''}="-";
#
# Main
#
# Variable ``$state'' is one if unresolved `<', zero otherwise.
#
$stuperlRS = $/;
while (<>) {
if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
chop;} # be unterminated.
$line = ''; $errstr = ''; $erra = 0; $errb = 0;
$currsrch = 1; $txtbeg = 1;
while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
($RSTART = length($`)+1)) != 0) {
$currsrch = ($currsrch + $RSTART);
if (substr($_, ($currsrch - 1), 1) eq '<') {
if ($state) {
if (!$erra) {
$errstr = ($errstr .
"&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n");
$erra = 1;}}
else {
if (($currsrch > length($_)) ||
(substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
if (!$errb) {
$errstr = ($errstr .
"&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n");
$errb = 1;}}
else {
if ($currsrch > ($txtbeg + 1)) {
$line = ($line . substr($_, $txtbeg,
($currsrch - ($txtbeg + 1))));}
$state = 1;}}}
else {
if (substr($_, ($currsrch - 1), 1) eq '>') {
if ($state == 0) {
next;}
else {$txtbeg = $currsrch; $state = 0;}}
else {print 'Internal error, ignore';}}}
#At EOL:
if ((!$state) && ($txtbeg <= length($_))) {
$line = ($line . substr($_, $txtbeg));}
if ($line =~ /?[-0-9a-zA-Z.]*;/) {
foreach $X (keys %amp) {
$s_ = $amp{$X}; $line =~ s/$X/$s_/g;
if ($line !~ /&/) {
last;}}
$line =~ s/&(#38|amp);/&/g;}
if (($line) || ((!$state) && ($_ =~ /^$/))) {
if ((!$state) || ($errstr) || ($line =~ /[ \t]$/))
{print $line;}
else {printf "%s", $line;}}
if ($errstr) {
printf '%s', $errstr;}}
#
#Minor bug: &g<X>t; will translate to a `>' character!
#
#END routine:
#
if ($state) {
print "&&^Was awaiting a `>' ERROR! at END^&&";}
##EOF