HTML - Publishing on the Internet

home *** CD-ROM | disk | FTP | other *** search

/ HTML - Publishing on the Internet / html_cdrom.iso / tools / html / linux / check / htmlsrpl.pl < prev next >

Wrap

Perl Script | 1995-02-18 | 11KB | 259 lines

#!/usr/local/bin/perl #htmlsrpl.pl: HTML-aware search-and-replace; acts either only outside HTML/SGML # tags, or only within HTML/SGML tags; can also upper-case tag names # # Typical use: # # perl htmlsrpl.pl [options] infile.html > outfile.html # # Where options have the form "option=value"; all options should precede # filename arguments on the command line. (See the documentation.) # # Copyright H. Churchyard 1994, 1995 -- freely redistributable. This code is # awk-influenced (so sue me). Tested under Perl 4 (I'm still not sure whether # the fact that "s/$x/$y/" is equivalent to "s/$x/$y/e" is a bug or not). # # Version 1.0 12/21/94 -- Preliminary version. # Version 1.01 12/22/94 -- Minor bugfix. # Version 1.1 1/7/95 -- Added inside=, inmost=, oustside= , etc. Included in # htmlchek 4.0 release. # Version 1.11 1/22/95 -- Added "Changed!/Unchanged" final status messages. # Included in htmlchek 4.1 release. # eval "exec /usr/local/bin/perl -S $0 $*" if $running_under_some_shell; # this emulates #! processing on NIH machines. #process any FOO=bar switches $old= ''; $new = ''; $intags = 0; $regexp = 0; $regeval = 0; $upcase = 0; $lines = 0; $delete = 0; $case = 0; $slash=0; $inmost=''; $inside = ''; $outside = ''; eval '$'.$1.'$2;' while $ARGV[0] =~ /^(old=|new=|intags=|lines=|regexp=|regeval=|upcase=|delete=|case=|slash=|inmost=|inside=|outside=)(.*)/ && shift; $[ = 1; # set array base to 1 $, = ' '; # set output field separator $\ = "\n"; # set output record separator foreach $X (@ARGV) { if ($X =~ /^[^=]+=/) { print STDERR "Apparent misspelled or badly-placed command-line option $&"; print STDERR "Attempting to continue anyway...";}} $filstr = join(' ',@ARGV); $changed = 0; if ($lines) {$/ = "\0777"; $* = 1;} else {$/ = "\n";} if (($outside) && (!(($inside) || ($inmost)))) {$applyit = 1;} else {$applyit = 0;} # $unpair{'!--'} = 1; $unpair{'!DOCTYPE'} = 1; $unpair{'BASE'} = 1; $unpair{'BR'} = 1; $unpair{'COMMENT'} = 1; $unpair{'HR'} = 1; $unpair{'IMG'} = 1; $unpair{'INPUT'} = 1; $unpair{'ISINDEX'} = 1; $unpair{'LINK'} = 1; $unpair{'META'} = 1; $unpair{'NEXTID'} = 1; $unpair{'ATOP'} = 1; $unpair{'LEFT'} = 1; $unpair{'OVER'} = 1; $unpair{'OVERLAY'} = 1; $unpair{'RIGHT'} = 1; $unpair{'TAB'} = 1; $unpair{'BASEFONT'} = 1; $unpair{'WBR'} = 1; $nestvar = 0; $numins = 0; $numout = 0; if ($inmost) { $inmost =~ tr/a-z/A-Z/; if ($inmost =~ /[^-.a-zA-Z0-9]/) { die 'Non-alphanumeric value of inmost= was specified';} if (defined $unpair{$inmost}) { die "Non-pairing tag $inmost specified as value of inmost=";}} if ($inside) { $numins = (@inarr = split(/,/, $inside)); for ($i = 1; $i <= $numins; ++$i) { $inarr[$i] =~ tr/a-z/A-Z/; if ((!$inarr[$i]) || ($inarr[$i] =~ /[^-.a-zA-Z0-9]/)) { die 'Non-alphanumeric value of inside= was specified';} if (defined $xxin{$inarr[$i]}) { die 'Duplicate values of inside= were specified';} if (defined $unpair{$inarr[$i]}) { die "Non-pairing tag $inarr[$i] specified as value of inside=";} else { $xxin{$inarr[$i]} = 1;}}} if ($outside) { $numout = (@outarr = split(/,/, $outside)); for ($i = 1; $i <= $numout; ++$i) { $outarr[$i] =~ tr/a-z/A-Z/; if ((!$outarr[$i]) || ($outarr[$i] =~ /[^-.a-zA-Z0-9]/)) { die 'Non-alphanumeric value of outside= was specified';} if (defined $xxout{$outarr[$i]}) { die 'Duplicate values of outside= were specified';} if (defined $xxin{$outarr[$i]}) { die "Tagname $outarr[$i] specified as both outside= and inside=";} if (defined $unpair{$outarr[$i]}) { die "Non-pairing tag $outarr[$i] specified as value of outside=";} else { $xxout{$outarr[$i]} = 1;}}} # if ((!$old) && (!$upcase)) {die "No `old=' string was specified";} if (($delete) && (($new) || ($regexp) || ($regeval))) { die "Incompatible option specified with `delete=1'";} if (($regexp) && ($regeval)) {die 'Both regexp=1 and regeval=1 specified';} if (($case) && (!$delete) && (!$regexp) && (!$regeval)) { die 'Option case=1 specified without any of regexp=1, regeval=1, or delete=1 also being specified';} if ($delete) {$slash=1;} if (($upcase) || ($delete) || ($slash)) {$intags = 1;} # # Main # # Variable ``$state'' is one if there is an unresolved `<', zero otherwise. # ``$lastbeg'' is zero if no `<' has ocurred in $_, otherwise it points to the # character immediately after the last `<' encountered. # $xRS = "\n"; $state = 0; while (<>) { if ($_ =~ /$xRS$/o) { # strip record separator, allow for last line to chop;} # be unterminated. $lastbeg = 0; $currsrch = 1; $txtbeg = 1; while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) && ($RSTART = length($`)+1)) != 0) { $currsrch = ($currsrch + $RSTART); if (substr($_, ($currsrch - 1), 1) eq '<') { if ($state) { print "\nERROR!"; die "Multiple `<' without `>' ERROR!";} else { if (($currsrch > length($_)) || (substr($_, $currsrch, 1) =~ /^[ \t]$/)) { print "\nERROR!"; die "Whitespace after `<': Bad SGML syntax ERROR!";} else { if ($currsrch > ($txtbeg + 1)) { if ((!$intags) && (($applyit) || (!(($inmost) || ($numins) || ($numout))))) { printf "%s", &changeht(substr($_, $txtbeg, ($currsrch - ($txtbeg + 1))));} else { printf "%s", substr($_, $txtbeg, ($currsrch - ($txtbeg + 1)));}} $deletit = 0; $lastbeg = $currsrch; $state = 1;}}} else { if (substr($_, ($currsrch - 1), 1) eq '>') { if ($state == 0) { next;} #`>' without `<' else { &parsetag($currsrch - 1); if (!$deletit) {printf "%s", '>';} $txtbeg = $currsrch; $state = 0;}} else {die 'Internal error, ignore';}}} #At EOL: if ($state == 1) { &parsetag(length($_) + 1);} elsif ($txtbeg <= length($_)) { if ((!$intags) && (($applyit) || (!(($inmost) || ($numins) || ($numout))))) { printf "%s", &changeht(substr($_, $txtbeg));} else {printf "%s", substr($_, $txtbeg);}} if (!(($state) && ($deletit))) {printf "\n";}} # #END routine: # if ($state) { die "Was awaiting a `>' ERROR! at END";} if ($changed) { print STDERR "Changed! on input", $filstr;} else { print STDERR "Unchanged on input", $filstr;} # # sub parsetag { local($inp) = @_; $docap = $lastbeg; if (!$lastbeg) { $strx = '' ; $lastbeg = 1;} else {$strx= '<';} if ($inp != $lastbeg) { $str = &upc(substr($_, $lastbeg, ($inp - $lastbeg))); if (($oldapply) || (!(($inmost) || ($numins) ||($numout)))) { if (($slash) && ($docap) && ($str =~ /^\//)) {$strx = ($strx . '/'); $str= substr($str, 2);} if ($delete) { if ($docap) {&getdel($str);} if (!$deletit) {printf "%s%s", $strx, $str;} else {$changed=1;}} else { if (($intags) && ($old)) {printf "%s%s", $strx, &changeht($str);} else {printf "%s%s", $strx, $str;}}} else {printf "%s%s", $strx, $str;}}} # sub upc { local($upcx) = @_; if ($docap) { $upcx =~ /^[^ \t\n]+/; ($tagname = $&) =~ tr/a-z/A-Z/; if ($upcase) {$upcx = ($tagname . $');} $oldapply = $applyit; #tag stack accounting if ((($inmost) || ($numins)|| ($numout)) && (!(defined $unpair{$tagname}))) { $applyit = 1; $clostag = ''; if ($tagname !~ /^\//) { ++$nestvar; $nestarr[$nestvar] = $tagname;} else { $clostag = substr($tagname,2); while ($nestarr[$nestvar] ne $clostag) { --$nestvar; if ($nestvar <= 0) { print "\nERROR!"; die "/$clostag tag encountered when apparently not in $clostag element";}} --$nestvar;} if (($inmost) && ($nestarr[$nestvar] ne $inmost)) { $applyit = 0;} if ($numins) { if ($nestvar < $numins) {$applyit = 0;} else { $mask = 1; $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " "); foreach $X (keys %xxin) { if (index($stackstr,(" " . $X . " ")) <= 0) { $mask = 0;}} if (($applyit) && ($mask)) {$applyit = 1;} else {$applyit = 0;}}} if (($numout) && ($nestvar)) { $mask = 1; $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " "); foreach $X (keys %xxout) { ##print $stackstr,"XX",(" " . $X . " ");##debugXX if (index($stackstr,(" " . $X . " ")) > 0) { $mask = 0;}} if (($applyit) && ($mask)) {$applyit = 1;} else {$applyit = 0;}} if ($clostag) {$oldapply = $applyit;}}} return $upcx;} # sub getdel { local($inz) = @_; $inz =~ /^[^ \t\n]+/; $X = $&; if ($case) { if ($X =~ /$old/io) { $deletit = 1;}} else { if ($X =~ /$old/o) { $deletit = 1;}}} # sub changeht { local($field) = @_; if ($regeval) { if ($case) { $X = ($field =~ s/$old/$new/eeigo);} else { $X = ($field =~ s/$old/$new/eego);} if ($X) {$changed = 1;} return $field;} elsif ($regexp) { if ($case) { $X = ($field =~ s/$old/$new/igo);} else { $X = ($field =~ s/$old/$new/go);} if ($X) {$changed = 1;} return $field;} else { $startf = 1; $newf = ''; while (($ndx = index(substr($field,$startf),$old)) > 0) { $changed = 1; $newf = ($newf . substr($field,$startf,($ndx-1)) . $new); $startf = ($startf + ($ndx-1) + length($old));} $newf = ($newf . substr($field,$startf)); return $newf;}} ##EOF