home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
HTML - Publishing on the Internet
/
html_cdrom.iso
/
tools
/
html
/
linux
/
check
/
htmlsrpl.pl
< prev
next >
Wrap
Perl Script
|
1995-02-18
|
11KB
|
259 lines
#!/usr/local/bin/perl
#htmlsrpl.pl: HTML-aware search-and-replace; acts either only outside HTML/SGML
# tags, or only within HTML/SGML tags; can also upper-case tag names
#
# Typical use:
#
# perl htmlsrpl.pl [options] infile.html > outfile.html
#
# Where options have the form "option=value"; all options should precede
# filename arguments on the command line. (See the documentation.)
#
# Copyright H. Churchyard 1994, 1995 -- freely redistributable. This code is
# awk-influenced (so sue me). Tested under Perl 4 (I'm still not sure whether
# the fact that "s/$x/$y/" is equivalent to "s/$x/$y/e" is a bug or not).
#
# Version 1.0 12/21/94 -- Preliminary version.
# Version 1.01 12/22/94 -- Minor bugfix.
# Version 1.1 1/7/95 -- Added inside=, inmost=, oustside= , etc. Included in
# htmlchek 4.0 release.
# Version 1.11 1/22/95 -- Added "Changed!/Unchanged" final status messages.
# Included in htmlchek 4.1 release.
#
eval "exec /usr/local/bin/perl -S $0 $*"
if $running_under_some_shell; # this emulates #! processing on NIH machines.
#process any FOO=bar switches
$old= ''; $new = ''; $intags = 0; $regexp = 0; $regeval = 0; $upcase = 0;
$lines = 0; $delete = 0; $case = 0; $slash=0; $inmost=''; $inside = '';
$outside = '';
eval '$'.$1.'$2;' while $ARGV[0] =~ /^(old=|new=|intags=|lines=|regexp=|regeval=|upcase=|delete=|case=|slash=|inmost=|inside=|outside=)(.*)/ && shift;
$[ = 1; # set array base to 1
$, = ' '; # set output field separator
$\ = "\n"; # set output record separator
foreach $X (@ARGV) {
if ($X =~ /^[^=]+=/) {
print STDERR "Apparent misspelled or badly-placed command-line option $&";
print STDERR "Attempting to continue anyway...";}}
$filstr = join(' ',@ARGV); $changed = 0;
if ($lines) {$/ = "\0777"; $* = 1;}
else {$/ = "\n";}
if (($outside) && (!(($inside) || ($inmost)))) {$applyit = 1;}
else {$applyit = 0;}
#
$unpair{'!--'} = 1; $unpair{'!DOCTYPE'} = 1; $unpair{'BASE'} = 1;
$unpair{'BR'} = 1; $unpair{'COMMENT'} = 1; $unpair{'HR'} = 1;
$unpair{'IMG'} = 1; $unpair{'INPUT'} = 1; $unpair{'ISINDEX'} = 1;
$unpair{'LINK'} = 1; $unpair{'META'} = 1; $unpair{'NEXTID'} = 1;
$unpair{'ATOP'} = 1; $unpair{'LEFT'} = 1;
$unpair{'OVER'} = 1; $unpair{'OVERLAY'} = 1; $unpair{'RIGHT'} = 1;
$unpair{'TAB'} = 1; $unpair{'BASEFONT'} = 1; $unpair{'WBR'} = 1;
$nestvar = 0; $numins = 0; $numout = 0;
if ($inmost) {
$inmost =~ tr/a-z/A-Z/;
if ($inmost =~ /[^-.a-zA-Z0-9]/) {
die 'Non-alphanumeric value of inmost= was specified';}
if (defined $unpair{$inmost}) {
die "Non-pairing tag $inmost specified as value of inmost=";}}
if ($inside) {
$numins = (@inarr = split(/,/, $inside));
for ($i = 1; $i <= $numins; ++$i) {
$inarr[$i] =~ tr/a-z/A-Z/;
if ((!$inarr[$i]) || ($inarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
die 'Non-alphanumeric value of inside= was specified';}
if (defined $xxin{$inarr[$i]}) {
die 'Duplicate values of inside= were specified';}
if (defined $unpair{$inarr[$i]}) {
die "Non-pairing tag $inarr[$i] specified as value of inside=";}
else {
$xxin{$inarr[$i]} = 1;}}}
if ($outside) {
$numout = (@outarr = split(/,/, $outside));
for ($i = 1; $i <= $numout; ++$i) {
$outarr[$i] =~ tr/a-z/A-Z/;
if ((!$outarr[$i]) || ($outarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
die 'Non-alphanumeric value of outside= was specified';}
if (defined $xxout{$outarr[$i]}) {
die 'Duplicate values of outside= were specified';}
if (defined $xxin{$outarr[$i]}) {
die "Tagname $outarr[$i] specified as both outside= and inside=";}
if (defined $unpair{$outarr[$i]}) {
die "Non-pairing tag $outarr[$i] specified as value of outside=";}
else {
$xxout{$outarr[$i]} = 1;}}}
#
if ((!$old) && (!$upcase)) {die "No `old=' string was specified";}
if (($delete) && (($new) || ($regexp) || ($regeval))) {
die "Incompatible option specified with `delete=1'";}
if (($regexp) && ($regeval)) {die 'Both regexp=1 and regeval=1 specified';}
if (($case) && (!$delete) && (!$regexp) && (!$regeval)) {
die 'Option case=1 specified without any of regexp=1, regeval=1, or delete=1 also being specified';}
if ($delete) {$slash=1;}
if (($upcase) || ($delete) || ($slash)) {$intags = 1;}
#
# Main
#
# Variable ``$state'' is one if there is an unresolved `<', zero otherwise.
# ``$lastbeg'' is zero if no `<' has ocurred in $_, otherwise it points to the
# character immediately after the last `<' encountered.
#
$xRS = "\n"; $state = 0;
while (<>) {
if ($_ =~ /$xRS$/o) { # strip record separator, allow for last line to
chop;} # be unterminated.
$lastbeg = 0; $currsrch = 1; $txtbeg = 1;
while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
($RSTART = length($`)+1)) != 0) {
$currsrch = ($currsrch + $RSTART);
if (substr($_, ($currsrch - 1), 1) eq '<') {
if ($state) {
print "\nERROR!";
die "Multiple `<' without `>' ERROR!";}
else {
if (($currsrch > length($_)) ||
(substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
print "\nERROR!";
die "Whitespace after `<': Bad SGML syntax ERROR!";}
else {
if ($currsrch > ($txtbeg + 1)) {
if ((!$intags) && (($applyit) || (!(($inmost) ||
($numins) || ($numout))))) {
printf "%s", &changeht(substr($_, $txtbeg,
($currsrch - ($txtbeg + 1))));}
else {
printf "%s", substr($_, $txtbeg,
($currsrch - ($txtbeg + 1)));}}
$deletit = 0;
$lastbeg = $currsrch; $state = 1;}}}
else {
if (substr($_, ($currsrch - 1), 1) eq '>') {
if ($state == 0) {
next;} #`>' without `<'
else {
&parsetag($currsrch - 1);
if (!$deletit) {printf "%s", '>';}
$txtbeg = $currsrch; $state = 0;}}
else {die 'Internal error, ignore';}}}
#At EOL:
if ($state == 1) {
&parsetag(length($_) + 1);}
elsif ($txtbeg <= length($_)) {
if ((!$intags) && (($applyit) || (!(($inmost) || ($numins) ||
($numout))))) {
printf "%s", &changeht(substr($_, $txtbeg));}
else {printf "%s", substr($_, $txtbeg);}}
if (!(($state) && ($deletit))) {printf "\n";}}
#
#END routine:
#
if ($state) {
die "Was awaiting a `>' ERROR! at END";}
if ($changed) {
print STDERR "Changed! on input", $filstr;}
else {
print STDERR "Unchanged on input", $filstr;}
#
#
sub parsetag {
local($inp) = @_;
$docap = $lastbeg;
if (!$lastbeg) {
$strx = '' ; $lastbeg = 1;}
else {$strx= '<';}
if ($inp != $lastbeg) {
$str = &upc(substr($_, $lastbeg, ($inp - $lastbeg)));
if (($oldapply) || (!(($inmost) || ($numins) ||($numout)))) {
if (($slash) && ($docap) && ($str =~ /^\//))
{$strx = ($strx . '/'); $str= substr($str, 2);}
if ($delete) {
if ($docap) {&getdel($str);}
if (!$deletit) {printf "%s%s", $strx, $str;}
else {$changed=1;}}
else {
if (($intags) && ($old))
{printf "%s%s", $strx, &changeht($str);}
else {printf "%s%s", $strx, $str;}}}
else {printf "%s%s", $strx, $str;}}}
#
sub upc {
local($upcx) = @_;
if ($docap) {
$upcx =~ /^[^ \t\n]+/;
($tagname = $&) =~ tr/a-z/A-Z/;
if ($upcase) {$upcx = ($tagname . $');}
$oldapply = $applyit;
#tag stack accounting
if ((($inmost) || ($numins)|| ($numout)) &&
(!(defined $unpair{$tagname}))) {
$applyit = 1; $clostag = '';
if ($tagname !~ /^\//) {
++$nestvar;
$nestarr[$nestvar] = $tagname;}
else {
$clostag = substr($tagname,2);
while ($nestarr[$nestvar] ne $clostag) {
--$nestvar;
if ($nestvar <= 0) {
print "\nERROR!";
die "/$clostag tag encountered when apparently not in $clostag element";}}
--$nestvar;}
if (($inmost) && ($nestarr[$nestvar] ne $inmost)) {
$applyit = 0;}
if ($numins) {
if ($nestvar < $numins) {$applyit = 0;}
else {
$mask = 1;
$stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
foreach $X (keys %xxin) {
if (index($stackstr,(" " . $X . " ")) <= 0) {
$mask = 0;}}
if (($applyit) && ($mask)) {$applyit = 1;}
else {$applyit = 0;}}}
if (($numout) && ($nestvar)) {
$mask = 1;
$stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
foreach $X (keys %xxout) {
##print $stackstr,"XX",(" " . $X . " ");##debugXX
if (index($stackstr,(" " . $X . " ")) > 0) {
$mask = 0;}}
if (($applyit) && ($mask)) {$applyit = 1;}
else {$applyit = 0;}}
if ($clostag) {$oldapply = $applyit;}}}
return $upcx;}
#
sub getdel {
local($inz) = @_;
$inz =~ /^[^ \t\n]+/;
$X = $&;
if ($case) {
if ($X =~ /$old/io) {
$deletit = 1;}}
else {
if ($X =~ /$old/o) {
$deletit = 1;}}}
#
sub changeht {
local($field) = @_;
if ($regeval) {
if ($case) {
$X = ($field =~ s/$old/$new/eeigo);}
else {
$X = ($field =~ s/$old/$new/eego);}
if ($X) {$changed = 1;}
return $field;}
elsif ($regexp) {
if ($case) {
$X = ($field =~ s/$old/$new/igo);}
else {
$X = ($field =~ s/$old/$new/go);}
if ($X) {$changed = 1;}
return $field;}
else {
$startf = 1; $newf = '';
while (($ndx = index(substr($field,$startf),$old)) > 0) {
$changed = 1;
$newf = ($newf . substr($field,$startf,($ndx-1)) . $new);
$startf = ($startf + ($ndx-1) + length($old));}
$newf = ($newf . substr($field,$startf));
return $newf;}}
##EOF