home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
HTML - Publishing on the Internet
/
html_cdrom.iso
/
tools
/
html
/
windows
/
check
/
xtraclnk.pl
< prev
Wrap
Perl Script
|
1995-01-20
|
23KB
|
485 lines
#!/usr/local/bin/perl
#xtraclnk.pl: Extracts hypertext links from HTML files; isolates text contained
# in <A> and <TITLE> elements.
#
# Typical use:
#
# perl xtraclnk.pl [options] infiles.html > outfile
#
# Where options have the form "option=value", as discussed below (command line
# options other than ``title='' and ``loc='' work the same way as those of the
# htmlchek program in this distribution).
#
# Whenever xtraclnk.pl encounters an <A HREF="URL">Text</A> link in an input
# file, it copies this to the output. Whenever xtraclnk.pl encounters an
# <A NAME="name">Text</A> anchor in an input file, it copies this as an
# <A HREF="currentfile.html#name">Text</A> link _to_ the current input file.
# Finally, the contents of a <TITLE>Text</TITLE> element are copied as an
# <A HREF="currentfile.html">Text</A> link _to_ the current input file.
# Each link in the ouput occupies exactly one line.
#
# This program was suggested by an idea of John Harper; what he had in mind,
# I think, was to use this as part of a CGI script which would dynamically
# construct an HTML document with links to all files with a title or anchors
# that contain text matching a user-specified search pattern. However,
# xtraclnk.pl also has some value as an HTML style debugging tool: if you have
# used a lot of context-dependent titles like "Intro" and meaningless anchor
# text like "Click Here", this will be very apparent when you view the HTML
# document (derived with xtraclnk.pl using the ``title='' option) which
# contains only the text inside titles and anchors in your other HTML
# documents. This program can also be used to enforce consistency in link text:
# if there is random variation between different <A HREF="...">LinkText</A>
# elements which all point towards the same resource, this will be apparent
# when the output of xtraclnk.pl is sorted. Also, by looking over the sorted
# output of <tt>xtraclnk.pl</tt>, it becomes relatively easy to detect mistaken
# links, that point to someplace other than what was intended.
#
# If you apply xtraclnk.pl to a list of filenames that are all specified
# relative to the current directory then all the references to files in
# subordinate directories will be expressed from the point of view of the top
# directory (i.e. relative URL pathnames will have the current directory as
# starting point). Under Unix, you can use:
#
# perl xtraclnk.pl `find . -name \*.html -print` > output
#
# Since xtraclnk.pl is a hacked-down version of the htmlchek error checker, it
# is rather robust in its handling of incorrect HTML code (but it generally has
# the same limitations that htmlchek does with metachar=2). Though it is not
# a general-purpose error checker like htmlchek, xtraclnk.pl does return
# errormessages about HTML errors connected with its functioning (note that it
# ignores all tags in a file except <A>, <BASE>, <TITLE>, and the
# ALT="..." attribute valuex of <IMG>).
#
# Command-line
# options:
#
# dirprefix=... A string to be prefixed to URL's in the output links, in
# order to resolve relative URL's into absolute URL's.
# (See the htmlchek documentation for the complexities of use.)
#
# usebase=1 Take the prefix from a <BASE HREF="..."> tag in each file.
#
# sugar=1 Use the Unix ``filename: linenumber:'' format in reporting
# errors.
#
# title=... Make the output file a valid HTML document, with <br> at
# the end of each line, and a title as specified. Error
# messages (if any) appear as HTML comments in the outputfile.
# (If this title= option is not specified on the command line,
# the output will tailored for human readability, and will not
# really be an HTML file.)
# Note that the output with title= will still be a HTML file
# if you run it though the ``sort'' and ``uniq'' filters. It
# will also remain HTML if you run it through ``grep'' -- as
# long as you keep the first and last lines; for example (under
# Unix):
# perl xtraclnk.pl title="Link Stuff" *.html > out
# head -1 out > linkfile.html
# egrep 'pattern' out >> linkfile.html
# tail -1 out >> linkfile.html
#
# loc=... Whether or not to include the location (input filename and
# linenumber) from which each output link is derived. If you
# don't include locations, it's hard to tell where bad links
# came from; if you do include locations, the output will be
# larger, and running the output though sort and uniq won't be
# as useful for detecting inconsistent link text.
# By default, source locations are not included in the
# output. A value of loc=1 causes locations to be included.
# A value of loc=hide (or anything beginning with the three
# characters "hid...") will include locations as HTML comments,
# if the title= option has alson been specified.
#
# Copyright 1994, 1995 by H. Churchyard, churchh@uts.cc.utexas.edu -- freely
# redistributable.
#
# Version 1.0 12/15/94
# Version 1.1 12/18/94 -- improve HTML-icity of "title=" option output, etc.
# Version 1.11 12/19/94 -- squashed minor bugs. Was informally made
# available by HTTP from uts.cc.utexas.edu.
# Version 1.2 1/9/95 -- Added loc= option, include <IMG ALT="..."> text in
# links. Included in htmlchek 4.0 release.
#
eval "exec /usr/local/bin/perl -S $0 $*"
if $running_under_some_shell; # This emulates #! processing on NIH machines
#
# Setup:
#
$known{'A'} = 1; $known{'IMG'} = 1; $known{'TITLE'} = 1; $known{'/A'} = 1;
$known{'/TITLE'} = 1; $known{'BASE'} = 1; $pair{'A'} = 1; $pair{'TITLE'} = 1;
#
&initscalrs();
$usebase = 0; $dirprefix = ''; $sugar = 0; $title = ''; $loc = 0;
#process any FOO=bar switches
eval '$'.$1.'$2;' while $ARGV[0] =~ /^(usebase=|dirprefix=|sugar=|title=|loc=)(.*)/ && shift;
$[ = 1; # set array base to 1
$, = ' '; # set output field separator
$\ = "\n"; # set output record separator
foreach $X (@ARGV) {
if ($X =~ /^[^=]+=/) {
print STDERR "Apparent misspelled or badly-placed command-line option $&";
print STDERR "Attempting to continue anyway...";}}
#
if ($title) {
print " <html><head><title>$title</title></head><body><h1>$title</h1>";
$E = '<br>'; $gt = '>'; $lt = '<'; $A = '<!-- '; $Z = ' -->';
if ($loc =~ /^HID/i)
{$AA = '<!-- '; $ZZ = ' -->';}
else
{$AA = ''; $ZZ = '';}}
else {
$E = ''; $gt = '>'; $lt = '<'; $A = ''; $Z = ''; $AA = ''; $ZZ = '';}
#
# Main
#
$stuperlRS = $/;
while (<>) {
if ($_ =~ /$stuperlRS$/o) { # strip record separator, allow for last line to
chop;} # be unterminated.
if (($.-$FNRbase) == 1) {
$fn = $ARGV;
# Next line is Unix-specific
$fn =~ s/^\.\///;
$nampref = ($dirprefix . $fn . '#');
$lochpref = ($dirprefix . $fn);
if ($fn =~ /.\//) {
$fromroot = $fn; $fromroot =~ s/\/[^\057]*$/\//;}
else {
$fromroot = '';}
$fromroot=($dirprefix . $fromroot);}
if ($sugar) {$S = ($fn . ': ' . ($.-$FNRbase) . ': ');}
if ($loc) {$L = ($fn . ' ' . ($.-$FNRbase));}
$lastbeg = 0; $currsrch = 1; $txtbeg = 1;
while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
($RSTART = length($`)+1)) != 0) {
$currsrch = ($currsrch + $RSTART);
if (substr($_, ($currsrch - 1), 1) eq '<') {
if ($state) {
print $A . $S . "Multiple `$lt' without `$gt' ERROR!", &crl() .
$Z;}
else {
if (($currsrch > length($_)) ||
(substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
print $A . $S .
"Whitespace after `$lt': Incorrect SGML syntax ERROR!",
&crl() . ", Ignoring$Z";}
else {
if (($nestvar) && ($currsrch > ($txtbeg + 1))) {
$line = ($line . substr($_, $txtbeg,
($currsrch - ($txtbeg + 1))));}
$lastbeg = $currsrch; $state = 1;
$lasttag = ''; $lastopt = '';}}}
else {
if (substr($_, ($currsrch - 1), 1) eq '>') {
if ($state == 0) {
next;} #`>' without `<'
else {
&parsetag($currsrch - 1);
if (($inquote) || ($inequal)) {
&malft();}
if ($optfree) {
&misstest();}
if (($lasttag eq 'A') && (!$wasname) && (!$washref)) {
print $A . $S . $lt .
"A$gt tag occurred without reference (NAME,HREF,ID) option ERROR!",
&crl() . $Z;}
if (($wasname > 1) || ($washref > 1)) {
print $A . $S .
'Multiple reference (NAME,ID;HREF) options ERROR!',
&crl(), 'on tag', $lasttag . $Z;}
$txtbeg = $currsrch;
$state = 0; $continuation = 0;}}
else {
print $A . $S . 'Internal error', &crl(), 'ignore' . $Z;}}}
if (($state == 1) || (($lastbeg == 0) && ($continuation == 1))) {
&parsetag(length($_) + 1);
$continuation = 1;}
else {
if (($nestvar) && (!$state) && ($txtbeg <= length($_))) {
$line = ($line . substr($_, $txtbeg) . ' ');}
else {
$line = ($line . ' ');}}}
continue {
$FNRbase = $. if eof;}
#
# End-of-file routine.
#
if ($. > 0) {&endit()};
if ($title) {print '<hr></body></html>';}
#
#
# parsetag() communicates with main() through these global variables:
# - $lastbeg (zero if no `<' ocurred on line, otherwise points to character
# immediately after the last `<' encountered).
# - $state (one if unresolved `<', zero otherwise).
# - $continuation (one if unresolved `<' from previous line, zero otherwise),
# - $inquote (one if inside option quotes <tag opt="...">).
#
sub parsetag {
local($inp) = @_;
if (!$lastbeg) {
$lastbeg = 1;}
$numf = (@arr = split(' ', substr($_, $lastbeg, ($inp - $lastbeg))));
if ($numf == 0) {
if (!$continuation) {
print $A . $S . "Blank $lt$gt ERROR!", &crl() . $Z;
$state = 0;}
return;}
else {
if (!$continuation) {
$arr[1] =~ tr/a-z/A-Z/;
$lasttag = $arr[1];
if (defined $known{$arr[1]}) {
if ($arr[1] =~ /^\//) {
# </TAG> found
$arr[1] =~ s/^\///;
if (defined $pair{$arr[1]}) {
if (($nestvar <= 0) || ($lev{$arr[1]} <= 0)) {
print $A . $S . 'Extraneous /' . $arr[1],
'tag without preceding', $arr[1], 'tag ERROR!',
&crl() . ', Ignoring' . $Z;}
else {--$nestvar; --$lev{$arr[1]};
if ($arr[1] eq 'TITLE') {&doout($lochpref);}
else
{if ($currf[2]) {&doout($currf[2]);}
if ($currf[3]) {&doout($currf[3]);}}}}}
else {
# <TAG> found
if ($arr[1] ne 'IMG') {$line = '';}
++$lev{$arr[1]};
if (defined $pair{$arr[1]}) {
$currf[2] = ''; $currf[3] = '';
++$nestvar;
if (($lev{$arr[1]} > 1) || ($nestvar > 1)) {
print $A . $S . 'Nesting ERROR!', &crl(),
"on tag $arr[1]" . $Z;}}}}
$startf = 2; $inquote = 0; $inequal = 0; $optfree = 0;
$wasopt = 0; $wasname = 0; $washref = 0;}
else {
$startf = 1;}
# Remainder of stuff in <...> after tag word
if (defined $known{$lasttag}) {
for ($i = $startf; $i <= $numf; ++$i) {
if ((!$inequal) && (!$inquote)) {
if (($arr[$i] =~
/^[^=\042]*(=\042[^\042]*\042)?$/) ||
($arr[$i] =~ /^[^=\042]*=(\042)?[^\042]*$/)) {
if (($optfree) &&
(($arr[$i] =~ /^=[^=\042][^=\042]*$/) ||
($arr[$i] =~ /^=\042[^\042]*\042$/))) {
if (!$malftag) {
$arr[$i] =~ s/^\075//;
if ($arr[$i] =~ /\042/) {
&optvalproc($arr[$i],1);}
else {&optvalproc($arr[$i],0);}}
$optfree = 0; ++$tagwarn;}
else {
if (($optfree) && (($arr[$i] =~ /^=\042/) ||
($arr[$i] eq '='))) {
$inequal = 1; ++$tagwarn;}
@arr2 = split(/=/, $arr[$i], 2);
if ($arr2[1] eq '') {
if (!$inequal) {
print $A . $S . 'Null tag option ERROR!',
&crl(), "on tag $lasttag" . $Z;
$malftag = 1;}}
else {
if ($optfree) {
&misstest();}
$arr2[1] =~ tr/a-z/A-Z/;
$optfree = 1; ++$wasopt;
$malftag = 0; $optvalstr = '';
if ($lasttag =~ /^\//) {
print $A . $S . 'Option on closing tag',
$lasttag, 'Warning!', &crl() . $Z;}
else {
$lastopt = $arr2[1];}}
if ($arr[$i] =~ /^[^=\042][^=\042]*=$/) {
$inequal = 1;}
if ($arr[$i] =~ /[\075]/) {
$optvalstr = $arr[$i];
$optvalstr =~ s/^[^=]*=//;}
$stuperltmp = $arr[$i];
$Q = ($stuperltmp =~ s/\042//g);
if ($Q == 1) {
$inquote = 1;}
if (($optvalstr)&&(!$inequal)&&(!$inquote)) {
$optfree = 0;
if (!$malftag) {
&optvalproc($optvalstr,$Q);}}}}
else {
&malft();}}
else {
if (($inequal) && (!$inquote)) {
++$tagwarn;
if ($arr[$i] =~ /\042/) {
if ($arr[$i] =~ /^\042[^\042]*(\042)?$/) {
$stuperltmp = $arr[$i];
if (($stuperltmp =~ s/\042//g) == 2) {
if (!$malftag) {
$stuperltmp =~ s/^\075//;
&optvalproc($stuperltmp,1);}
$inequal = 0; $optfree = 0;}
else {
$optvalstr = $arr[$i];
$inquote = 1;}}
else {
&malft();}}
else {
if ($arr[$i] !~ /[\075]/) {
if (!$malftag) {
&optvalproc($arr[$i],0);}
$inequal = 0; $optfree = 0;}
else {
&malft();}}}
else {
if ($arr[$i] =~ /\042/) {
$inquote = 0; $inequal = 0; $optfree = 0;
if ($arr[$i] !~ /^[^\042]*\042$/) {
&malft();}
else {
$optvalstr = ($optvalstr . ' ' . $arr[$i]);
if (!$malftag) {
&optvalproc($optvalstr,1);}}}
else {
$optvalstr = ($optvalstr . ' ' . $arr[$i]);}}}}}
return;}}
#
#
# Return as much location information as possible in diagnostics:
#
# Current location:
sub crl {
if (($fn)&&($fn ne '-')) {
return ('at line ' . ($.-$FNRbase) . " of file \042" . $fn . "\042");}
else {
return ('at line ' . $.);}}
#
# End of file location:
sub ndl {
if (($fn)&&($fn ne '-')) {
return ("at END of file \042" . $fn . "\042");}
else {
return 'at END';}}
#
# Error message returned from numerous places in the program...
#
sub malft {
print $A . $S . 'Malformed tag option ERROR!', &crl(), 'on tag', $lasttag .
$Z;
$malftag = 1;}
#
#
#Check for non-kosher null options:
#
sub misstest {
if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'HREF') ||
($lastopt eq 'ID')) {
print $A . $S . 'Missing reference option value', &crl(),
"on tag $lasttag, option $lastopt" . $Z;}}
#
#
sub doout {
local($href) = @_;
$line =~ s/[ \t][ \t]+/ /g; $line =~ s/\t/ /g;
$line =~ s/^ //; $line =~ s/ $//;
if ($line eq '') {
$line = '[ EMPTY ANCHOR TEXT ]';}
print "<A HREF=\042" . $href . "\042>" . $line . '</A>', $AA . $L . $ZZ . $E;}
#
# This subroutine receives the raw option value string, for every tag option
# that does have a value. It does some errorchecking and cleanup, and sets
# the URL or name of the current anchor.
#
sub optvalproc {
local($val, $quoted) = @_;
$currfn = 0;
if ($quoted) {
$val =~ s/\042//g; $val =~ s/^ //; $val =~ s/ $//;}
if ($lasttag eq 'IMG') {
if (($lastopt eq 'ALT') && ($val =~ /[^ \t]/)) {
$line = ($line . " [ $val ] ");}}
elsif ($lasttag eq 'BASE') {
if (($usebase) && ($lastopt eq 'HREF')) {
if (($quoted) && ($val) && ($val ne '=') && ($val !~ /[^ ] [^ ]/)) {
$nampref = ($val . '#'); $lochpref = $val;
if ($val =~ /.\//) {
$fromroot = $val;
$fromroot =~ s/\/[^\057]*$/\//;}
else {
$fromroot = '';}}
else {
print $A . $S . "Bad $lt" . "BASE HREF=\042...\042$gt", &crl() .
', Ignoring' . $Z;}}}
else {
if ((($lasttag eq 'A') && ($lastopt eq 'NAME')) || ($lastopt eq 'ID')) {
$currfn = 2; ++$wasname;
if ($val =~ /^#/) {
print $A . $S . "Invalid #-initial location \042" .
$val . "\042 ERROR!", &crl(), 'on tag', $lasttag,
'option', $lastopt . $Z;}}
else {
if ($lastopt eq 'HREF') {
$currfn = 3; ++$washref;}}}
if ($currfn) {
if (!$quoted) {
print $A . $S . 'Unquoted reference option value Warning!', &crl(),
"on tag $lasttag, option $lastopt$Z";}
if ($val =~ /[^ ] [^ ]/) {
print $A . $S . 'Whitespace in reference option value Warning!',
&crl(), "on tag $lasttag, option $lastopt$Z";}
else {
if ($val eq '') {
print $A . $S . 'Null reference option value ERROR!', &crl(),
"on tag $lasttag, option $lastopt$Z";}
else {
# Skip the residue of Malformed Tag Option cases; OK to do
# this, since "=" is not a valid URL; However, a minor bug
# is that <A NAME="="> will not be checked, and will not
# result in any errormessage.
if ($val ne '=') {
if ($currfn == 2) {
$val = ($nampref . $val);}
else {
if (($currfn == 3) && ($val =~ /^#/)) {
$val = ($lochpref . $val);}
else {
if ($val =~ /^http:[^\057]*$/) {
$val =~ s/^http://;}
if (($val !~ /^[^\057]*:/) && ($val !~ /^\//)) {
if ($val =~ /^~/) {
print $A . $S .
"Relative URL beginning with '~' Warning!",
&crl(),"on tag $lasttag option $lastopt$Z";}
else {
$val = ($fromroot . $val);}}}}
# This monstrosity supports "../" in URL's:
while ($val =~ /\057[^\057]*[^\057]\057\.\.\057/) {
$val =~ s/\057[^\057]*[^\057]\057\.\.\057/\057/;}
if (($val =~ /[:\057]\.\.\057/) || ($val =~ /^\.\.\057/)) {
print $A . $S . "Unresolved \042../\042 in URL Warning!",
&crl(), "on tag $lasttag option $lastopt$Z";}
$currf[$currfn] = $val;}}}}}
#
#
# Start each file with a clean slate.
#
sub initscalrs {
$state = 0; $continuation = 0; $nestvar = 0; $S = ''; $L = ''; $line = '';}
#
#
#
sub endit {
if ($sugar) {$S = ($fn . ': END: ');}
if ($continuation) {
print $A . $S . "Was awaiting a `$gt' ERROR!", &ndl() . $Z;}
foreach $X (sort(keys %pair)) {
if ($lev{$X} > 0) {
print $A . $S . "Pending unresolved $lt" .
"x$gt without $lt/x$gt ERROR!", &ndl(), 'on tag', $X . $Z;}}
#Reinitialize for next file
&initscalrs();
undef %lev;}
#-=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=- -=-
##EOF