home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Meeting Pearls 3
/
Meeting_Pearls_III.iso
/
Pearls
/
tcp
/
Networking
/
TCP
/
WWW
/
GetURL
/
GetURL-1.03.rexx
< prev
next >
Wrap
OS/2 REXX Batch file
|
1995-01-31
|
58KB
|
1,642 lines
/*=======================================================================*/
/* File Name : 'geturl.rexx', 02-Jan-95 */
/*=======================================================================*/
/* Script to download HTML systems across the network */
/*
Written by James Burton (burton@cs.latrobe.edu.au)
02-Jan-95
**NOTE: requires AmiTCP & TCP: device**
*/
gv.version = 'v1.03'
version_string = '$VER: GetURL.rexx'
/*=======================================================================*/
/* Global Variables */
/*=======================================================================*/
/* Constants (huh?) */
gv.cr = '0d'x
gv.lf = '0a'x
gv.crlf = gv.cr||gv.lf
gv.true = 1
gv.false = 0
/* Proxy stuff */
gv.http_proxy_host = ''
gv.http_proxy_port = '80'
gv.ftp_proxy_host = ''
gv.ftp_proxy_port = '80'
/* Root Url */
gv.root_url = ''
gv.root_method = ''
gv.root_host = ''
gv.root_port = ''
gv.root_path = ''
/* Files */
gv.input_file = ''
gv.output_file = ''
gv.agenda_file = 't:agenda-'||tmpname()
gv.prune_file = 't:prune-'||tmpname()
gv.header_file = 't:header-'||tmpname()
gv.failed_file = 't:failed-'||tmpname()
/* Flags */
gv.recursive_fetch = gv.false
gv.use_proxy_cache = gv.true
gv.retry_failed = gv.false
gv.use_match = gv.false
gv.save_headers = gv.false /* Change the default to true for Brian :-) */
gv.use_modified_since = gv.false
gv.use_associative = gv.false
gv.use_level_marker = gv.false
/* Misc */
gv.local_prefix = 'file://localhost|Mosaic:'
gv.match_host = ''
gv.match_path = ''
gv.match_url = ''
gv.user_address = ''
gv.number_iterations = ''
gv.depth = 0 /* bt */
gv.max_depth = 20 /* bt */
gv.max_length = '' /* bt */
gv.timezone = '+1100'
gv.problem_address = 'burton@latcs1.lat.oz.au'
gv.delay = 2
gv.level_marker = '-'
/* arrays to hold agenda items */
/* use:
agenda. defaults to 0
agenda.head contains index of 1st path yet-to-visit
agenda.tail contains index of last path yet-to-visit
agenda.N contains Nth path yet-to-visit
agenda.pathname contains recursion-level of path-yet-to-visit
prune.tail contains index of next visited-path
prune.N contains name of a visited-path
prune.pathname contains non-zero
failed.tail contains index of next failed-path
failed.N contains name of a failed path
failed.pathname contains non-zero
*/
agenda. = 0
agenda.head = 1
agenda.tail = 0
prune. = 0
prune.tail = 0
failed. = 0
failed.tail = 0
/*=======================================================================*/
/* Main */
/*=======================================================================*/
Main:
parse arg gv.root_url rest
if substr(gv.root_url,1,1) = '-' then do /* no URL */
rest = gv.root_url rest
gv.root_url = ''
end
signal ON break_c /* install control-c handler */
OpenLibrary('rexxsupport.library',gv.true) /* insist on using this library */
OpenLibrary('rexxarplib.library') /* non essential library */
Configure() /* setup global variables */
/* look for and parse arguments */
arglist = rest
do while arglist ~= ''
parse var arglist flagname arglist
uflagname = upper(flagname)
DoArguments(uflagname)
end
if gv.root_url ~= '' then
do
if gv.input_file = '' then
do
/* parse Root URL */
parse value ParseUrl(gv.root_url) with ok','url','gv.root_method','gv.root_host','gv.root_port','gv.root_path
if ~ok then do
say 'failed to parse initial URL:' gv.root_url
EXIT 20
end
if gv.output_file = '' then
outputfile = MakePathname(gv.root_host,gv.root_path)
else
outputfile = gv.output_file
if ~AppendHTTP(gv.root_host,gv.root_port,gv.root_path,outputfile,gv.agenda_file,gv.root_url) then
AddToFailed(gv.failed_file,gv.root_url)
else
do
if ~PruneUrl(gv.root_url) then
AddToPruned(gv.prune_file,gv.root_url) /* don't want to visit the root again */
end
if gv.recursive_fetch then
DoAgenda(gv.agenda_file)
end
else /* ignore URL, use input file */
do
if gv.output_file = '' then
outputfile = 't:output-'tmpname()
else
outputfile = gv.output_file
AppendFile(gv.input_file,outputfile,gv.agenda_file)
if gv.recursive_fetch then
DoAgenda(gv.agenda_file)
end
end
else /* no URL specified */
do
if gv.input_file ~= '' then
do
if gv.output_file = '' then
outputfile = 't:output-'tmpname()
else
outputfile = gv.output_file
AppendFile(gv.input_file,outputfile,gv.agenda_file)
if gv.recursive_fetch then
DoAgenda(gv.agenda_file)
end
else
do
/* no input file, no URL */
/* may be something in the agenda */
if gv.recursive_fetch then
DoAgenda(gv.agenda_file)
end
end
if gv.recursive_fetch & gv.retry_failed then do
if gv.use_associative then
BTAddFileTo(agenda,gv.failed_file)
else
address command 'copy' gv.failed_file gv.agenda_file
address command 'delete >NIL:' gv.failed_file
say 'Retrying previously failed URLs...'
DoAgenda(gv.agenda_file)
end
if gv.use_associative then do
BTDumpAgendaToFile(gv.agenda_file)
BTDumpFailedToFile(gv.failed_file)
end
EXIT
/*=======================================================================*/
/* DoArguments */
/*=======================================================================*/
DoArguments:
parse var uflagname
select
when abbrev('-HELP',uflagname,1) then
printusage()
when abbrev('-RECURSIVE',uflagname,2) then
gv.recursive_fetch = gv.true
when abbrev('-HOST',uflagname,2) then
parse var arglist gv.match_host arglist
when abbrev('-PATH',uflagname,2) then
parse var arglist gv.match_path arglist
when abbrev('-URL',uflagname,2) then
do
parse var arglist gv.match_url arglist
parse var gv.match_url dummy'://'gv.match_host'/'gv.match_path
end
when abbrev('-OUTPUT',uflagname,2) then
parse var arglist gv.output_file arglist
when abbrev('-NOPROXY',uflagname,3) then
do
gv.http_proxy_host = ''
gv.ftp_proxy_host = ''
end
when abbrev('-INPUT',uflagname,2) then
do
if arglist = '' then
gv.input_file = '-'
else
do
parse var arglist gv.input_file arglist
if substr(gv.input_file,1,1) = '-' then do
/* put the argument back please */
arglist = gv.input_file arglist
gv.input_file = '-' /* flag to use stdin */
end
end
end
when abbrev('-VISITED',uflagname,2) then
do
parse var arglist gv.prune_file arglist
if gv.use_associative then
BTAddFileTo(prune,gv.prune_file)
end
when abbrev('-UNVISITED',uflagname,2) then
do
parse var arglist gv.agenda_file arglist
if gv.use_associative then
BTAddFileTo(agenda,gv.agenda_file)
end
when abbrev('-DIRECTORY',uflagname,2) then
say 'DIRECTORY flag not implemented yet'
when abbrev('-UPDATE',uflagname,2) then
say 'UPDATE flag not implemented yet'
when abbrev('-NOPROXYCACHE',uflagname,2) then
gv.use_proxy_cache = gv.false
when abbrev('-PROBLEM',uflagname,2) then
do
writeln(stdout,'Start typing problem or bug report...<end with '.' on line by itself>')
SendMail(stdin,gv.problem_address,'GetUrl.rexx Problem' gv.version)
end
when abbrev('-SAVEROOT',uflagname,2) then
do
parse var arglist newdir arglist
if (right(newdir,1) ~= ':') & (right(newdir,1) ~= '/') then
newdir = newdir'/'
parse var gv.local_prefix before'|'after
gv.local_prefix = before'|'newdir
end
when abbrev('-LENGTH',uflagname,2) then
parse var arglist gv.max_length arglist
when abbrev('-DEPTH',uflagname,2) then
do
parse var arglist gv.max_depth arglist
gv.use_level_marker = gv.true
end
when abbrev('-SAVEHEADERS',uflagname,2) then
gv.save_headers = gv.true
when abbrev('-NUMBER',uflagname,2) then
parse var arglist gv.number_iterations arglist
when abbrev('-NEWVERSION',uflagname,2) then
do
parse var arglist newfile arglist
if newfile = '' then do
writech(stdout,'Destination for new version? [t:GetURL.rexx] ')
newfile = readln(stdin)
if newfile = '' then
newfile = 't:GetURL.rexx'
end
say 'downloading new script...'
CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/GetURL.rexx',newfile,gv.agenda_file,'')
say 'downloaded' newfile', fetching documentation...'
p = lastpos('.',newfile)
if p > 0 then
newfile = left(newfile,p)
newfile = newfile'.doc'
CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/GetURL.doc',newfile,gv.agenda_file,'')
say 'downloaded' newfile
end
when abbrev('-FAILED',uflagname,2) then
do
parse var arglist gv.failed_file arglist
if gv.use_associative then
BTAddFileTo(failed,failed_file)
end
when abbrev('-RETRY',uflagname,2) then
gv.retry_failed = gv.true
when abbrev('-PATTERNMATCHING',uflagname,2) then
do
writech(stdout,'should I really download Match? This will allow the use of regular expression patterns. [y/n] ')
line = readln(stdin)
if upper(left(line,1)) = 'Y' then do
say 'Downloading Match...'
newfile = 't:Match'
CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/C/Match',newfile,gv.agenda_file,'')
say 'Match is now in your T: directory'
say 'it matches a pattern against a string and prints either yes ot no.'
say 'e.g. Match #?.gif hello.gif --> yes'
say ' Match #?.gif hello.lha --> no'
say 'please install this program somewhere in your shell path.'
end
end
when abbrev('-IFMODIFIED',uflagname,2) then
gv.use_modified_since = gv.true
when abbrev('-ASSOCIATIVE',uflagname,2) then
gv.use_associative = gv.true
when abbrev('-DELAY',uflagname,2) then
parse var arglist gv.delay arglist
otherwise
say 'unused argument :' flagname
end
RETURN gv.true
/*=======================================================================*/
/* SearchLine */
/*=======================================================================*/
SearchLine: Procedure expose gv. agenda.
parse arg buffer,prefix,agenda
signal ON break_c
line = buffer
do while line ~= ''
parse var line before method '=' '"' link '"'line
method = upper(strip(method))
if link ~= '' & (method = 'HREF' | method = 'SRC') then do
AddToAgenda(agenda,CompleteUrl(prefix,link))
end
end
RETURN gv.true
/*=======================================================================*/
/* CompleteUrl */
/*=======================================================================*/
CompleteUrl: Procedure expose gv.
parse arg prefix,link
if prefix = '' then /* nothing else we can do */
RETURN link
if left(link,6) = 'mailto' then
RETURN link
parse var link method '://' rest
if rest ~= '' then /* is already complete */
do
RETURN link
end
else
do
parse var prefix front '|' back
select
when left(link,1) = '/' then /* absolute path */
RETURN front||link
when left(link,1) = '#' then /* internal path */
RETURN front
otherwise /* probably starts with '../' */
if (right(back,1) = '/') | (back = '') then
RETURN front'/'back||link
else
RETURN front'/'back'/'link
end
RETURN gv.true
/*=======================================================================*/
/* AddToAgenda */
/*=======================================================================*/
AddToAgenda: Procedure expose gv. agenda.
parse arg agenda,item,nocheck
if (gv.use_associative) & (nocheck = '') then
RETURN(BTAddToAgenda(item))
if nocheck = '' then do
if AgendaSearch(item,agenda) then
RETURN(gv.false)
end
if open(listf,agenda,'A') then
do
writeln(listf,item)
close(listf)
end
else
do
if open(listf,agenda,'Write') then
do
writeln(listf,item)
close(listf)
end
else
say 'unable to add to agenda file:' agenda
end
RETURN gv.true
AddToPruned: Procedure expose gv. prune.
parse arg agenda,item
if gv.use_associative then
RETURN(BTAddToPrune(item))
AddToAgenda(agenda,item)
RETURN gv.true
AddToFailed: Procedure expose gv. failed.
parse arg agenda,item
if gv.use_associative then
RETURN(BTAddToFailed(item))
AddToAgenda(agenda,item)
RETURN gv.true
/*=======================================================================*/
/* RemoveFromAgenda */
/*=======================================================================*/
RemoveFromAgenda: Procedure expose gv.
parse arg agenda
if gv.use_associative then
RETURN gv.true
signal ON break_c
tmpname = agenda||'.tmp'
if open(listf,agenda,'Read') then do
item = readln(listf)
if open(tmp,tmpname,'Write') then
do
do while ~eof(listf)
line = readln(listf)
if length(line) > 0 then
writeln(tmp,line)
end
close(tmp)
close(listf)
address command 'delete >NIL:' agenda
address command 'rename' tmpname agenda
end
else
do
close(listf)
say 'unable to remove item from agenda file:' agenda
end
/* ***** NOTE: listf has been closed ***** */
end
RETURN gv.true
/*=======================================================================*/
/* AgendaTop */
/*=======================================================================*/
AgendaTop: Procedure expose gv. agenda.
parse arg agenda
if gv.use_associative then
RETURN(BTAgendaTop())
top = ''
if open(listf,agenda,'Read') then do
if eof(listf) then
top = ''
else
top = readln(listf)
close(listf)
end
RETURN top
/*=======================================================================*/
/* AgendaSearch */
/*=======================================================================*/
AgendaSearch: Procedure expose gv.
parse arg target,file
if open(listf,file,'Read') then do
do while ~eof(listf)
line = readln(listf)
if line = target then do
close(listf)
RETURN gv.true
end
end
close(listf)
end
RETURN gv.false
/*=======================================================================*/
/* PruneUrl */
/*=======================================================================*/
PruneUrl: Procedure expose gv.
parse arg url
seen_previously = gv.false
if exists(gv.prune_file) then do
if AgendaSearch(url,gv.prune_file) then
seen_previously = gv.true
end
RETURN seen_previously
/*=======================================================================*/
/* DoAgenda */
/*=======================================================================*/
DoAgenda: Procedure expose gv. agenda. failed. prune.
parse arg agenda
signal ON break_c
count = 0
url = AgendaTop(agenda)
do while url ~= ''
if left(url,1) = gv.level_marker then do
parse var url dummy gv.depth
RemoveFromAgenda(agenda)
url = AgendaTop(agenda)
iterate
end
parse value ParseUrl(url) with ok','url','method','host','port','path
select
when ~ok then
nop
when (gv.number_iterations ~= '') & (count >= gv.number_iterations) then
break
when (gv.depth >= gv.max_depth) then
break
when PruneUrl(url) then
nop
when ~MatchHost(host,gv.match_host) then
nop
when ~MatchPath(path,gv.match_path) then
nop
when method = 'HTTP' then
do
count = count + 1
if gv.output_file = '' then
fname = MakePathname(host,path)
else
fname = gv.output_file
say '['right(count,4)']' url '->' fname
if ~AppendHTTP(host,port,path,fname,agenda,url) then
AddToFailed(gv.failed_file,url)
else
AddToPruned(gv.prune_file,url)
end
when method = 'FILE' then
do
/* FILE is always on localhost. Any other host uses FTP */
count = count + 1
say '['right(count,4)']' 'reading' url
parse var gv.local_prefix before'|'after
if (right(after,1) ~= ':') & (right(after,1) ~= '/') then
after = after'/'
if pos(':',path) = 0 then
fname = after||path
else
fname = path
if TEXTFILE_TYPE(fname) then
AppendFile(fname,,agenda)
AddToPruned(gv.prune_file,url)
end
otherwise
nop
end
RemoveFromAgenda(agenda)
address command 'wait' gv.delay
url = AgendaTop(agenda)
end
RETURN gv.true
/*=======================================================================*/
/* MatchHost */
/*=======================================================================*/
MatchHost: Procedure expose gv.
parse arg hostname,pattern
if pattern = '' then
RETURN gv.true
if ~gv.use_match then
do
host = hostname
patn = pattern
do while (host ~= '') & (patn ~= '')
parse var host hname '.' host
parse var patn pname '.' patn
if (hname ~= pname) & (pname ~= '*') then
RETURN gv.false
end
if patn = '*' then
RETURN gv.true
if host ~= patn then
RETURN gv.false
end
else
do
address command 'match >ENV:result' '"'pattern'"' '"'hostname'"'
if GetEnvVar('result') = 'yes' then
RET = gv.true
else
RET = gv.false
RETURN RET
end
RETURN gv.true
/*=======================================================================*/
/* MatchPath */
/*=======================================================================*/
MatchPath: Procedure expose gv.
parse arg pathname,pattern
if pattern = '' then
RETURN gv.true
if ~gv.use_match then
do
path = pathname
patn = pattern
do while (path ~= '') & (patn ~= '')
parse var path dname '/' path
parse var patn pname '/' patn
if (dname ~= pname) & (pname ~= '*') then
RETURN gv.false
end
if patn = '*' then
RETURN gv.true
if path ~= patn then
RETURN gv.false
end
else
do
address command 'match >ENV:result' '"'pattern'"' '"'pathname'"'
if GetEnvVar('result') = 'yes' then
RETURN gv.true
else
RETURN gv.false
end
RETURN gv.true
/*=======================================================================*/
/* MakePathname */
/*=======================================================================*/
MakePathname: Procedure expose gv.
parse arg host,path
parse var gv.local_prefix localhost'|'savepath
ret = savepath||host'/'path
if (right(path,1) = '/') | (path = '') then
ret = ret||'index.html'
RETURN ret
/*=======================================================================*/
/* Break_C */
/*=======================================================================*/
Break_C:
say '*** BREAK ***'
say 'Transfer aborted'
if gv.use_associative then do
BTDumpAgendaToFile(gv.agenda_file)
BTDumpFailedToFile(gv.failed_file)
end
EXIT 20
/*=======================================================================*/
/* AppendHTTP */
/*=======================================================================*/
AppendHTTP: Procedure expose gv. agenda. prune. failed.
parse arg host,port,path,filename,agenda,url
signal ON break_c
cache_header = ''
/* check that we can actually proceed */
if ~showlist('H','TCP') then do
say 'TCP: device is not mounted'
RETURN gv.false
end
if gv.output_file = '' then
output_mode = 'Write'
else
do
if ~exists(gv.output_file) then
output_mode = 'Write'
else
output_mode = 'Append'
end
if right(path,1) = '/' then
prefix_path = path
else
do
lastslash = lastpos('/',path)
if lastslash = 0 then
prefix_path = ''
else
prefix_path = left(path,lastslash)
end
/* determine correct socket to open (check for proxy) */
if gv.http_proxy_host ~= '' then
do
/* use proxy host */
socketname = 'tcp:'gv.http_proxy_host'/'gv.http_proxy_port
if port = '' then
do
get_command = 'GET http://'host'/'path
prefix = 'http://'host'|'prefix_path
end
else
do
get_command = 'GET http://'host':'port'/'path
prefix = 'http://'host':'port'|'prefix_path
end
end
else
do
/* no proxy, so can do away with method,host,port */
if port = '' then
do
socketname = 'tcp:'host'/80'
get_command = 'GET /'path
prefix = 'http://'host'|'prefix_path
end
else
do
socketname = 'tcp:'host'/'port
get_command = 'GET /'path
prefix = 'http://'host':'port'|'prefix_path
end
end
OK = gv.false
/* Open socket to host */
if open(socket,socketname,'A') then
do
/* send FullRequest command */
writech(socket,get_command 'HTTP/1.0'||gv.crlf)
writech(socket,'From:' gv.user_address||gv.crlf)
writech(socket,'User-Agent: GetUrl.rexx' gv.version 'by burton@cs.latrobe.edu.au'||gv.crlf)
writech(socket,'Accept: */*, text/plain, text/html'||gv.crlf)
if (gv.http_proxy_host ~= '') & (~gv.use_proxy_cache) then
writech(socket,'Pragma: no-cache'||gv.crlf)
if gv.use_modified_since & exists(filename) then do
parse value statef(filename) with type len blocks prot days mins ticks comment
/*say '<'days'/'mins'>'*/
parse value ApplyTimeZone(days,mins) with days','mins
/*say '<'days'/'mins'>'*/
hr = mins % 60
mn = mins // 60
sc = 0
/*dy = left(date('weekday',days),3)*/
dy = date('weekday',days)
parse value date('normal',days) with dd mmm yyyy
dmy = dd'-'mmm'-'right(yyyy,2)
dt = dy',' dmy right(hr,2,'0')':'right(mn,2,'0')':'right(sc,2,'0')
/*say '{'dt'}'*/
writech(socket,'If-Modified-Since:' dt||gv.crlf)
end
writech(socket,gv.crlf)
/* read first line of response */
if ~eof(socket) then
checkline = readch(socket,8)
OK = gv.true
if checkline = 'HTTP/1.0' then
do
header = gv.false
/* Deal with header */
line = readln(socket)
parse var line response_code rest
if response_code = '200' then
do
OK = gv.true
if gv.save_headers then
hfile = filename'.HDF'
else
hfile = gv.header_file
header = open(headf,hfile,'Write')
end
else
OK = gv.false
/* read off header returned */
do while (~eof(socket)) & (line ~= '') & (line ~= gv.cr) & (line ~= gv.crlf)
line = readln(socket)
if header then
writeln(headf,line)
if gv.max_length ~= '' then do
if upper(left(line,16)) = 'CONTENT-LENGTH: ' then do
parse UPPER var line 'CONTENT-LENGTH: 'size
if size > gv.max_length then do
OK = gv.false
response_code = 'file too large ('size' Bytes)'
end
end
end
end
if header then
close(headf)
end
if OK then
do
CreateDirectories(filename)
if open(new,filename,output_mode) then
do
if HTMLFILE_TYPE(filename) then do
date_string = time('Civil') date('Normal')
original_link = '[<a href="'url'">Original</a>]'
local_link = '[<a href="file://localhost/'filename'">Local copy</a>]'
author_string = 'Saved by GetURL.rexx by <a href="http://www.cs.latrobe.edu.au/~burton/">James Burton</a>'
cache_header = '<h6>Date saved : 'date_string' 'original_link' 'local_link' 'author_string'</h6>'
writeln(new,cache_header)
end
if checkline ~= 'HTTP/1.0' then
writech(new,checkline)
/* byte by byte transfer of the file */
do while ~eof(socket)
buffer = readch(socket,4096)
writech(new,buffer)
end
close(new)
end
else
say 'unable to open file:' filename 'for writing'
end
else
do
say 'HTTP ERROR' response_code':' rest
say 'COMMAND:' get_command
OK = gv.false
end
close(socket)
end
else
do
say 'unable to reach host:' host
say 'socket:' socketname
OK = gv.false
end
if OK then
SearchFile(filename,prefix,agenda,length(cache_header))
RETURN OK
/*=======================================================================*/
/* ApplyTimeZone */
/*=======================================================================*/
ApplyTimeZone: Procedure expose gv.
parse arg days,mins
sign = left(gv.timezone,1)
offset = strip(substr(gv.timezone,2))
mn_offset = right(offset,2)
hr_offset = left(offset,length(offset)-2)
hr = mins % 60
mn = mins // 60
if sign = '+' then
do /* subtract the offset */
if hr_offset > hr then do
days = days - 1
hr = hr + 24
end
hr = hr - hr_offset
if mn_offset > mn then do
hr = hr - 1
mn = mn + 60
end
mn = mn - mn_offset
end
else
do /* add the offset */
hr = hr + hr_offset
if hr > 24 then do
days = days + 1
hr = hr - 24
end
mn = mn + mn_offset
if mn > 60 then do
hr = hr + 1
mn = mn - 60
end
end
mins = hr * 60 + mn
RETURN days','mins
/*=======================================================================*/
/* SearchFile */
/*=======================================================================*/
SearchFile: Procedure expose gv. agenda. prune.
parse arg filename,prefix,agenda,skip_length
signal ON break_c
/* now search it */
if HTMLFILE_TYPE(filename) then do
if open(new,filename,'Read') then do
if skip_length > 0 then
readch(new,skip_length) /* skip the top of the file */
do while ~eof(new)
buffer = readln(new)
SearchLine(buffer,prefix,agenda)
end
if (gv.use_level_marker) & (~gv.use_associative) then
AddToAgenda(agenda,gv.level_marker gv.depth+1,'force')
close(new)
end
end
RETURN gv.true
/*=======================================================================*/
/* CreateDirectories */
/*=======================================================================*/
CreateDirectories: Procedure expose gv.
parse arg filename
signal ON break_c
parse var filename dev ':' path
current_dir = dev':'
do while path ~= ''
parse var path dir '/' path
if right(current_dir,1) = ':' then
current_dir = current_dir||dir
else
current_dir = current_dir'/'dir
if path ~= '' then
if ~exists(current_dir) then
address command 'makedir' current_dir
end
RETURN gv.true
/*=======================================================================*/
/* HTMLFILE_TYPE */
/*=======================================================================*/
HTMLFILE_TYPE: Procedure expose gv.
parse arg path
dot = lastpos('.',path)
if dot = 0 then
do
if right(path,1) = '/' then
RETURN gv.true
else
RETURN gv.false
end
else
do
extension = upper(substr(path,dot,4))
if extension = '.HTM' then
RETURN gv.true
else
RETURN gv.false
end
RETURN gv.true
/*=======================================================================*/
/* TEXTFILE_TYPE */
/*=======================================================================*/
TEXTFILE_TYPE: Procedure expose gv.
parse arg path
dot = lastpos('.',path)
if dot = 0 then
do
if right(path,1) = '/' then
RETURN gv.true
else
RETURN gv.false
end
else
do
extension = upper(substr(path,dot,4))
if (extension = '.TXT') | (extension = '.DOC') | (extension = '.TEX') | (extension = '.HTM') then
RETURN gv.true
else
RETURN gv.false
end
RETURN gv.true
/*=======================================================================*/
/* AppendFile */
/*=======================================================================*/
AppendFile: Procedure expose gv. agenda. prune.
parse arg input,output,agenda
signal ON break_c
out = gv.false
if output ~= '' then do
out = open(outputfile,output,'Write')
if ~out then
say 'unable to open output file :' output
end
if input = '-' then
do
do while ~eof(stdin)
buffer = readln(stdin)
SearchLine(buffer,gv.local_prefix,agenda)
if out then
writeln(outputfile,buffer)
end
end
else
do
if open(inputfile,input,'Read') then
do
do while ~eof(inputfile)
buffer = readln(inputfile)
SearchLine(buffer,gv.local_prefix,agenda)
if out then
writeln(outputfile,buffer)
end
close(inputfile)
end
else
say 'unable to open input file :' input
end
if out then do
close(outputfile)
end
RETURN gv.true
/*=======================================================================*/
/* PrintUsage */
/*=======================================================================*/
PrintUsage:
say 'GetURL' gv.version ': Written by James Burton (burton@cs.latrobe.edu.au)'
say 'USAGE :'
say ' rx geturl [<url>] [<options>]'
say
say '<url> can be'
say ' http:[//]host[:port]/[path]'
say
say '<options> can be'
say ' -Help display this message'
say ' -Problem send message to me, bug report, query etc.'
say ' -NewVersion <file> downloads most recent version of GetURL.rexx'
say ' -PatternMatching downloads pattern matching utility'
say ' -Associative use associative array rather than files'
say ' -Delay <num> delay <num> seconds before next file'
say
say ' -Recursive recursively collect files, following contained URLs'
/*say ' -Directory <dir> recursively collect URLs from text files in <dir>'*/
/*say ' -Update <dir> recursively check files in <dir> for updates'*/
say ' -NoProxy fetch directly from remote site, ignore proxy config'
say ' -NoProxyCache force proxy to reload cache'
say ' -Retry retry previously failed URLs (if -recursive set)'
say
say ' -Host <pattern> only collect files from hosts matching <pattern>'
say ' -Path <pattern> only collect files from paths matching <pattern>'
say ' -URL <pattern> only collect files from URLs matching <pattern>'
say ' -Number <num> only collect <num> files'
say ' -Length <num> only collect files smaller than <num> bytes'
say ' -IfModified only collect files modified since cached'
say ' -Depth <num> only collect files from <num> levels of recursion'
say
say ' -Input <filename> read input from file instead of <url>'
say ' -Input read input from standard input instead of <url>'
say ' -Output <filename> concatenate all files onto <filename>'
say ' -SaveRoot <dir> save files under <dir> rather than Mosaic:'
say ' -Visited <file> use specified visited file, won''t revisit these URLs'
say ' -Unvisited <file> use specified unvisited file, visit these URLs first'
say ' -Failed <file> use specified failed file, these ones did not transfer'
say ' -SaveHeaders save transfer header to *.HDF'
say
say '<options> can all be abbreviated.'
say
RETURN gv.true
/*=======================================================================*/
/* Configuration */
/*=======================================================================*/
Configure: Procedure expose gv.
/* look for HTTP Proxy server */
if GetEnvVar('WWW_HTTP_GATEWAY') ~= '' then
parse value GetEnvVar('WWW_HTTP_GATEWAY') with 'http://'gv.http_proxy_host':'gv.http_proxy_port'/'
else if GetEnvVar('HTTP_PROXY') ~= '' then
parse value GetEnvVar('HTTP_PROXY') with 'http://'gv.http_proxy_host':'gv.http_proxy_port'/'
else
gv.http_proxy_host = ''
/* look for FTP Proxy server */
if GetEnvVar('WWW_FTP_GATEWAY') ~= '' then
parse value GetEnvVar('WWW_FTP_GATEWAY') with 'http://'gv.ftp_proxy_host':'gv.ftp_proxy_port'/'
else if GetEnvVar('FTP_PROXY') ~= '' then
parse value GetEnvVar('FTP_PROXY') with 'http://'gv.ftp_proxy_host':'gv.ftp_proxy_port'/'
else
gv.ftp_proxy_host = ''
/* look for User's login name */
if GetEnvVar('USER') ~= '' then
user = GetEnvVar('USER')
else
do
if GetEnvVar('LOGNAME') ~= '' then
user = GetEnvVar('LOGNAME')
else
user = GetEnvVar('USERNAME')
end
/* look for User's real name */
real = GetEnvVar('REALNAME')
/* look for Machine's name */
if GetEnvVar('HOSTNAME') ~= '' then
host = GetEnvVar('HOSTNAME')
else
do
if GetEnvVar('HOST') ~= '' then
host = GetEnvVar('HOST')
else
host = GetEnvVar('NODENAME')||GetEnvVar('DOMAINNAME')
end
/* construct email address */
gv.user_address = real '<'user'@'host'>'
/* look for match program */
address command 'which >env:result match'
if GetEnvVar('result') ~= '' then
do
r = GetEnvVar('result')
address command 'version >ENV:result' r 'full'
r = GetEnvVar('result')
r = compress(r,'0a'x)
if r = 'Match 1.0 (16/01/95)(16-Jan-95) James Burton (burton@cs.latrobe.edu.au)' then
gv.use_match = gv.true
else
gv.use_match = gv.false
end
else
gv.use_match = gv.false
/* make sure we have a valid timezone */
if gv.timezone = '' then do
say 'Since this is the first time you have run GetURL' gv.version
say 'we need to know the Time Zone you are in relative to GMT (England time)'
say 'Please enter the hrs & mins e.g. +11:00 (AEDT) or +10:30 (ACDT)'
writech(stdout,'Enter TimeZone [+/-]<hr>:<mins> ')
line = readln(stdin)
parse value substr(line,2) with hr ':' mn
sign = left(line,1)
say 'TimeZone is' sign||hr':'mn
gv.timezone = sign||hr||mn
parse source dummy dummy dummy resolved_name dummy
if open(f,resolved_name,'read') then
do
t_tmpname = 't:tmp-'tmpname()
if open(fout,t_tmpname,'Write') then
do
i = 0
do while ~eof(f)
line = readln(f)
if left(line,11) = 'gv.timezone' then
do
writeln(fout,'gv.timezone =' '"'gv.timezone'"')
writech(stdout,'I')
end
else
do
writeln(fout,line)
if (i // 30) = 0 then
writech(stdout,'.')
end
i = i + 1
end
writeln(stdout,'<<')
close(fout)
/*address command 'copy' t_tmpname resolved_name*/
address command 'copy' t_tmpname 't:GetURL-configured.rexx'
address command 'delete >NIL:' t_tmpname
say 'Please use t:GetURL-configured.rexx in future'
end
else
say 'unable to open temp file'
close(f)
end
else
say 'unable to find the program file, please edit the timzone in by hand.'
end
RETURN gv.true
/*=======================================================================*/
/* OpenLibrary */
/*=======================================================================*/
OpenLibrary: Procedure expose gv.
parse arg libname,strict
ret = gv.true
if ~show('Libraries',libname) then do
if ~addlib(libname,0,-30,0) then do
ret = gv.false
if strict then do
say 'unable to open ARexx library :' libname
EXIT 20
end
end
end
RETURN ret
/*=======================================================================*/
/* GetEnvVar */
/*=======================================================================*/
GlobalGetEnv: Procedure expose gv.
parse arg env_variable
if open(varfile,'ENV:'||env_variable,'Read') then
do
line = readln(varfile)
close(varfile)
end
else
line = ''
RETURN line
GetEnvVar: Procedure expose gv.
parse arg env_variable
if show('Libraries','rexxarplib.library') then
do
r = getenv(env_variable)
if right(r,1) = '0a'x then
r = left(r,length(r)-1)
RETURN r
end
else
RETURN GlobalGetEnv(env_variable)
end
RETURN
/*=======================================================================*/
/* ParseUrl */
/*=======================================================================*/
ParseUrl: Procedure expose gv.
parse arg ret_url
signal ON break_c
ok = gv.true
parse var ret_url ret_method':'rest
ret_method = upper(ret_method)
select
when ret_method = 'HTTP' then
do
/* set defaults for HTTP */
ret_host = ''
ret_port = 80
ret_path = ''
parse var rest '//'host_part'/'ret_path
if host_part = '' then /* allow missing '//' */
parse var rest host_part'/'ret_path
parse var host_part ret_host':'ret_port
parse var ret_path ret_path '#' dummy /* these can be safely ignored */
if pos('?',ret_path) = 0 then
ok = gv.true
else
ok = gv.false
end
when ret_method = 'FILE' then
do
/* set defaults for FILE */
ret_host = ''
ret_port = ''
ret_path = ''
parse var rest '//'host_part'/'ret_path
if host_part = '' then
parse var rest host_part'/'ret_path
parse var host_part ret_host':'ret_port
if (ret_host = 'localhost') | (ret_host = '') then
ret_host = 'localhost'
else
ret_method = 'FTP'
ok = gv.true
end
when ret_method = 'FTP' then
do
say 'FTP unimplemented'
ok = gv.false
end
when ret_method = 'MAILTO' then
do
say 'MAILTO unimplemented'
ok = gv.false
end
when ret_method = 'TELNET' then
do
say 'TELNET unimplemented'
ok = gv.false
end
when ret_method = 'NEWS' then
do
say 'NEWS unimplemented'
ok = gv.false
end
when ret_method = 'WAIS' then
do
say 'WAIS unimplemented'
ok = gv.false
end
when ret_method = 'GOPHER' then
do
say 'GOPHER unimplemented'
ok = gv.false
end
otherwise
do
/* not much more we can do if do not recognise the access method */
say 'unknown HTTP Method :' ret_method
ok = gv.false
end
end
RETURN ok','ret_url','ret_method','ret_host','ret_port','ret_path
/*=======================================================================*/
/* TmpName */
/*=======================================================================*/
TmpName:
RETURN date(days)"-"time(seconds)
/*=======================================================================*/
/* SendMail */
/*=======================================================================*/
SendMail: Procedure expose gv.
parse arg openfile,to_address,subject
parse var to_address to_login'@'to_host
parse var gv.user_address real'<'from_login'@'from_host'>'
socketname = 'tcp:'to_host'/smtp'
end_mail = '2e'x
send_timestamp = left(date('weekday'),3) date('normal') time('normal')
message_id = tmpname()'@'from_host
if open(socket,socketname,'Write') then
do
if ~CheckMailResult(socket) then call SendMailQuit
writech(socket,'HELO' from_host||gv.crlf)
if ~CheckMailResult(socket) then call SendMailQuit
writech(socket,'MAIL FROM:' '<'from_login'@'from_host'>'||gv.crlf)
if ~CheckMailResult(socket) then call SendMailQuit
writech(socket,'RCPT TO:' '<'to_address'>'||gv.crlf)
if ~CheckMailResult(socket) then call SendMailQuit
writech(socket,'DATA'||gv.crlf)
if ~CheckMailResult(socket) then call SendMailQuit
writech(socket,'From' from_login'@'from_host send_timestamp||gv.crlf)
writech(socket,'Received: by' from_host '(GetUrl.rexx' gv.version 'burton@cs.latrobe.edu.au)'||gv.crlf)
writech(socket,'Message-Id:' '<'message_id'>'||gv.crlf)
writech(socket,'Date:' send_timestamp '+0000'||gv.crlf)
writech(socket,'From:' gv.user_address||gv.crlf)
writech(socket,'To:' to_address||gv.crlf)
writech(socket,'Subject:' subject||gv.crlf)
writech(socket,'X-Mailer: GetURL.rexx' gv.version 'by James Burton (burton@cs.latrobe.edu.au)'||gv.crlf)
writech(socket,gv.crlf)
address command 'wait 1'
do while ~eof(openfile)
line = readln(openfile)
if line = '.' then
break
writech(socket,line||gv.crlf)
end
writech(socket,end_mail||gv.crlf)
if CheckMailResult(socket) then
say 'Message sent OK.'
SendMailQuit:
writech(socket,'QUIT'||gv.crlf)
close(socket)
end
else
say 'could not open SMTP socket to host:' to_host
RETURN gv.true
CheckMailResult: Procedure expose gv.
parse arg socket
do while ~eof(socket)
line = readln(socket)
/*say '"'line'"'*/
if substr(line,4,1) ~= '-' then
break
end
status_code = left(line,3)
/*say '['status_code']'*/
if status_code > 399 then
RETURN gv.false
else
RETURN gv.true
RETURN gv.true
/*=======================================================================*/
/* AddToFailed */
/* adds item to next failed */
/*=======================================================================*/
BTAddToFailed: Procedure expose gv. failed.
parse arg item
say 'AddToFailed ' item
if failed.item = 0 then /* if not already known */
do
say 'accepted'
next = failed.tail + 1 /* index for this item */
failed.tail = next /* advance tail of failed */
failed.next = item /* store item at tail */
failed.item = 1
end
RETURN gv.true
/*=======================================================================*/
/* AddToPrune */
/* adds item to next prune */
/*=======================================================================*/
BTAddToPrune: Procedure expose gv. prune.
parse arg item
say 'AddToPrune ' item
if prune.item = 0 then /* if not already known */
do
say 'accepted'
next = prune.tail + 1 /* index for this item */
prune.tail = next /* advance tail of prune */
prune.next = item /* store item at tail */
prune.item = 1
end
RETURN gv.true
/*=======================================================================*/
/* AddToAgenda */
/* adds item to tail of agenda */
/*=======================================================================*/
BTAddToAgenda: Procedure expose gv. agenda.
parse arg item
say 'AddToAgenda' item
say agenda.item
if (agenda.item ~= 0) then say 'already known'
if (index(item,'#') ~= 0) then say 'is a # reference'
/* if not already known and not an internal reference */
if (agenda.item = 0) & (index(item,'#') = 0) then
do
say 'accepted'
next_depth = gv.depth + 1 /* next recursion level */
next = agenda.tail + 1 /* index for this item */
agenda.tail = next /* advance tail of agenda */
agenda.next = item /* store item at tail */
agenda.item = next_depth /* and store depth at item */
end
RETURN gv.true
/*=======================================================================*/
/* AgendaTop */
/* return the topmost (first) item in the agenda */
/* and remove it from the agenda */
/*=======================================================================*/
BTAgendaTop: Procedure expose gv. agenda.
say 'AgendaTop'
next = agenda.head /* location of next item */
say 'next ' next
item = agenda.next /* get it */
say 'item ' item
if (item ~= 0) then
do
gv.depth = agenda.item /* set new depth */
say 'depth ' gv.depth
agenda.next = 0 /* drop it from agenda */
agenda.item = 0 /* and forget its depth */
agenda.head = next + 1 /* and advance the head */
end
else
do
item = ''
end
say 'AgendaTop ' item
RETURN item
/*=======================================================================*/
/* AddFileTo */
/* load an array from a file */
/*=======================================================================*/
BTAddFileTo: Procedure expose gv. agenda. prune. failed.
parse arg array,filename
say 'AddFileTo ' array filename
/* performance shouldnt be an issue here */
if open(filep, filename, 'Read') then
do
do while ~eof(filep)
buffer = readln(filep)
if (buffer ~= '') then
select
when (array = prune) then
AddToPrune(buffer)
when (array = agenda) then
AddToAgenda(buffer)
when (Array = failed) then
AddToFailed(buffer)
otherwise
nop
end /* select */
end /* while */
close(filep)
end /* if */
else
do
say 'AddFileTo ' array ' could not read ' filename
end /* else */
say 'AddFileTo exits'
RETURN gv.true
/*=======================================================================*/
/* DumpAgendaTofile */
/* unload agenda to a file */
/*=======================================================================*/
BTDumpAgendaToFile: Procedure expose gv. agenda.
parse arg filename
say 'DumpAgendaToFile ' filename
/* note: we deliberately do NOT allow this to effect the global 'depth' */
depth = 1
if open(filep, filename, 'Write') then
do
buffer = AgendaTop()
do while (buffer ~= '' )
writeln(filep, buffer)
buffer = AgendaTop()
end /* while */
agenda.head = 1
agenda.tail = 0
close(filep)
end /* if */
else
do
say 'DumpAgendaToFile could not write to ' filename
end /* else */
say 'DumpAgendaToFile exits '
RETURN gv.true
/*=======================================================================*/
/* DumpFailedToFile */
/* unload failed to a file */
/*=======================================================================*/
BTDumpFailedToFile: procedure expose gv. failed.
parse arg filename
say 'DumpFailedToFile ' filename
if open(filep, filename, 'Write') then
do
i = 1
buffer = failed.i
do while (buffer ~= '0' )
writeln(filep, buffer)
failed.i = 0
failed.buffer = 0
i = i + 1
buffer = failed.i
end /* while */
failed.tail = 0
close(filep)
end /* if */
else
do
say 'DumpFailedToFile could not write to ' filename
end /* else */
say 'DumpFailedToFile exits'
RETURN gv.true
/*=======================================================================*/
/* End of File 'geturl.rexx' */
/*=======================================================================*/