home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The Datafile PD-CD 5
/
DATAFILE_PDCD5.iso
/
utilities
/
p
/
python
/
!Python
/
Lib
/
NetLib
/
py
/
urllib
< prev
next >
Wrap
Text File
|
1996-10-22
|
22KB
|
758 lines
# Open an arbitrary URL
#
# See the following document for a tentative description of URLs:
# Uniform Resource Locators Tim Berners-Lee
# INTERNET DRAFT CERN
# IETF URL Working Group 14 July 1993
# draft-ietf-uri-url-01.txt
#
# The object returned by URLopener().open(file) will differ per
# protocol. All you know is that it has methods read(), readline(),
# readlines(), fileno(), close() and info(). The read*(), fileno()
# and close() methods work like those of open files.
# The info() method returns a mimetools.Message object which can be
# used to query various info about the object, if available.
# (mimetools.Message objects are queried with the getheader() method.)
import string
import socket
import regex
import os
__version__ = '1.5'
# Platform-specific conversions between URL paths and local pathnames.
# The Mac and Windows ports supply helper modules; on every other
# platform the two forms are the same string.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        # Unix: a URL path is already a usable pathname.
        return pathname
    def pathname2url(pathname):
        # Unix: and vice versa.
        return pathname
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
# (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
# Shortcut for basic usage: one module-wide FancyURLopener instance,
# created lazily the first time it is needed.
_urlopener = None
def urlopen(url):
    # Open url through the shared opener and return a file-like object.
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.open(url)
def urlretrieve(url, filename=None):
    # Shortcut: copy url to a local file (a temporary one when no
    # filename is given) via the shared opener, creating it on demand.
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    if filename:
        return _urlopener.retrieve(url, filename)
    return _urlopener.retrieve(url)
def urlcleanup():
    # Shortcut: discard any temporary files held by the shared opener.
    if _urlopener:
        _urlopener.cleanup()
# Class to open URLs.
# This is a class rather than just a subroutine because we may need
# more than one set of global protocol-specific options.
# Note -- this is a base class for those who don't want the
# automatic handling of errors type 302 (relocated) and 401
# (authorization needed).
# Module-wide FTP connection cache, shared by every URLopener instance
# unless an instance installs its own dict (see URLopener.__init__).
ftpcache = {}
class URLopener:
# Constructor.  proxies maps a URL scheme (e.g. 'http') to a proxy URL;
# when omitted it is taken from the environment via getproxies().
def __init__(self, proxies=None):
    if proxies is None:
        proxies = getproxies()
    self.proxies = proxies
    version = "Python-urllib/%s" % __version__
    self.addheaders = [('User-agent', version)]
    # Undocumented feature: assign {} to tempcache to make retrieve()
    # cache its results.  Off by default -- there is no expiration
    # logic yet, so changing documents would be served stale.
    self.tempcache = None
    # Undocumented feature: assign your own dict to ftpcache for a
    # logically independent opener; by default the module-wide cache
    # is shared.
    self.ftpcache = ftpcache
def __del__(self):
    # Make sure temporary files are removed when the opener goes away.
    self.close()
def close(self):
    # Closing an opener is the same as cleaning up its temp files.
    self.cleanup()
def cleanup(self):
    # Delete every temporary file recorded in tempcache and drop its
    # entries.  Unlink errors (file already gone etc.) are ignored.
    # os is imported locally because this can run during interpreter
    # shutdown via __del__.
    import os
    if self.tempcache:
        for url in list(self.tempcache.keys()):
            filename = self.tempcache[url][0]
            try:
                os.unlink(filename)
            except os.error:
                pass
            del self.tempcache[url]
# Register a header tuple to be sent with HTTP requests, e.g.
#   u.addheader('Accept', 'sound/basic')
def addheader(self, *args):
    self.addheaders.append(args)
# External interface
# Use URLopener().open(file) instead of open(file, 'r')
def open(self, fullurl):
    # Strip '<URL:...>'-style wrapping, then split off the scheme.
    # A URL with no scheme is treated as a local file.
    fullurl = unwrap(fullurl)
    type, url = splittype(fullurl)
    if not type: type = 'file'
    self.openedurl = '%s:%s' % (type, url)
    if self.proxies.has_key(type):
        # A proxy is configured for this scheme: dispatch on the
        # proxy's own scheme instead, and pass the whole original URL
        # along in a tuple so the handler can tell it is proxied.
        proxy = self.proxies[type]
        type, proxy = splittype(proxy)
        host, selector = splithost(proxy)
        url = (host, fullurl) # Signal special case to open_*()
    # Dispatch to the method named open_<type>; dashes in a scheme
    # become underscores so it forms a valid method name.
    name = 'open_' + type
    if '-' in name:
        import regsub
        name = regsub.gsub('-', '_', name)
    if not hasattr(self, name):
        # No handler method: let the overridable fallback decide.
        return self.open_unknown(fullurl)
    try:
        return getattr(self, name)(url)
    except socket.error, msg:
        raise IOError, ('socket error', msg)
# Overridable interface to open unknown URL type
def open_unknown(self, fullurl):
    # Default behavior: report the unrecognized scheme as an IOError.
    type, url = splittype(fullurl)
    raise IOError, ('url error', 'unknown url type', type)
# External interface
# retrieve(url) returns (filename, None) for a local object
# or (tempfilename, headers) for a remote object
def retrieve(self, url, filename=None):
    # Serve from tempcache when enabled; both the raw and the
    # unwrapped form of the URL are tried as keys.
    if self.tempcache and self.tempcache.has_key(url):
        return self.tempcache[url]
    url1 = unwrap(url)
    self.openedurl = url1
    if self.tempcache and self.tempcache.has_key(url1):
        # Alias the raw URL to the cached unwrapped entry.
        self.tempcache[url] = self.tempcache[url1]
        return self.tempcache[url1]
    type, url1 = splittype(url1)
    if not filename and (not type or type == 'file'):
        # Local file with no explicit target name: no copy needed --
        # just verify it opens and hand back its pathname.  On failure
        # fall through to the generic path below.
        try:
            fp = self.open_local_file(url1)
            del fp
            return url2pathname(splithost(url1)[1]), None
        except IOError, msg:
            pass
    # Remote object (or an explicit target filename): copy the data,
    # inventing a temporary filename when none was given.
    fp = self.open(url)
    headers = fp.info()
    if not filename:
        import tempfile
        filename = tempfile.mktemp()
    result = filename, headers
    if self.tempcache is not None:
        self.tempcache[url] = result
    # Copy in 8K blocks until EOF.
    tfp = open(filename, 'w')
    bs = 1024*8
    block = fp.read(bs)
    while block:
        tfp.write(block)
        block = fp.read(bs)
    del fp
    del tfp
    return result
# Each method named open_<type> knows how to open that type of URL
# Use HTTP protocol
def open_http(self, url):
import httplib
if type(url) is type(""):
host, selector = splithost(url)
user_passwd, host = splituser(host)
else:
host, selector = url
urltype, rest = splittype(selector)
if string.lower(urltype) == 'http':
realhost, rest = splithost(rest)
user_passwd, realhost = splituser(realhost)
if user_passwd:
selector = "%s://%s%s" % (urltype,
realhost, rest)
print "proxy via http:", host, selector
if not host: raise IOError, ('http error', 'no host given')
if user_passwd:
import base64
auth = string.strip(base64.encodestring(user_passwd))
else:
auth = None
h = httplib.HTTP(host)
h.putrequest('GET', selector)
if auth: h.putheader('Authorization: Basic %s' % auth)
for args in self.addheaders: apply(h.putheader, args)
h.endheaders()
errcode, errmsg, headers = h.getreply()
fp = h.getfile()
if errcode == 200:
return addinfourl(fp, headers, self.openedurl)
else:
return self.http_error(url,
fp, errcode, errmsg, headers)
# Handle http errors.
# Derived class can override this, or provide specific handlers
# named http_error_DDD where DDD is the 3-digit error code
def http_error(self, url, fp, errcode, errmsg, headers):
    # Prefer a code-specific handler; if it is absent, or it declines
    # by returning a false value, fall back to the default handler.
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        result = getattr(self, name)(url, fp, errcode, errmsg, headers)
        if result:
            return result
    return self.http_error_default(url, fp, errcode, errmsg, headers)
# Default http error handler: close the connection and raises IOError
def http_error_default(self, url, fp, errcode, errmsg, headers):
    # Drain the reply body before closing so the connection shuts
    # down cleanly, then surface the error to the caller.
    void = fp.read()
    fp.close()
    raise IOError, ('http error', errcode, errmsg, headers)
# Use Gopher protocol
def open_gopher(self, url):
    import gopherlib
    host, selector = splithost(url)
    if not host: raise IOError, ('gopher error', 'no host given')
    # The selector carries a leading gopher type code and may include
    # a search query; both parts are unquoted before use.
    type, selector = splitgophertype(selector)
    selector, query = splitquery(selector)
    selector = unquote(selector)
    if query:
        query = unquote(query)
        fp = gopherlib.send_query(selector, query, host)
    else:
        fp = gopherlib.send_selector(selector, host)
    return addinfourl(fp, noheaders(), self.openedurl)
# Use local file or FTP depending on form of URL
def open_file(self, url):
    # A leading '//' means a host part is present, which is handled
    # via the FTP machinery; everything else is a plain local file.
    if url[:2] != '//':
        return self.open_local_file(url)
    return self.open_ftp(url)
# Use local file
def open_local_file(self, url):
    host, file = splithost(url)
    if not host:
        # No host part: open the path directly, without unquoting
        # (note the hosted branch below does unquote).
        return addinfourl(open(url2pathname(file), 'r'), noheaders(), 'file:'+file)
    host, port = splitport(host)
    # Accept a host only when no explicit port was given and the name
    # resolves to this machine; otherwise the file is not local.
    if not port and socket.gethostbyname(host) in (
              localhost(), thishost()):
        file = unquote(file)
        return addinfourl(open(url2pathname(file), 'r'), noheaders(), 'file:'+file)
    raise IOError, ('local file error', 'not on local host')
# Use FTP protocol
def ope