Windows News 2005 November

home *** CD-ROM | disk | FTP | other *** search

/ Windows News 2005 November / WNnov2005.iso / Windows / Equipement / Media-portal / Setup.msi / _6F5CFA107520147A76D0EEE5A1A7F29D / _1EE7E06543CB417684249FDC75CDC7EE < prev next >

Wrap

Text File | 2004-04-22 | 30.5 KB | 968 lines

#!/usr/bin/env python
# $Id: tv_grab_nl.py,v 1.2 2004/04/22 12:12:53 yamp Exp $
# $Log: tv_grab_nl.py,v $
# Revision 1.2 2004/04/22 12:12:53 yamp
# 22-04-2004: tvguide : fixed : Mediaportal didnt use the timezone settings specified in tvguide.xml
#
# Revision 1.1 2004/04/21 14:56:30 yamp
# no message
#
# Revision 1.1 2004/04/21 05:47:14 yamp
# no message
#
# Revision 1.16 2004/03/04 12:48:15 paul
# Added check for programming with equal begin and end time. Sesamstraat
# now works :-)
#
# Revision 1.15 2003/12/28 10:03:56 paul
# Added --slowdays option. Grabs --slowdays with full info and the rest
# with fast info.
#
# Revision 1.14 2003/11/06 07:12:36 paul
# Python barfed on a regexp with a recursion limit error. Reworked the
# regular expression.
#
# Revision 1.13 2003/11/02 07:54:45 paul
# Fix: loop continues after exception.
#
# Revision 1.12 2003/11/01 21:53:06 paul
# Blah.
#
# Revision 1.11 2003/09/30 11:27:03 paul
# 1) FIXED BUG: two programs with identical end times and close start times could
#    remove programming for the rest of the day.
# 2) FIXED BUG: Removing items from a list did not check for double entries. This
#    could cause the removal of a single program.
# 3) FIXED BUG: the cache counters are now updated correctly. Before, a
#    cached entry would not be counted anymore, now the cleaning and
#    thresholding is more functional. Caching still needs a rethinking.
# 4) MINOR: changed the output while fetching a little bit.
#
# Revision 1.10 2003/09/10 07:16:56 paul
# Long overdue indentation tab->space conversion
# Fixed bug in ampersand handling: &...; tags were allowed to include
# whitespace.
#
# Revision 1.9 2003/08/01 06:50:56 paul
# Config file quick fix
#
# Revision 1.8 2003/07/28 11:26:29 paul
# Small fixes for pages that cannot be downloaded.
#
# Revision 1.7 2003/07/23 11:19:05 paul
# Removed the 250 character limit because it sometimes broke by breaking
# at &
#
# Revision 1.6 2003/07/15 07:08:46 paul
# Removed some of the debugging output.
#
# Revision 1.5 2003/07/14 22:33:46 paul
# More or less perfectly working. Have to clean up some of the debugging
# code.
#
# Revision 1.4 2003/07/13 17:37:15 paul
# Added most of the caching stuff.
#
# Revision 1.3 2003/07/10 06:47:25 paul
# Randomized the fetching order for detail information.
#
# Revision 1.2 2003/07/09 21:08:44 paul
# Fixed the Rembo&Rembo error. Added a little bit of documentation.
#
# Revision 1.1.1.1 2003/07/09 16:26:43 paul
# First working version
#
"""
This is yet another XMLTV grabber based on tvgids.nl. It relies heavily on
parsing the website and if anything changes on the site this script will
probably fail.

Note:
All this is created without any serious knowledge of python or how
to create xmltv stuff. If it works, great! If not, well, send me a patch.

Some minor adaptation of category stuff is made to please mythtv.
"""

import getopt
import os.path
import pickle
import random
import re
import sys
import time
import urllib

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3
    import html.entities as htmlentitydefs

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    _urlopen = urllib.urlopen

try:
    # Python 2 only.  Nothing below uses these any more (str methods are
    # used instead); the import is kept so the module namespace is unchanged.
    from string import replace, split, strip
except ImportError:
    pass

# do extra debug stuff (only when the optional 'redirect' module is present)
debug = 1
try:
    import redirect
except Exception:
    debug = 0

_PY3 = sys.version_info[0] >= 3

# Python 2 keeps the original text-mode pickle files so existing caches stay
# readable; Python 3 pickles require binary mode.
_PICKLE_READ_MODE = 'rb' if _PY3 else 'r'
_PICKLE_WRITE_MODE = 'wb' if _PY3 else 'w'

# globals
tvgids = 'http://www.tvgids.nl/'
uitgebreid_zoeken = tvgids + 'uitgebreidzoeken.html'

# Wait a random number of seconds between each page fetch.
# We want to be nice to tvgids.nl
# Also, it appears tvguid.nl throttles its output.
# So there.
nice_time = [1, 3]

# Create a category translation dictionary
# Look in mythtv/themes/blue/ui.xml for all categories
# The keys are the categories used by tvgids.nl
cattrans = {
    'Film': 'Movies',
    'Amusement': 'Talk',
    'Nieuws/actualiteiten': 'News',
    'Informatief': 'Educational',
    'Comedy': 'Comedy',
    'Serie/soap': 'SERIE',
    'Misdaad': 'Crime/Mystery',
    'Sport': 'Sports',
    'Muziek': 'Music',
    'Documentaire': 'Documentary',
    'Erotiek': 'Adult',
    'Kunst/Cultuur': 'Educational',
    'Wetenschap': 'Science/Nature',
    'Jeugd': 'Kids',
    'Natuur': 'Nature',
    'Animatie': 'Kids',
    'Religieus': 'Religion',
}

# Patterns used by filter_line()
_ENTITY_RE = re.compile(r'(&\S*?;)')
_TAG_RE = re.compile(r'(<.*?>)')


# Work in progress, the idea is to cache program categories and
# descriptions to eliminate a lot of page fetches from tvgids.nl
# for programs that do not have interesting/changing descriptions
class ProgramCache:
    """
    A cache to hold program name and category info.

    self.pdict maps (program_name, category, description) tuples to
    [program_name, category, description, hit_count] lists.
    """

    def __init__(self, filename=None, threshold=3):
        """
        Create the cache; when filename names an existing file it is loaded.

        threshold is the hit count from which query() returns an entry.
        """
        self.threshold = threshold
        self.filename = filename
        self.pdict = {}
        if filename is not None and os.path.isfile(filename):
            self.load(filename)

    def load(self, filename):
        """
        Loads a pickled cache dict from file
        """
        # NOTE: pickle executes whatever the file tells it to; only point
        # this at cache files written by dump().
        with open(filename, _PICKLE_READ_MODE) as fp:
            self.pdict = pickle.load(fp)

    def dump(self, filename):
        """
        Dumps a pickled cache
        """
        with open(filename, _PICKLE_WRITE_MODE) as fp:
            pickle.dump(self.pdict, fp)

    def query(self, program_name):
        """
        Returns the first cached entry for program_name whose hit count has
        reached the threshold, or None when there is no such entry.
        """
        for item in self.pdict.values():
            if program_name == item[0] and item[3] >= self.threshold:
                return item
        return None

    def add(self, program_name, category, description):
        """
        Adds a program_name, category, description tuple, or bumps the hit
        count when exactly that tuple is already cached.
        """
        key = (program_name, category, description)
        if key in self.pdict:
            self.pdict[key][3] += 1
        else:
            self.pdict[key] = [program_name, category, description, 1]

    def clean(self):
        """
        Removes all programming under the current threshold from the cache
        """
        # iterate over a snapshot: entries are deleted while we go
        for key in list(self.pdict.keys()):
            if self.pdict[key][3] < self.threshold:
                del self.pdict[key]

    def info(self):
        """
        Prints some info on the cache
        """
        n = len(self.pdict)
        t = 0.0
        for item in self.pdict.values():
            t += item[3]
            print("%s %s" % (item[3], item[0]))
        # an empty cache has no meaningful average; report 0.0 instead of
        # dividing by zero
        average = 0.0
        if n:
            average = t / n
        print("Average threshold %s %s %s" % (t, n, average))


def usage():
    """
    Prints the command line help.
    """
    print('Yet another grabber for Dutch television')
    print('Usage:')
    print('--help, -h = print this info')
    print('--days = # number of days to grab')
    print('--slow = also grab descriptions of programming')
    print('--slowdays = grab slowdays initial days and the rest in fast mode')
    print('--offset = # day offset from where to grab (0 is today)')
    print('--configure = create configfile (overwrites existing file)')
    print('--output = file where to put the output')
    print('--cache = cache descriptions and use the file to store')
    print('--threshold # = number above which an item is considered to be cached')
    print('--clean_cache = clean the cache file and exit')
    print('--cache_info = print some statistics about the cache and exit')


def _decode_entity(match):
    """
    re.sub callback: the replacement text for one '&name;' match.

    Names listed in htmlentitydefs.entitydefs are replaced by their value;
    anything else (numeric references, unknown names) is left untouched.
    """
    entity = match.group(1)
    # entitydefs is keyed on the bare name, without '&' and ';'
    return htmlentitydefs.entitydefs.get(entity[1:-1], entity)


def filter_line(s):
    """
    Removes unwanted stuff in strings (copied from tv_grab_be)

    Non-breaking-space entities and carriage returns become spaces, named
    HTML entities are decoded, tags are stripped and every remaining '&'
    is escaped as '&amp;' for the xml output.
    """
    # done before the generic decoding so a non-breaking space really ends
    # up as a plain space
    s = s.replace('&nbsp;', ' ')

    # do the latin1 stuff
    s = _ENTITY_RE.sub(_decode_entity, s)

    s = s.replace('\r', ' ')
    s = _TAG_RE.sub('', s)

    # A couple of characters which are not legal in Latin-1, we have
    # to guess what they are.
    s = s.replace('~Q', "'")
    s = s.replace('~R', "'")

    # Hmm, not sure if I understand this. Without it, mythfilldatabase barfs
    # on program names like "Steinbrecher &..."
    s = s.replace('&', '&amp;')

    return s


def delete_from_list(list, items):
    """
    Removes the indices in items from the list (in place)
    ['a', 'b', 'c', 'd' ] [2,1] -> ['a', 'd']
    """
    # unique-ify, sort, and reverse the indices so that we start removing
    # from the back of the list and earlier indices stay valid
    for index in sorted(set(items), reverse=True):
        del list[index]


def time_kludge(s):
    """
    Given a string of "HH:MM" returns a list with hh and mm
    "23:10" -> [23, 10]
    """
    return [int(s[0:2]), int(s[3:5])]


def duration(h1, m1, h2, m2):
    """
    Calculates the duration of a program (24h times)
    in minutes. [h2,m2] can be on the next day.
    duration(23,10,23,15) -> 5
    duration(23,10,0,20) -> 70
    """
    if h2 < h1:
        # the end time lies on the next day
        hd = 23 - h1 + h2
        md = 60 - m1 + m2
    else:
        hd = h2 - h1
        md = m2 - m1
        if md < 0:
            md = 60 + md
            hd = hd - 1

    return hd * 60 + md


def _to_text(data):
    """
    Returns data as a native str; bytes (Python 3) are decoded as latin-1,
    the encoding the generated xml declares.
    """
    if isinstance(data, str):
        return data
    return data.decode('iso-8859-1')


def _fetch_page(url):
    """
    Retrieves the url and returns its contents as a string.

    On any failure a message is written to stderr and None is returned.
    """
    try:
        fp = _urlopen(url)
        try:
            data = fp.read()
        finally:
            fp.close()
    except Exception:
        sys.stderr.write('Cannot open url: %s\n' % url)
        return None
    return _to_text(data)


def get_page(url):
    """
    Retrieves the url and returns a string with the contents.

    When the page cannot be fetched a message is written to stderr and the
    program exits with status 10.
    """
    page = _fetch_page(url)
    if page is None:
        sys.exit(10)
    return page


def get_channels(file):
    """
    Get a list of all available channels and store these
    in a file, one "<number> <name>" line per channel.
    """
    # tvgids stores several instances of channels, we want to
    # find all the possibile channels
    channel_get = re.compile(r'<select.*?name="station">(.*?)</select>',
                             re.DOTALL)

    # this is how we will find a (number, channel) instance
    channel_re = re.compile(r'<option value="([0-9]+)">(.*?)</option>',
                            re.DOTALL)

    # this is where we will try to find our channel list
    total = get_page(uitgebreid_zoeken)
    if total is None:
        return

    # create a dict of number, channel_name pairs
    # we do this this way because several instances of the
    # channel list are stored in the url and not all of the
    # instances have all the channels, this way we get them all.
    channels = {}
    for station in channel_get.finditer(total):
        for option in channel_re.finditer(station.group(0)):
            number = int(option.group(1))
            # this is not naughty, tvgids starts with language selections
            # from 100 onwards, channel 0 does not exist
            if 0 < number < 100:
                channels[number] = option.group(2)

    # and create a file with the channels, sorted on channel number
    # (arbitrary but who cares)
    with open(file, 'w') as f:
        for number in sorted(channels):
            f.write("%s %s\n" % (number, channels[number]))


def get_channel_day(channel, offset):
    """
    Get a day of programming for channel number, with an offset, where
    offset is one of (tvgids contains no info beyond a certain day, 5
    seems a reasonable upper limit)
    0 = today
    1 = tomorrow
    2 = day after tomorrow
    etc

    The output is a list of programming in order where each row
    contains:
    [ start_time, stop_time, detail_url, program_name ]
    start_time and/or stop_time are None when the site does not give them.
    """
    channel_url = ('http://www.tvgids.nl/zoekprogramma.php?'
                   'trefwoord=Titel+of+trefwoord&'
                   'station=%s&genre=alle&interval=%s&timeslot=0&periode=0&order=0'
                   % (channel, offset))

    # get the raw programming for a single day
    total = get_page(channel_url)
    if total is None:
        return

    # check for following pages on the url and fetch those too
    # get the url given for the next page on this page
    following_page = re.compile(
        r'<td\s+class="lijst_zender".*?<[aA].*?[hH][rR][Ee][fF]="(.*?)">'
        r'Volgende</[Aa]>\s*</td>')

    extra_page = total
    while True:
        follow = following_page.search(extra_page)
        if follow is None:
            break
        # best effort: when a follow-up page cannot be fetched we carry on
        # with what we have so far
        extra_page = _fetch_page(tvgids + follow.group(1).strip())
        if extra_page is None:
            break
        total = total + extra_page

    # Setup a number of regexps

    # match <tr>.*</tr>
    getrow = re.compile(r'<[tT][Rr]>(.*?)</[tT][rR]>', re.DOTALL)

    # match the required program info
    # 1 = channel name
    # 2 = times
    # 3 = url for details
    # 4 = program name
    parserow = re.compile(r'class="lijst_zender">(.*?)</td>.*?'
                          r'class="lijst_tijd">(.*?)</td>.*?'
                          r'class="lijst_programma">.*?<[aA]\s+'
                          r'[hH][rR][eE][fF]="(/.*?)".*?'
                          r'class="details_programma">(.*?)</[aA]',
                          re.DOTALL)

    # tvgids.nl uses the following system:
    #
    # 1) normal begin and end times
    times = re.compile(r'([0-9]+:[0-9]+)-([0-9]+:[0-9]+)')

    # 2) a program that starts at the end of another program
    #    (i.e. no start time for second program)
    #    for example:
    #    SBS 6 22:15-00:00 Hart van Nederland
    #    SBS 6      -22:40 Trekking Lotto
    #    meaning that "Trekking Lotto" starts after "Hart van Nederland"
    times_follow = re.compile(r'-([0-9]+:[0-9]+)')

    # 3) nightly progamming (this is even worse)
    #    SBS 6 02:15-00:00 Nachtprogrammering
    #    SBS 6      -00:00 Mobile Nights
    #    SBS 6      -00:00 Stem van Nederland
    #    SBS 6      -      Hart van Nederland
    #    not handled

    # and find relevant programming info
    programs = []
    for row in getrow.finditer(total):
        detail = parserow.search(row.group(1))
        if detail is None:
            continue

        # default times; both stay None for programming that comes
        # without begin and end times
        start_time = None
        stop_time = None

        # parse for begin and end times
        t = times.search(detail.group(2))
        tf = times_follow.search(detail.group(2))
        if t is not None:
            start_time = t.group(1)
            stop_time = t.group(2)
        elif tf is not None:
            stop_time = tf.group(1)

        program_url = tvgids + detail.group(3)
        program_name = detail.group(4).strip()

        # store time, name and detail url in a list
        programs.append([start_time, stop_time, program_url, program_name])

    return programs


def parse_programs(programs, offset):
    """
    Parse a list of programs as generated by get_channel_day() and
    convert begin and end times to xmltv compatible times.

    Programs is a list where each row contains:
    [ start_time, stop_time, detail_url, program_name ]

    Returns the list of usable rows; their start and stop times are
    rewritten in place to 'YYYYMMDDHHMM00' strings.
    """
    good_programs = []
    for program in programs:
        start, stop = program[0], program[1]

        # The common case: start and end times are present and are not
        # equal to each other (yes, this can happen)
        if start is not None and stop is not None and start != stop:
            good_programs.append(program)

        # Check for clumped programming
        elif start is None and stop is not None:
            # This is programming that follows directly
            # after the previous programming.
            # As far as I can see MythTV does not cater for clumpidx, here
            # we concatenate the names, create one program and hope for the best
            if good_programs and good_programs[-1][1] == '00:00':
                # set end time of previous program to
                # end time of this program
                good_programs[-1][1] = stop
                # and adjust program name
                good_programs[-1][3] += ' [ + %s]' % program[3]

        # otherwise the end time is missing or equals the start time: skip

    # adjust last program, this also has the ``benefit'' that it
    # skips allnight sex/mobile phne programming.
    if good_programs:
        last = good_programs[-1]
        if last[1] == '00:00' and last[0][0] == '0':
            del good_programs[-1]

    # So, good programming contains a list of all correct
    # programming with filled in start and end times.
    # All we have to do now is to correct the times for xmltv and to
    # account for a day switch.

    # Check for `motherprograms' i.e. a name given to a grouping of
    # programs, which is not a program itself.
    # Two checks are performed: if begin times match then the first
    # is removed.
    # If the time difference between the start times + the duration
    # of the second program is smaller than the duration of the
    # first program then the first program is removed.
    to_remove = []
    for c in range(len(good_programs) - 1):
        first = good_programs[c]
        following = good_programs[c + 1]

        start1 = time_kludge(first[0])
        stop1 = time_kludge(first[1])
        start2 = time_kludge(following[0])
        stop2 = time_kludge(following[1])
        duration1 = duration(start1[0], start1[1], stop1[0], stop1[1])
        duration2 = duration(start2[0], start2[1], stop2[0], stop2[1])
        duration3 = duration(start1[0], start1[1], start2[0], start2[1])

        # if begin times match, remove the first program
        if first[0] == following[0]:
            to_remove.append(c)

        # e.g.:
        # 09:00-12:00 Z@ppelin
        # 09:03-09:10 Pingu
        # 09:10-09:15 Whatever
        # the first is a mother program, but the start times do
        # not match
        elif (duration3 + duration2) < duration1:
            to_remove.append(c)

        # if end times match, remove first program
        elif first[1] == following[1]:
            to_remove.append(c)

    delete_from_list(good_programs, to_remove)

    # done with checks, now correct the times/dates

    # store enough dates so that an offset is an index into dates
    now = time.time()
    dates = [time.strftime('%Y%m%d', time.gmtime(now + x * 86400))
             for x in range(offset + 2)]

    # and finally, modify the start/end times to xmltv format
    add = 0         # becomes 1 once the schedule has rolled over midnight
    prev_stop = 0
    for program in good_programs:
        start = program[0].replace(':', '')
        stop = program[1].replace(':', '')

        # check for day switch between programs
        if int(start) < prev_stop:
            add = 1

        # assign the new times
        program[0] = dates[offset + add] + start + '00'
        program[1] = dates[offset + add] + stop + '00'

        # check for day switch in program
        if int(stop) < int(start):
            add = 1
            program[1] = dates[offset + add] + stop + '00'

        prev_stop = int(stop)

    # done, nothing to see here, please move on
    return good_programs


def get_descriptions(programs, program_cache=None):
    """
    Given a programs list (from get_channel_day/parse_programs) retrieve
    program information.

    A details dict is appended to every row (in place); it may get the keys
    'Genre:', 'Beschrijving:', 'Inhoud:' and 'Titel aflevering:'.
    When program_cache is given it is consulted first and updated after
    every fetched page.
    """
    detail = re.compile(r'<div class="detailDeel">.*?'
                        r'<div class="detailLabel2">(.*?)</div>.*?'
                        r'<div class="detailContent2">(.*?)</div>',
                        re.DOTALL)

    sys.stderr.write('Descriptions[%s]: ' % len(programs))

    # randomize detail requests
    fetch_order = list(range(len(programs)))
    random.shuffle(fetch_order)

    for i in fetch_order:
        program = programs[i]
        sys.stderr.write('\n%s: %s ' % (i, program[3]))

        # add a dictionary to hold program details
        details = {}
        program.append(details)

        # if we have a cache use it
        if program_cache is not None:
            cached_program = program_cache.query(program[3])
            if cached_program is not None:
                details['Genre:'] = cached_program[1]
                details['Beschrijving:'] = cached_program[2]
                # this adds 1 to the cache usage counter
                # silly structure, need to rethink the caching.
                program_cache.add(program[3], cached_program[1],
                                  cached_program[2])
                sys.stderr.write('cached(%s) ' % cached_program[3])
                continue

        # be nice to tvgids.nl
        time.sleep(random.randint(nice_time[0], nice_time[1]))

        # get the details page; if we cannot fetch it, go to the next
        # program in the loop
        total = _fetch_page(program[2])
        if total is None:
            sys.stderr.write('Oh, thingy\n')
            continue

        # now parse the details
        for description in detail.finditer(total):
            title = description.group(1).strip()
            content = filter_line(description.group(2).strip())

            if content == '':
                continue
            elif title == '':
                details['Beschrijving:'] = content
            elif title == 'Genre:':
                # translate to a mythtv category when we know one
                details[title] = cattrans.get(content, content)
            elif title in ['Inhoud:', 'Titel aflevering:']:
                details[title] = content

        # update the cache if necessary
        if program_cache is not None:
            program_cache.add(program[3],
                              details.get('Genre:', ''),
                              details.get('Beschrijving:', ''))

    sys.stderr.write('done...\n')


def xmlefy_programs(programs, channel):
    """
    Given a list of programming (from parse_programs(), optionally extended
    by get_descriptions()) returns a string with the xml equivalent
    """
    output = []
    for program in programs:
        output.append(' <programme start="%s +0100" stop="%s +0100" channel="%s">\n'
                      % (program[0], program[1], channel))
        output.append(' <title lang="nl">%s</title>\n' % filter_line(program[3]))

        # a fifth element is the details dict added by get_descriptions()
        if len(program) == 5:
            details = program[4]
            if 'Titel aflevering:' in details:
                output.append(' <sub-title lang="nl">%s</sub-title>\n'
                              % details['Titel aflevering:'])

            if 'Inhoud:' in details and 'Beschrijving:' in details:
                output.append(' <desc lang="nl">%s\n%s</desc>\n'
                              % (details['Inhoud:'], details['Beschrijving:']))
            elif 'Inhoud:' in details:
                output.append(' <desc lang="nl">%s</desc>\n' % details['Inhoud:'])
            elif 'Beschrijving:' in details:
                output.append(' <desc lang="nl">%s</desc>\n'
                              % details['Beschrijving:'])

            if 'Genre:' in details:
                output.append(' <category lang="nl">%s</category>\n'
                              % details['Genre:'])

        output.append(' </programme>\n')

    return "".join(output)


def _open_output(path):
    """
    Opens the xml output file for writing.

    Under Python 3 the file is written as ISO-8859-1 (the encoding declared
    in the xml header); characters outside it become character references.
    """
    if _PY3:
        return open(path, 'w', encoding='iso-8859-1',
                    errors='xmlcharrefreplace')
    return open(path, 'w')


def main():
    """
    Command line entry point: parses the options, reads the channel config
    file and prints the xmltv document to stdout (or the --output file).
    """
    # Parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h",
                                   ["help", "output=", "offset=", "days=",
                                    "configure", "slow", "slowdays=",
                                    "cache=", "threshold=", "clean_cache",
                                    "cache_info"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    output = None
    output_file = None
    offset = 0
    days = 7
    slow = 0
    slowdays = 2
    program_cache = None
    program_cache_file = 'program_cache'
    config_file = 'tv_grab_nl_pdb.conf'
    threshold = 3
    clean_cache = 0
    cache_info = 0

    # seed the random generator
    random.seed()

    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit(1)
        elif o == "--configure":
            sys.stderr.write('Creating config file: %s\n' % config_file)
            get_channels(config_file)
            sys.exit(2)
        elif o == "--days":
            days = int(a)
        elif o == "--offset":
            offset = int(a)
        elif o == "--slow":
            slow = 1
        elif o == "--slowdays":
            slowdays = int(a)
            # slowdays implies slow == 0
            slow = 0
        elif o == "--clean_cache":
            clean_cache = 1
        elif o == "--threshold":
            threshold = int(a)
        elif o == "--cache":
            program_cache_file = a
        elif o == "--cache_info":
            cache_info = 1
        elif o == "--output":
            output_file = a
            try:
                output = _open_output(output_file)
                # and redirect output
                if debug:
                    debug_file = open('/tmp/kaas.xml', 'w')
                    sys.stdout = redirect.Tee(output, debug_file)
                else:
                    sys.stdout = output
            except Exception:
                sys.stderr.write('Cannot write to outputfile: %s\n' % output_file)
                sys.exit(10)

    # get configfile if available
    try:
        f = open(config_file, 'r')
    except (IOError, OSError):
        sys.stderr.write('Config file not found.\n')
        sys.stderr.write('Re-run me with the --configure flag.\n')
        sys.exit(1)

    try:
        # check for cache
        program_cache = ProgramCache(program_cache_file, threshold)
        if clean_cache != 0:
            print("Cleaning the cache using threshold = %s\n" % threshold)
            program_cache.clean()
            program_cache.dump(program_cache_file)
            sys.exit(0)
        elif cache_info != 0:
            print("Cache info")
            program_cache.info()
            sys.exit(0)

        # Go!
        channels = {}

        # Read the channel stuff: "<id> <name>" per line, '#' starts a comment
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split()
            channels[fields[0]] = " ".join(fields[1:])
    finally:
        f.close()

    # channels are now in channels dict keyed on channel id

    # print header stuff
    print('<?xml version="1.0" encoding="ISO-8859-1"?>')
    print('<!DOCTYPE tv SYSTEM "xmltv.dtd">')
    print('<tv generator-info-name="Icky Pooh">')

    # first do the channel info
    for key in channels:
        print(' <channel id="%s">' % key)
        print(' <display-name lang="nl">%s</display-name>' % channels[key])
        print(' </channel>')

    num_chans = len(channels)

    # now loop over the days and get the programming
    for x in range(days):
        sys.stderr.write('Day %s of %s\n' % (x, days))

        channel_ids = list(channels)
        random.shuffle(channel_ids)

        for cur_chan, channel_id in enumerate(channel_ids):
            sys.stderr.write('Channel %s of %s\n' % (cur_chan, num_chans))
            try:
                info = get_channel_day(channel_id, offset + x)
                programs = parse_programs(info, offset + x)
                if slow or (slowdays > 0 and x < slowdays):
                    get_descriptions(programs, program_cache)
                print(xmlefy_programs(programs, channel_id))
            except (Exception, SystemExit):
                # get_page() aborts through sys.exit(); catch that as well
                # so the failing channel is reported before we exit
                sys.stderr.write('Could not get channel %s\n' % cur_chan)
                sys.exit(10)

            # be nice to tvgids.nl
            time.sleep(random.randint(2, 5))

    # print footer stuff
    print("</tv>")

    # close the outputfile if necessary
    if output is not None:
        output.close()

    # save the cache if necessary
    if program_cache is not None:
        program_cache.dump(program_cache_file)


# allow this to be a module
if __name__ == '__main__':
    main()

# vim:tw=0:et:sw=4