#!/usr/bin/env python2
"""pyblagg: quick aggregator for Linux/Unix machines.

VI settings: sw=4 ts=4 et

This is an AGGREGATOR, not a news reader.  Most readers out there
still force me to go to each channel I subscribe to and figure out
what the latest news is.  This collects all the channels, aggregates
the news and then displays the latest news first.

Installation and usage:
    - get Mark Pilgrim's ultraliberal parser - see URL below
    - get the log4py module - see URL below
    - get the timeoutsocket.py module as well
      (all these modules are available in CVS)
    - edit the configuration settings in your config.ini file
    - give this script execute permission
    - run it from the command line periodically.  I don't recommend
      running it from CGI.
"""

__author__ = "S Babu"
__date__ = "$Date: 2003/02/28 01:05:28 $"
__credits__ = """Mark Pilgrim, for the liberal RSS parser
http://www.diveintomark.org/archives/2002/08/13.html#ultraliberal_rss_parser
Martin Preishuber, for log4py
http://sourceforge.net/project/showfiles.php?group_id=36216
"""
__version__ = "$Revision: 1.12 $"
#$Source: /cvsroot/spycyroll/pyblagg/pyblagg.py,v $

import sys
import rssparser
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(60)
import os
import time
import rfc822
import re
import string
import glob
import cgi
import threading
from stat import *
from log4py import Logger
import SimpleStripper

# Defaults
DEFAULT_LOG_LEVEL = 2
DEFAULT_CONFIG_FILE = 'config.ini'

# Get the log together.  For the time being, this is module-level; to
# override it you'll need to assign to the module's ``log4py``
# attribute.  I'm strongly encouraging an eventual shift to the
# Python 2.3a1 logging module, but that's tomorrow's change.
log4py = Logger().get_instance()

invalid_stuff = re.compile(r"\W+")
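# A minimal sample config.ini.  The [pyblagg] section and its option
# names below are assumptions matching how the settings are read in
# the __main__ block at the bottom of this file, not a spec:
#
#   [pyblagg]
#   datadir = /home/me/pyblagg/data
#   output = /home/me/public_html/blogroll.html
#   roll_in_utc = 1
#   blog_items_to_show = 50
#   log_level = 2
#
#   [http://example.com/index.rss]
#   name = Example Blog
#
# A blog URL prefixed with '!' is treated as disabled; one prefixed
# with '%' is listed as having no RSS feed.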
def permalink2localfile(link):
    """Given a full URL, make a file name out of it by replacing
    unwelcome characters with periods."""
    filenm = link
    filenm = filenm.replace('http://', '')
    filenm = filenm.replace('www.', '')
    filenm = invalid_stuff.sub('.', filenm)
    filenm = re.sub(r'\.+', '.', filenm)
    return filenm


def parsetime(str_time):
    """Return the time from the string, as seconds since the UTC epoch."""
    str_time = str_time.strip()
    # do we have an rfc2822 date?
    if str_time[3] == ',':
        log4py.debug("\trfc2822 date " + str_time)
        # we have a date of the format Sun, 22 Sep 2002 15:04:54 GMT
        # this assumption doesn't make me feel comfortable. okay!!!
        try:
            yyyy, mm, dd, hh, mi, ss, x, x, x, offset = \
                rfc822.parsedate_tz(str_time)
            # TODO if offset is None, try to use the following to
            # parse properly
            # http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/simpleparse/simpleparse/common/timezone_names.py
            if offset is None:
                offset = 0.0
        except (ValueError, TypeError):
            # parsedate_tz returns None for unparseable dates, and
            # unpacking None raises TypeError
            log4py.error("RSS2 value error " + str_time)
            return None
    else:
        try:
            yyyy, mm, dd, hh, mi, ss = (int(str_time[0:4]),
                                        int(str_time[5:7]),
                                        int(str_time[8:10]),
                                        int(str_time[11:13]),
                                        int(str_time[14:16]),
                                        int(str_time[17:19]))
            # optional timezone offset; slice from position 19 so the
            # sign is kept, e.g. 2002-09-22T15:04:54+05:30 -> +05:30
            if len(str_time) > 20:
                offset = str_time[19:]
                if offset.find(":") > 0:
                    # using split because hours seem to appear in any
                    # format like +12, -10, 10, 1, 02, -2
                    sign = 1
                    if offset[0] == '-':
                        sign = -1
                    if offset[0] in '+-':
                        offset = offset[1:]
                    offset = string.split(offset, ":")
                    offset = sign * (int(offset[0]) * 60.0 +
                                     int(offset[1])) * 60.0  # in seconds
                else:
                    # some feeds don't have minutes in the offset :-)
                    offset = int(offset) * 60.0 * 60.0
            else:
                offset = 0
        except ValueError:
            try:
                # yes, some people put in a _timestamp_
                return float(str_time)
            except ValueError:
                log4py.error("value error " + str_time)
                return None
    # the last -1 is for DST
    # TODO timezone is still not fully considered
    ret = time.mktime((yyyy, mm, dd, hh, mi, ss, 0, 0, -1))
    # this is in seconds; now let us get to UTC
    ret = ret - offset
    return ret
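# Illustrative calls (the values are made up, not from a real feed):
#   parsetime('Sun, 22 Sep 2002 15:04:54 GMT')  # RFC 2822, RSS 2.0 style
#   parsetime('2002-09-22T15:04:54+05:30')      # W3CDTF, dc:date style
#   parsetime('1032706894')                     # a bare timestamp
# Each is meant to return seconds since the UTC epoch (see the TODO
# above about local-timezone handling), or None on failure.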
def titleOrdesc(dict):
    """Return a tuple (title, description) from the dictionary.

    I want a title, then a description, if it exists.  Some feeds
    have a title, some have only a description, some have both.
    """
    title = dict.get('title')
    description = dict.get('description')
    if title is None and description is not None:
        # no title at all; promote the description
        title = description
        description = None
    return (title, description)


class Channel:
    """An RSS channel.

    rss : URL of the RSS or RDF file.  From this URL, it figures out
          the rest:
    title : title as specified in the RSS file, UNLESS you specify it
          while creating the object
    link : URL for the site, as specified in the RSS file
    description : optional textual description
    items[] : collection of NewsItem objects for the items in the feed
    """
    def __init__(self, rss, title=None):
        self.rss = rss
        self.title = title
        self.link = None
        self.description = None
        self.items = []

    def loadChannel(self, rss=None):
        """Download and parse a channel.

        Sets the feed's title, link and description; sets and returns
        items[], the list of NewsItems found.
        """
        if rss is None:
            rss = self.rss
        prss = rssparser.parse(rss)
        channel = prss['channel']
        items = prss['items']
        if 'link' in channel.keys():
            self.link = channel['link']
        else:
            self.link = ''
        title, self.description = titleOrdesc(channel)
        if self.title is None:
            self.title = title
        self.items = []
        for item in items:
            self.items.append(NewsItem(item))
        return self.items

    def htmlLink(self):
        """Return an HTML string that links to the channel."""
        if self.description is not None:
            # the stripped description becomes the link's tooltip
            return '<a href="%s" title="%s">%s</a>' % (
                cgi.escape(self.link, 1),
                '\n\n'.join(SimpleStripper.strip(self.description)),
                self.title)
        else:
            return '<a href="%s">%s</a>' % (
                cgi.escape(self.link, 1), self.title)


class NewsItem:
    """Each item in a channel."""
    def __init__(self, dict):
        self.link = dict.get('link', '')
        self.title, self.description = titleOrdesc(dict)
        if 'date' in dict.keys():
            self.date = parsetime(dict['date'])
        else:
            self.date = None
        self.filenm = permalink2localfile(self.link)


class ChannelLoader(threading.Thread):
    """Fetch one channel in its own thread."""
    def __init__(self, channel, blog):
        threading.Thread.__init__(self)
        self.channel = channel
        self.blog = blog
        self.working = 1
        self.failed = None

    def run(self):
        print "thread running"
        self.failed = not self.fetch()
        print "thread finished"
        self.working = 0

    def fetch(self):
        try:
            self.channel.loadChannel()
            return 1
        except:
            import traceback
            traceback.print_exc()
            log4py.error("Could not download channel => " + self.blog)
            return None
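# A minimal usage sketch (the feed URL here is hypothetical):
#   channel = Channel('http://example.com/index.rss')
#   for item in channel.loadChannel():
#       print item.title, item.link
# ChannelLoader runs the same fetch on a thread: poll ``working``
# until it drops to 0, then check ``failed`` before using ``channel``.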
def loadChannels(config):
    blogroll = {}
    for blog in config.sections():
        if config.has_option(blog, "name"):
            blogroll[blog] = config.get(blog, "name")
        else:
            blogroll[blog] = None
    log4py.info("got blog information")

    # let us first update our mini database
    channels = []
    threads = []
    for blog in blogroll.keys():
        log4py.info("creating channel [%s]" % blog)
        channel = Channel(blog, blogroll[blog])
        thread = ChannelLoader(channel, blog)
        threads.append(thread)
        thread.start()

    still_working = len(threads)
    while 1:
        for i in range(len(threads)):
            thread = threads[i]
            if not thread:
                continue
            if not thread.working:
                print "this one's done"
                still_working -= 1
                if thread.failed:
                    print "failed; not using resulting channel"
                else:
                    channels.append(thread.channel)
                threads[i] = None
        print "%d threads still going" % still_working
        if not still_working:
            print "all finished"
            break
        time.sleep(1)
    log4py.info("collected all channels")

    log4py.info("sorting channel list by name")
    channels.sort(lambda x, y: cmp(x.title, y.title))

    log4py.info("start processing channels")
    for channel in channels:
        log4py.debug("\t channel:" + channel.title)
        # don't process channels that are disabled ('!') or have no
        # RSS feed ('%')
        if channel.rss.startswith('!') or channel.rss.startswith('%'):
            continue
        for item in channel.items:
            filenm = datadir + os.sep + item.filenm
            log4py.debug("\t\t" + item.title[:20] + "[" + filenm + "]")
            if item.date is None:
                if os.path.exists(filenm):
                    item.date = os.stat(filenm)[ST_MTIME]
                    if roll_in_utc:
                        item.date = item.date - time.timezone
                    log4py.warn("[%s]%s - had to use existing file time" % (
                        channel.title, item.title[:20]))
                else:
                    item.date = time.mktime(time.localtime())
                    log4py.warn("[%s]%s - had to use system time" % (
                        channel.title, item.title[:20]))
            else:
                if not roll_in_utc:
                    item.date = item.date - time.timezone
            # XXX we should be storing off the RDF, not the formatted
            # info; it is probably a bad idea to update it always.
            # Then again, people don't put 1001 things in their RDFs.
            entry = '<a href="%s">%s</a>\n' % (cgi.escape(item.link),
                                               item.title)
            if item.description is not None:
                entry = entry + "\n" + item.description
            entry = entry + '<br>\n'
            try:
                fp = open(filenm, "w")
                fp.write(entry)
                fp.close()
            except IOError:
                log4py.error("Could not save => " + filenm)
                continue
            if item.date is not None:
                # stamp the file with the item's date so the roll can
                # be sorted by modification time later
                os.utime(filenm, (item.date, item.date))
    log4py.info("finished processing channels")
    return channels
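# Note: the polling loop above could also be written with
# Thread.join(), which blocks without the one-second wakeups; a
# sketch of that variant:
#   for thread in threads:
#       thread.join()
#   channels = [t.channel for t in threads if not t.failed]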
if __name__ == '__main__':
    if len(sys.argv) > 1:
        config_file = sys.argv[1]
    else:
        config_file = DEFAULT_CONFIG_FILE

    log_level = DEFAULT_LOG_LEVEL
    #log level can be 0 .. 4 as specified in log4py
    #0 = none. 1 = only errors. 2 = normal.
    #3 = verbose. 4 = debug
    #This is overridden later by the configuration file setting
    log4py.set_loglevel(1 << log_level)

    # read the configuration: the [pyblagg] section carries the global
    # settings (see the sample config.ini near the top of this file);
    # every remaining section is a blog URL
    import ConfigParser
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    datadir = config.get('pyblagg', 'datadir')
    output = config.get('pyblagg', 'output')
    roll_in_utc = config.getint('pyblagg', 'roll_in_utc')
    blog_items_to_show = config.getint('pyblagg', 'blog_items_to_show')
    if config.has_option('pyblagg', 'log_level'):
        log_level = config.getint('pyblagg', 'log_level')
        log4py.set_loglevel(1 << log_level)
    config.remove_section('pyblagg')

    channels = loadChannels(config)

    # build the blogroll
    blogroll_string = ''
    for channel in channels:
        if channel.rss.startswith('!'):
            # blog URLs starting with '!' are disabled; leave them out
            continue
        elif channel.rss.startswith('%'):
            # blog URLs starting with '%' don't have RSS
            blogroll_string = blogroll_string + \
                '<a href="%s">%s (no rss)</a><br>' % (
                    cgi.escape(channel.rss[1:], 1), channel.title)
            continue
        else:
            # blog is normal and in the roll
            blogroll_string = blogroll_string + ' '
            blogroll_string = blogroll_string + channel.htmlLink() + "<br>"
" # write the actual output log4py.info("preparing output file " + output) ofp = open(output, "w") ofp.write("""

Demo

This demo might be broken at times...

Subscriptions

%s


Updated on %s

SpycyRoll

Old data can be deleted by:
find path_to_data -mtime +5 -exec rm -f {} \;
for deleting items older than 5 days.

""" % (blogroll_string, time.strftime("%B %d, %Y %I:%M %p", time.localtime())) ) log4py.info("getting channel data") aggritems = glob.glob(datadir + os.sep + "*.*") aggritems.sort(lambda x, y: cmp(os.stat(y)[ST_MTIME], os.stat(x)[ST_MTIME])) prev_blogdate = '' for blogitem in aggritems[:blog_items_to_show]: blogdate = time.strftime("%B %d, %Y", time.localtime(os.stat(blogitem)[8])) if blogdate != prev_blogdate: ofp.write('

'+ blogdate + "

\n") prev_blogdate = blogdate ofp.write("

"+ open(blogitem, "r").read() + "

\n") ofp.write("""""") ofp.close() log4py.info("finished making output " + output)