#!/usr/bin/python # Feed Normalizer # http://www.myelin.co.nz/feed_normalizer/ # Copyright (C) 2003-04 Phillip Pearson # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation files # (the "Software"), to deal in the Software without restriction, # including without limitation the rights to use, copy, modify, merge, # publish, distribute, sublicense, and/or sell copies of the Software, # and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import feedparser, cgi, os, types, sys, httplib2, os.path, stat, time def esc(s): # convert to utf-8, and escape if type(s) == types.UnicodeType: # if unicode, just convert s = s.encode('utf-8') else: # otherwise see if it's already utf-8 try: s.decode('utf-8') except UnicodeError: # it's not - cross fingers and assume us-ascii s = s.decode('iso-8859-1').encode('utf-8') # s is utf-8 by now return cgi.escape(s, 1) def prepare(x, keys): # turn some possibly dodgy data into escaped xml without leading or trailing space return tuple([esc(x.get(k, '').strip()) for k in keys]) def munge(s): r = "" for c in s: if c.isalnum() or c in "_(),.": r += c else: r += "%%%02d" % ord(c) return r def norm(url, stripdesc, noisy=0): # do some normalization. not too hard when someone else does the parser for you! # make sure we're not being asked to read a local file assert url.startswith('http://') # parse it for d in ("cache", "cache/httplib2", "cache/normalize"): if not os.path.exists(d): os.mkdir(d) cache_fn = os.path.join("cache", "normalize", munge(url)) if not os.path.exists(cache_fn) or os.stat(cache_fn)[stat.ST_MTIME] < (time.time() - 3600): http = httplib2.Http("cache/httplib2") response, content = http.request(url) open(cache_fn, "wt").write(content) f = feedparser.parse(cache_fn) if noisy: print f # and print out some rss 2.0 print """Content-Type: text/xml %s %s %s""" % prepare(f['channel'], ('title', 'link', 'description')) for item in f['items']: title, guid, link = prepare(item, ('title', 'guid', 'link')) # there's probably a better way, but for the moment, if the guid looks like a url, # assume it's a permalink. if guid.startswith('http://'): isperm = "" else: isperm = ' ispermalink="false"' print """ %s %s %s""" % (title, isperm, guid, link) #from pprint import pprint #pprint(item) if stripdesc: # strip description - this is good for sharpreader as it lets you # see the linked article in the preview pane instead of the description # from the feed. content = '' elif item.has_key('content_encoded'): # prefer content_encoded over description, as some people put full # content in and only a summary in content = item['content_encoded'] elif item.has_key('content'): # failing that, is often good content = item['content'][0]['value'] elif item.has_key('description'): # couldn't find anything - just use the description content = item['description'] else: # no content?? content = '' print """ %s""" % esc(content.strip()) print """ """ print """ """ def usage(): # we weren't called properly print """Location: %(url)s Status: 301 Content-Type: text/html Go to %(url)s ... """ % {'url': 'http://www.myelin.co.nz/feed_normalizer/', } def cgi_main(): # read cgi params, then do stuff fs = cgi.FieldStorage() if fs.getfirst('showsource'): print "Content-Type: text/plain\n" print open('normalize.py').read() return stripdesc = fs.getfirst('stripdesc') and 1 or 0 url = fs.getfirst('url') if url is None: usage() return norm(url, stripdesc) def main(): # some tests import sys if len(sys.argv) > 1: norm(sys.argv[1]) return for url in ('http://www.simplegeek.com/blogxbrowsing.asmx/GetRss?', 'http://alex.halavais.net/news/index.xml', 'http://boingboing.net/rss.xml', 'http://www.simplegeek.com/blogxbrowsing.asmx/GetRss?', 'http://www.myelin.co.nz/post/rss.xml', ): print "--- trying to normalise %s ---" % url norm(url, 0, noisy=1) if __name__ == '__main__': # are we running under cgi? if os.environ.has_key('SERVER_NAME'): cgi_main() else: # nope - run tests instead main()