#!/usr/bin/python
# Feed Normalizer
# http://www.myelin.co.nz/feed_normalizer/
# Copyright (C) 2003-04 Phillip Pearson
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import feedparser, cgi, os, types, sys, httplib2, os.path, stat, time
def esc(s):
# convert to utf-8, and escape
if type(s) == types.UnicodeType:
# if unicode, just convert
s = s.encode('utf-8')
else:
# otherwise see if it's already utf-8
try:
s.decode('utf-8')
except UnicodeError:
# it's not - cross fingers and assume us-ascii
s = s.decode('iso-8859-1').encode('utf-8')
# s is utf-8 by now
return cgi.escape(s, 1)
def prepare(x, keys):
    """Return the values of *keys* from mapping *x* as a tuple of escaped strings.

    A key that is missing from *x* counts as the empty string.  Each value
    has leading and trailing whitespace removed and is then passed through
    esc(), so possibly dodgy feed data comes back ready to drop into XML.
    The tuple is in the same order as *keys*.
    """
    cleaned = []
    for key in keys:
        raw = x.get(key, '')
        cleaned.append(esc(raw.strip()))
    return tuple(cleaned)
def munge(s):
    """Turn *s* (a URL) into a string that is safe to use as a file name.

    Alphanumeric characters and ``_ ( ) , .`` are kept.  Every other
    character is replaced by ``%`` followed by its code in upper-case
    hexadecimal, at least two digits wide (``/`` becomes ``%2F``).

    Hexadecimal is used because every character code below 256 then
    yields exactly two digits.  Decimal codes vary in width, which lets
    two different inputs produce the same name (form feed followed by
    ``3`` and a lone ``{`` would both give ``%123``).
    """
    keep = "_(),."
    pieces = []
    for c in s:
        if c.isalnum() or c in keep:
            pieces.append(c)
        else:
            pieces.append("%%%02X" % ord(c))
    return "".join(pieces)
def _fetch_cached(url):
    """Return the path of a local copy of *url*, refetching it if over an hour old."""
    for d in ("cache", "cache/httplib2", "cache/normalize"):
        if not os.path.exists(d):
            os.mkdir(d)
    cache_fn = os.path.join("cache", "normalize", munge(url))
    stale_before = time.time() - 3600
    if not os.path.exists(cache_fn) or os.stat(cache_fn)[stat.ST_MTIME] < stale_before:
        http = httplib2.Http("cache/httplib2")
        response, content = http.request(url)
        # Binary mode so the body is stored exactly as received, and an
        # explicit close so the data is on disk before the file is parsed.
        out = open(cache_fn, "wb")
        try:
            out.write(content)
        finally:
            out.close()
    return cache_fn


def _pick_content(item, stripdesc):
    """Return the text to use as the description of feed entry *item*."""
    if stripdesc:
        # strip description - this is good for sharpreader as it lets you
        # see the linked article in the preview pane instead of the
        # description from the feed.
        return ''
    # NOTE(review): has_key() is kept on purpose instead of `in`; presumably
    # the installed feedparser resolves its legacy key aliases through
    # has_key() - confirm before changing.
    if item.has_key('content_encoded'):
        # prefer content_encoded over description, as some people put the
        # full content there and only a summary in the description
        return item['content_encoded']
    if item.has_key('content'):
        # failing that, the first content element is often good
        return item['content'][0]['value']
    if item.has_key('description'):
        # couldn't find anything better - just use the description
        return item['description']
    # no content??
    return ''


def norm(url, stripdesc, noisy=0):
    """Fetch the feed at *url* and print it to stdout as an RSS 2.0 CGI response.

    The feed is downloaded with httplib2 (at most once an hour per URL; the
    copy is kept under ``cache/normalize``), parsed with feedparser and
    written out as a ``Content-Type: text/xml`` response.  All text is run
    through esc(), so the document is UTF-8.

    Parameters:
        url       -- address of the feed; must start with ``http://`` or
                     ``https://``.
        stripdesc -- if true, every item is given an empty description.
        noisy     -- if true, the parsed feed object is printed before the
                     response (debugging aid).

    Raises:
        AssertionError -- if *url* does not use http or https.
    """
    # make sure we're not being asked to read a local file.  Raised
    # explicitly rather than with an assert statement so the check is still
    # made when Python runs with -O; the exception type is unchanged.
    if not (url.startswith('http://') or url.startswith('https://')):
        raise AssertionError("unsupported feed url: %r" % (url,))

    # parse it. not too hard when someone else does the parser for you!
    f = feedparser.parse(_fetch_cached(url))
    if noisy:
        print(f)

    # and print out some rss 2.0.  The empty line after the Content-Type
    # header is required: it marks the end of the CGI headers.
    print("""Content-Type: text/xml

<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>%s</title>
<link>%s</link>
<description>%s</description>""" % prepare(f['channel'], ('title', 'link', 'description')))

    for item in f['items']:
        title, guid, link = prepare(item, ('title', 'guid', 'link'))
        print("""<item>
<title>%s</title>""" % title)
        # An item without a guid gets no guid element at all; an empty one
        # would give every such item the same identifier.
        if guid:
            # there's probably a better way, but for the moment, if the
            # guid looks like a url, assume it's a permalink.
            if guid.startswith('http://') or guid.startswith('https://'):
                isperm = ""
            else:
                # XML attribute names are case-sensitive; RSS 2.0 spells
                # this one isPermaLink.
                isperm = ' isPermaLink="false"'
            print("<guid%s>%s</guid>" % (isperm, guid))
        print("<link>%s</link>" % link)
        content = _pick_content(item, stripdesc)
        print("<description>%s</description>" % esc(content.strip()))
        print("</item>")

    print("""</channel>
</rss>""")
def usage():
    """Print a CGI response that redirects the client to the project page.

    Called when the script was not invoked properly (no ``url`` parameter).
    The response is a 301 redirect with a short HTML body naming the
    target for clients that do not follow the Location header.
    """
    # The empty line after Content-Type ends the header block.  Without it
    # the "Go to ..." text would be read as one more (malformed) header.
    print("""Location: %(url)s
Status: 301
Content-Type: text/html

Go to %(url)s ...
""" % {'url': 'http://www.myelin.co.nz/feed_normalizer/',
       })
def cgi_main():
    """Handle one CGI request: read the query parameters and act on them.

    * ``showsource`` set -- print the contents of ``normalize.py`` as
      plain text and stop.
    * ``url`` missing -- print the redirect produced by usage().
    * otherwise -- normalize the feed at ``url``; a non-empty ``stripdesc``
      parameter asks for item descriptions to be emptied.
    """
    fs = cgi.FieldStorage()

    if fs.getfirst('showsource'):
        # the "\n" plus print's own newline gives the empty line that ends
        # the CGI headers
        print("Content-Type: text/plain\n")
        # close the file explicitly rather than leaving the handle for the
        # garbage collector to clean up
        src = open('normalize.py')
        try:
            print(src.read())
        finally:
            src.close()
        return

    url = fs.getfirst('url')
    if url is None:
        usage()
        return

    # norm() expects a 1/0 flag
    if fs.getfirst('stripdesc'):
        stripdesc = 1
    else:
        stripdesc = 0
    norm(url, stripdesc)
def main():
    """Command-line entry point, used for testing outside a web server.

    With an argument, normalize that one URL (descriptions kept).  Without
    arguments, run norm() in noisy mode over a fixed list of sample feeds.
    """
    if len(sys.argv) > 1:
        # stripdesc has no default in norm(), so it must be passed here;
        # 0 keeps the item descriptions, as in the sample run below.
        norm(sys.argv[1], 0)
        return
    # NOTE(review): the first URL appears twice; possibly deliberate, to
    # exercise the cached path on the second fetch - confirm.
    for url in ('http://www.simplegeek.com/blogxbrowsing.asmx/GetRss?',
                'http://alex.halavais.net/news/index.xml',
                'http://boingboing.net/rss.xml',
                'http://www.simplegeek.com/blogxbrowsing.asmx/GetRss?',
                'http://www.myelin.co.nz/post/rss.xml',
                ):
        print("--- trying to normalise %s ---" % url)
        norm(url, 0, noisy=1)
if __name__ == '__main__':
    # SERVER_NAME in the environment is taken to mean we are running as a
    # CGI script; otherwise assume a shell and run the tests instead.
    under_cgi = 'SERVER_NAME' in os.environ
    if under_cgi:
        cgi_main()
    else:
        main()