#!/usr/bin/env python2
"""pyblagg: quick aggregator for Linux/Unix machines.

VI settings: sw=4 ts=4 et

This is an AGGREGATOR, not a news reader.  Most readers out there
still force me to go to each channel I subscribe to and figure out
what the latest news is.  This collects all the channels, aggregates
the news and then displays the latest news first.

Installation and usage:
    - get Mark Pilgrim's ultraliberal parser - see URL below
    - get the log4py module - see URL below
    - get the timeoutsocket.py module as well
      (all these modules are available in CVS)
    - edit the configuration settings in your config.ini file
    - give this script execute permission
    - run it from the command line periodically.  I don't recommend
      running it from CGI.
"""

__author__ = "S Babu"
__date__ = "$Date: 2003/02/28 01:05:28 $"
__credits__ = """Mark Pilgrim, for the liberal RSS parser
http://www.diveintomark.org/archives/2002/08/13.html#ultraliberal_rss_parser
Martin Preishuber, for log4py
http://sourceforge.net/project/showfiles.php?group_id=36216
"""
__version__ = "$Revision: 1.12 $"
#$Source: /cvsroot/spycyroll/pyblagg/pyblagg.py,v $

import sys
import rssparser
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(60)
import os
import time
import rfc822
import re
import string
import glob
import cgi
import threading
from stat import *
from log4py import Logger
import SimpleStripper

# Defaults
DEFAULT_LOG_LEVEL = 2
DEFAULT_CONFIG_FILE = 'config.ini'

# Get the log together.  For the time being, this is module-level; to
# override it you'll need to assign to the module's ``log4py``
# attribute.  I'm strongly encouraging an eventual shift to the
# Python 2.3a1 logging module, but that's tomorrow's change.
log4py = Logger().get_instance()

invalid_stuff = re.compile(r"\W+")
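# A minimal sample config.ini.  The [pyblagg] section and its option
# names below are assumptions matching how the settings are read in
# the __main__ block at the bottom of this file, not a spec:
#
#   [pyblagg]
#   datadir = /home/me/pyblagg/data
#   output = /home/me/public_html/blogroll.html
#   roll_in_utc = 1
#   blog_items_to_show = 50
#   log_level = 2
#
#   [http://example.com/index.rss]
#   name = Example Blog
#
# A blog URL prefixed with '!' is treated as disabled; one prefixed
# with '%' is listed as having no RSS feed.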
def permalink2localfile(link):
    """Given a full URL, make a file name out of it by replacing
    unwelcome characters with periods."""
    filenm = link
    filenm = filenm.replace('http://', '')
    filenm = filenm.replace('www.', '')
    filenm = invalid_stuff.sub('.', filenm)
    filenm = re.sub(r'\.+', '.', filenm)
    return filenm


def parsetime(str_time):
    """Return the time from the string, as seconds since the UTC epoch."""
    str_time = str_time.strip()
    # do we have an rfc2822 date?
    if str_time[3] == ',':
        log4py.debug("\trfc2822 date " + str_time)
        # we have a date of the format Sun, 22 Sep 2002 15:04:54 GMT
        # this assumption doesn't make me feel comfortable. okay!!!
        try:
            yyyy, mm, dd, hh, mi, ss, x, x, x, offset = \
                rfc822.parsedate_tz(str_time)
            # TODO if offset is None, try to use the following to
            # parse properly
            # http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/simpleparse/simpleparse/common/timezone_names.py
            if offset is None:
                offset = 0.0
        except (ValueError, TypeError):
            # parsedate_tz returns None for unparseable dates, and
            # unpacking None raises TypeError
            log4py.error("RSS2 value error " + str_time)
            return None
    else:
        try:
            yyyy, mm, dd, hh, mi, ss = (int(str_time[0:4]),
                                        int(str_time[5:7]),
                                        int(str_time[8:10]),
                                        int(str_time[11:13]),
                                        int(str_time[14:16]),
                                        int(str_time[17:19]))
            # optional timezone offset; slice from position 19 so the
            # sign is kept, e.g. 2002-09-22T15:04:54+05:30 -> +05:30
            if len(str_time) > 20:
                offset = str_time[19:]
                if offset.find(":") > 0:
                    # using split because hours seem to appear in any
                    # format like +12, -10, 10, 1, 02, -2
                    sign = 1
                    if offset[0] == '-':
                        sign = -1
                    if offset[0] in '+-':
                        offset = offset[1:]
                    offset = string.split(offset, ":")
                    offset = sign * (int(offset[0]) * 60.0 +
                                     int(offset[1])) * 60.0  # in seconds
                else:
                    # some feeds don't have minutes in the offset :-)
                    offset = int(offset) * 60.0 * 60.0
            else:
                offset = 0
        except ValueError:
            try:
                # yes, some people put in a _timestamp_
                return float(str_time)
            except ValueError:
                log4py.error("value error " + str_time)
                return None
    # the last -1 is for DST
    # TODO timezone is still not fully considered
    ret = time.mktime((yyyy, mm, dd, hh, mi, ss, 0, 0, -1))
    # this is in seconds; now let us get to UTC
    ret = ret - offset
    return ret
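# Illustrative calls (the values are made up, not from a real feed):
#   parsetime('Sun, 22 Sep 2002 15:04:54 GMT')  # RFC 2822, RSS 2.0 style
#   parsetime('2002-09-22T15:04:54+05:30')      # W3CDTF, dc:date style
#   parsetime('1032706894')                     # a bare timestamp
# Each is meant to return seconds since the UTC epoch (see the TODO
# above about local-timezone handling), or None on failure.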
def titleOrdesc(dict):
    """Return a tuple (title, description) from the dictionary.

    I want a title, then a description, if it exists.  Some feeds
    have a title, some have only a description, some have both.
    """
    title = dict.get('title')
    description = dict.get('description')
    if title is None and description is not None:
        # no title at all; promote the description
        title = description
        description = None
    return (title, description)


class Channel:
    """An RSS channel.

    rss : URL of the RSS or RDF file.  From this URL, it figures out
          the rest:
    title : title as specified in the RSS file, UNLESS you specify it
          while creating the object
    link : URL for the site, as specified in the RSS file
    description : optional textual description
    items[] : collection of NewsItem objects for the items in the feed
    """
    def __init__(self, rss, title=None):
        self.rss = rss
        self.title = title
        self.link = None
        self.description = None
        self.items = []

    def loadChannel(self, rss=None):
        """Download and parse a channel.

        Sets the feed's title, link and description; sets and returns
        items[], the list of NewsItems found.
        """
        if rss is None:
            rss = self.rss
        prss = rssparser.parse(rss)
        channel = prss['channel']
        items = prss['items']
        if 'link' in channel.keys():
            self.link = channel['link']
        else:
            self.link = ''
        title, self.description = titleOrdesc(channel)
        if self.title is None:
            self.title = title
        self.items = []
        for item in items:
            self.items.append(NewsItem(item))
        return self.items

    def htmlLink(self):
        """Return an HTML string that links to the channel."""
        if self.description is not None:
            # the stripped description becomes the link's tooltip
            return '<a href="%s" title="%s">%s</a>' % (
                cgi.escape(self.link, 1),
                '\n\n'.join(SimpleStripper.strip(self.description)),
                self.title)
        else:
            return '<a href="%s">%s</a>' % (
                cgi.escape(self.link, 1), self.title)


class NewsItem:
    """Each item in a channel."""
    def __init__(self, dict):
        self.link = dict.get('link', '')
        self.title, self.description = titleOrdesc(dict)
        if 'date' in dict.keys():
            self.date = parsetime(dict['date'])
        else:
            self.date = None
        self.filenm = permalink2localfile(self.link)


class ChannelLoader(threading.Thread):
    """Fetch one channel in its own thread."""
    def __init__(self, channel, blog):
        threading.Thread.__init__(self)
        self.channel = channel
        self.blog = blog
        self.working = 1
        self.failed = None

    def run(self):
        print "thread running"
        self.failed = not self.fetch()
        print "thread finished"
        self.working = 0

    def fetch(self):
        try:
            self.channel.loadChannel()
            return 1
        except:
            import traceback
            traceback.print_exc()
            log4py.error("Could not download channel => " + self.blog)
            return None
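# A minimal usage sketch (the feed URL here is hypothetical):
#   channel = Channel('http://example.com/index.rss')
#   for item in channel.loadChannel():
#       print item.title, item.link
# ChannelLoader runs the same fetch on a thread: poll ``working``
# until it drops to 0, then check ``failed`` before using ``channel``.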
def loadChannels(config):
    blogroll = {}
    for blog in config.sections():
        if config.has_option(blog, "name"):
            blogroll[blog] = config.get(blog, "name")
        else:
            blogroll[blog] = None
    log4py.info("got blog information")

    # let us first update our mini database
    channels = []
    threads = []
    for blog in blogroll.keys():
        log4py.info("creating channel [%s]" % blog)
        channel = Channel(blog, blogroll[blog])
        thread = ChannelLoader(channel, blog)
        threads.append(thread)
        thread.start()

    still_working = len(threads)
    while 1:
        for i in range(len(threads)):
            thread = threads[i]
            if not thread:
                continue
            if not thread.working:
                print "this one's done"
                still_working -= 1
                if thread.failed:
                    print "failed; not using resulting channel"
                else:
                    channels.append(thread.channel)
                threads[i] = None
        print "%d threads still going" % still_working
        if not still_working:
            print "all finished"
            break
        time.sleep(1)
    log4py.info("collected all channels")

    log4py.info("sorting channel list by name")
    channels.sort(lambda x, y: cmp(x.title, y.title))

    log4py.info("start processing channels")
    for channel in channels:
        log4py.debug("\t channel:" + channel.title)
        # don't process channels that are disabled ('!') or have no
        # RSS feed ('%')
        if channel.rss.startswith('!') or channel.rss.startswith('%'):
            continue
        for item in channel.items:
            filenm = datadir + os.sep + item.filenm
            log4py.debug("\t\t" + item.title[:20] + "[" + filenm + "]")
            if item.date is None:
                if os.path.exists(filenm):
                    item.date = os.stat(filenm)[ST_MTIME]
                    if roll_in_utc:
                        item.date = item.date - time.timezone
                    log4py.warn("[%s]%s - had to use existing file time" % (
                        channel.title, item.title[:20]))
                else:
                    item.date = time.mktime(time.localtime())
                    log4py.warn("[%s]%s - had to use system time" % (
                        channel.title, item.title[:20]))
            else:
                if not roll_in_utc:
                    item.date = item.date - time.timezone
            # XXX we should be storing off the RDF, not the formatted
            # info; it is probably a bad idea to update it always.
            # Then again, people don't put 1001 things in their RDFs.
            entry = '<a href="%s">%s</a>\n' % (cgi.escape(item.link),
                                               item.title)
            if item.description is not None:
                entry = entry + "\n" + item.description
            entry = entry + '<br>\n'
            try:
                fp = open(filenm, "w")
                fp.write(entry)
                fp.close()
            except IOError:
                log4py.error("Could not save => " + filenm)
                continue
            if item.date is not None:
                # stamp the file with the item's date so the roll can
                # be sorted by modification time later
                os.utime(filenm, (item.date, item.date))
    log4py.info("finished processing channels")
    return channels
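# Note: the polling loop above could also be written with
# Thread.join(), which blocks without the one-second wakeups; a
# sketch of that variant:
#   for thread in threads:
#       thread.join()
#   channels = [t.channel for t in threads if not t.failed]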
if __name__ == '__main__':
    if len(sys.argv) > 1:
        config_file = sys.argv[1]
    else:
        config_file = DEFAULT_CONFIG_FILE

    log_level = DEFAULT_LOG_LEVEL
    #log level can be 0 .. 4 as specified in log4py
    #0 = none. 1 = only errors. 2 = normal.
    #3 = verbose. 4 = debug
    #This is overridden later by the configuration file setting
    log4py.set_loglevel(1 << log_level)

    # read the configuration: the [pyblagg] section carries the global
    # settings (see the sample config.ini near the top of this file);
    # every remaining section is a blog URL
    import ConfigParser
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    datadir = config.get('pyblagg', 'datadir')
    output = config.get('pyblagg', 'output')
    roll_in_utc = config.getint('pyblagg', 'roll_in_utc')
    blog_items_to_show = config.getint('pyblagg', 'blog_items_to_show')
    if config.has_option('pyblagg', 'log_level'):
        log_level = config.getint('pyblagg', 'log_level')
        log4py.set_loglevel(1 << log_level)
    config.remove_section('pyblagg')

    channels = loadChannels(config)

    # build the blogroll
    blogroll_string = ''
    for channel in channels:
        if channel.rss.startswith('!'):
            # blog URLs starting with '!' are disabled; leave them out
            continue
        elif channel.rss.startswith('%'):
            # blog URLs starting with '%' don't have RSS
            blogroll_string = blogroll_string + \
                '<a href="%s">%s (no rss)</a><br>' % (
                    cgi.escape(channel.rss[1:], 1), channel.title)
            continue
        else:
            # blog is normal and in the roll
            blogroll_string = blogroll_string + ' '
            blogroll_string = blogroll_string + channel.htmlLink() + "<br>"
" # write the actual output log4py.info("preparing output file " + output) ofp = open(output, "w") ofp.write("""

Demo

This demo might be broken at times...

Subscriptions

%s


Updated on %s

SpycyRoll

Old data can be deleted by:
find path_to_data -mtime +5 -exec rm -f {} \;
for deleting items older than 5 days.

""" % (blogroll_string, time.strftime("%B %d, %Y %I:%M %p", time.localtime())) ) log4py.info("getting channel data") aggritems = glob.glob(datadir + os.sep + "*.*") aggritems.sort(lambda x, y: cmp(os.stat(y)[ST_MTIME], os.stat(x)[ST_MTIME])) prev_blogdate = '' for blogitem in aggritems[:blog_items_to_show]: blogdate = time.strftime("%B %d, %Y", time.localtime(os.stat(blogitem)[8])) if blogdate != prev_blogdate: ofp.write('

'+ blogdate + "

\n") prev_blogdate = blogdate ofp.write("

"+ open(blogitem, "r").read() + "

\n") ofp.write("""""") ofp.close() log4py.info("finished making output " + output)