#!/usr/bin/env python

# Clean up crikey HTML
# (C) Ian Wienand <ianw@ieee.org>
# Public domain

from optparse import OptionParser
import sys

from HTMLParser import HTMLParser
import re
import htmlentitydefs

# Parsed command-line options.  cleanup() rebinds this to the optparse
# result; it is None until then so that the name always exists (a
# module-level "global" statement on its own does not define anything).
options = None

# when these bylines are seen, the story will be stripped if -b is
# passed
strip_bylines = ['Christian Kerr writes']

class BasicHTMLCleaner(HTMLParser):
    """Re-serialise HTML while dropping unwanted parts.

    Everything the parser reports is appended to self.pieces and can be
    retrieved with output(), except that:

    * "style" attributes are removed from tags;
    * comments are never re-emitted;
    * nothing is emitted while self.processing is False.  That flag is
      switched off and on by the advertisement marker comments (see
      handle_comment()) and by disliked bylines (see handle_data()).
    """

    # comment texts (stripped, lower-cased) that open / close a section
    # which must not be copied to the output
    _AD_START_MARKERS = ('begin ad tag',
                         'start crikey ads',
                         'start double click ads')
    _AD_END_MARKERS = ('end ad tag',
                       'end crikey ads',
                       'end double click ads')

    # a decorator that stops functions processing if
    # self.processing is not true; used to strip between tags
    def is_processing(func):
        def wrapper(self, *args, **kw):
            if self.processing:
                return func(self, *args, **kw)
            return None
        wrapper.__name__ = func.__name__
        wrapper.__doc__ = func.__doc__
        # copy rather than alias, so the wrapper and the wrapped
        # function do not share one attribute dictionary
        wrapper.__dict__.update(func.__dict__)
        return wrapper

    def reset(self):
        """Clear the collected output and re-enable processing."""
        self.pieces = []
        # this is fiddled on and off by comments, see
        # handle_comment()
        self.processing = True
        HTMLParser.reset(self)

    def _format_attrs(self, attrs):
        """Return the attribute text for a tag, without "style".

        attrs is the list of (name, value) pairs given by the parser.
        A value of None means the attribute was written without a value
        and is emitted as a bare name.  The parser hands over values
        with entities already decoded, so '&' and '"' are encoded again
        to keep the quoted value well formed.
        """
        parts = []
        for key, value in attrs:
            if key == 'style':
                continue
            if value is None:
                parts.append(' %s' % key)
                continue
            value = value.replace('&', '&amp;').replace('"', '&quot;')
            parts.append(' %s="%s"' % (key, value))
        return "".join(parts)

    @is_processing
    def handle_starttag(self, tag, attrs):
        """Emit an opening tag with its cleaned attributes."""
        self.pieces.append("<%s%s>" % (tag, self._format_attrs(attrs)))

    @is_processing
    def handle_startendtag(self, tag, attrs):
        """Emit a self-closing tag such as <br /> as a single tag.

        Without this the base class reports it as a start tag followed
        by an end tag, which would be written out as <br></br>.
        """
        self.pieces.append("<%s%s />" % (tag, self._format_attrs(attrs)))

    @is_processing
    def handle_endtag(self, tag):
        """Emit a closing tag."""
        self.pieces.append("</%s>" % tag)

    @is_processing
    def handle_charref(self, ref):
        """Emit a numeric character reference unchanged."""
        self.pieces.append("&#%s;" % ref)

    @is_processing
    def handle_entityref(self, ref):
        """Emit a named entity reference.

        The terminating ';' is only written when the name is a known
        HTML entity; unknown names are emitted as '&name' alone.
        """
        self.pieces.append("&%s" % ref)
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    # don't use the decorator, do it by hand here
    def handle_data(self, text):
        """Emit text, toggling processing on story boundaries.

        Text containing "Back to Top" switches processing back on.  If
        the module-level options ask for byline stripping, text
        containing any entry of strip_bylines switches it off.  The
        text itself is emitted only if processing is on afterwards.
        """
        # turn on processing when the story has ended
        if not self.processing and "Back to Top" in text:
            self.processing = True

        # turn off processing if we see any bylines we don't like;
        # tolerate the options not having been set up (e.g. when the
        # class is used without going through cleanup())
        opts = globals().get('options')
        if opts is not None and getattr(opts, 'strip_byline', False):
            for byline in strip_bylines:
                if byline in text:
                    self.processing = False
                    break

        if self.processing:
            self.pieces.append(text)

    def handle_comment(self, text):
        """Switch processing off/on at advertisement marker comments.

        Comments are never copied to the output, marker or not.
        """
        # many of the ads are between comment tags
        marker = text.strip().lower()
        if marker in self._AD_START_MARKERS:
            self.processing = False
        elif marker in self._AD_END_MARKERS:
            self.processing = True

    @is_processing
    def handle_pi(self, text):
        """Emit a processing instruction unchanged."""
        self.pieces.append("<?%s>" % text)

    @is_processing
    def handle_decl(self, text):
        """Emit a declaration (e.g. a DOCTYPE) unchanged."""
        self.pieces.append("<!%s>" % text)

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)

class CrikeyCleaner(BasicHTMLCleaner):
    """Cleaner used for Crikey HTML.

    We don't actually need to do anything beyond BasicHTMLCleaner
    here, but the class is left around for future reference.
    """

    def reset(self):
        """Reset the cleaner; simply defers to BasicHTMLCleaner.reset()."""
        BasicHTMLCleaner.reset(self)

def cleanup():
    """Command-line entry point: clean one HTML document.

    Parses the command line into the module-level "options", reads the
    input (the first positional argument, or stdin when there is
    none), drops every line containing "***Advertisement***", runs the
    rest through CrikeyCleaner and writes the result to the -o/--output
    file, or to stdout when no output file is given.
    """
    global options

    usage = "clean-crikey [-b] [-o output] [input]"
    opt_parser = OptionParser(usage, version=".1")
    opt_parser.add_option("-o", "--output", dest="output",
                          help="Output file name")

    opt_parser.add_option("-b", "--strip-bylines", dest="strip_byline",
                          default=False, action="store_true",
                          help="Strip stories with bylines we don't like")

    (options, args) = opt_parser.parse_args()

    # Read the input completely before the output file is opened, so
    # that a missing or unreadable input does not truncate an existing
    # output file.
    if args:
        f_in = open(args[0], "r")
    else:
        f_in = sys.stdin
    try:
        # preprocess -- some advertisement sections are all on one line
        # so we can strip them out
        kept = [line for line in f_in.readlines()
                if "***Advertisement***" not in line]
    finally:
        # only close what was opened here
        if f_in is not sys.stdin:
            f_in.close()

    cleaner = CrikeyCleaner()
    cleaner.feed("".join(kept))
    cleaner.close()
    result = cleaner.output()

    if options.output is None:
        sys.stdout.write(result)
    else:
        f_out = open(options.output, "w")
        try:
            f_out.write(result)
        finally:
            # make sure the data is flushed and the handle released
            f_out.close()

# Run the cleaner only when this file is executed as a script.
if __name__ == "__main__":
    cleanup()

