#an extremely hacky script to generate a list of files from the airmen of note downloads
#http://rewindplay.com/airmenofnote/sounds/sounds.htm

import re
import urllib, urlparse
from optparse import OptionParser
import os

# Any run of whitespace (spaces, tabs, newlines); used to flatten HTML so
# that tags broken across lines can be matched by the patterns below.
whiteout = re.compile(r'\s+')

# Splits a page title of the form "<junk>|?<album>":
#   junk  - everything before the first '|'
#   ..    - the next two characters, whatever they are (e.g. "| ")
#   album - the rest of the line up to the closing </title>
# '.' does not match a newline, so the whole <title> must sit on one line.
title_re = re.compile(r'<title>(?P<junk>[^|]*)..(?P<album>.*)</title>')

# grabs hyperlinks from text
# Only double-quoted href values are recognised.
href_re = re.compile(r'''
<a\b(?P<attrs>[^>]*   # start of tag; \b stops <area>, <abbr>, ... matching
href=(?P<delim>["])   # delimiter
(?P<link>[^"]*)       # link
(?P=delim)            # delimiter
[^>]*)>               # rest of start tag
(?P<content>.*?)      # link content
</a>                  # end tag
''', re.VERBOSE | re.IGNORECASE)

# grabs attribute name, value pairs
# Only double-quoted values are recognised.
attrs_re = re.compile(r'''
(?P<name>\w+)=        # attribute name
(?P<delim>["])        # delimiter
(?P<value>[^"]*)      # attribute value
(?P=delim)            # delimiter
''', re.VERBOSE)

def getTitle(html_data):
    """Return the 'album' group of the first title_re match in html_data.

    Returns None when title_re does not match anywhere in html_data.
    """
    first = title_re.search(html_data)
    if first is None:
        return None
    return first.group('album')

def getLinks(html_data):
    """Return a list of dicts, one per hyperlink href_re finds in html_data.

    Whitespace runs are collapsed to single spaces before matching. Each
    dict holds:
      'href'    - the link target
      'content' - the text between the start and end tags
    followed by one entry per name="value" attribute found in the start
    tag. Attribute entries are stored after the two fixed keys, so an
    attribute with the same name replaces the earlier value.
    """
    flattened = whiteout.sub(' ', html_data)
    anchors = []
    for anchor in href_re.finditer(flattened):
        info = {
            'href': anchor.group('link'),
            'content': anchor.group('content'),
        }
        info.update(
            (attr.group('name'), attr.group('value'))
            for attr in attrs_re.finditer(anchor.group('attrs'))
        )
        anchors.append(info)
    return anchors

# Scheme, credentials and host prepended to every path written to urllist.txt.
# NOTE(review): the username/password are hard-coded in source; consider
# reading them from the environment or a config file instead.
PRE_URL = "http://ianwienand:basie@rewindplay.com"

if __name__ == '__main__':
    parser = OptionParser("usage", version="1.0")

    (options, args) = parser.parse_args()

    # Each positional argument is a saved HTML page; for each one a directory
    # named after the page's album title is created and the track URLs found
    # in the page are written to <title>/urllist.txt.
    # print(...) with a single argument behaves the same as the print
    # statement under Python 2.
    for page_path in args:
        with open(page_path) as page:
            content = page.read()

        raw_title = getTitle(content)
        print("processing %s (%s)" % (page_path, raw_title))
        if raw_title is None:
            # Without a title there is no directory name to work with.
            print("skipping %s: no album title found" % page_path)
            continue
        title = urllib.unquote(raw_title)

        # Tolerate re-runs: only create the directory when it is missing.
        if not os.path.isdir(title):
            os.mkdir(title)

        list_path = os.path.join(title, "urllist.txt")
        try:
            url_list = open(list_path, "w")
        except IOError as err:
            # SystemExit with a string prints it to stderr and exits with
            # status 1.
            raise SystemExit("cannot write %s: %s" % (list_path, err))

        with url_list:
            for link in getLinks(content):
                text = link["content"]
                # Only links whose text is a two-character, non-zero number
                # are written out; everything else is ignored.
                try:
                    if len(text) != 2 or not int(text):
                        continue
                except ValueError:
                    continue

                print(text)
                path = urlparse.urlsplit(urllib.unquote(link["href"]))[2]
                if path.startswith("mp3"):
                    # Paths starting with "mp3" get the sounds directory
                    # prefixed to make them site-absolute.
                    url = "/airmenofnote/sounds/" + path
                else:
                    url = path

                url_list.write(PRE_URL + url + "\n")

