import sys, string, re, fileinput, socket, getopt

class LogEntry:
    """Split one access-log line into its fields and hold them.

    The line is cut at every double quote.  The text before the first quote
    supplies the host name (everything before ": ["), the bracketed date and
    the client address (between "] " and the trailing " - -").  The quoted
    sections supply, in order, the request, the referer and the user agent;
    the unquoted text after the request supplies the HTTP response code and
    the byte count.

    Attributes set by the constructor (all strings): input, date, hostname,
    userip, request, httpresponse, bytes, referer, useragent.
    """

    # Compiled once for the class instead of once per parsed line.
    _DATE_RE = re.compile(r"\[.*\]")
    _HOSTNAME_RE = re.compile(r"^.*:\s\[")
    _USERIP_RE = re.compile(r"\]\s.*\s\-")
    _REFERER_RE = re.compile(r"^http://.*?/")
    _REQUEST_RE = re.compile(r"\s/.*\s")

    def __init__(self, input):
        """Parse the raw log line `input`.

        Raises IndexError when the line does not have the expected layout
        (a missing quoted section, or no date/host/address before the
        first quote).
        """
        self.input = input[:]
        # str.split replaces the old string.split() module function, which
        # no longer exists in Python 3.
        fields = input.split('"')
        prefix = fields[0]
        # [1:-1] drops the surrounding square brackets.
        self.date = self._DATE_RE.findall(prefix)[0][1:-1]
        # [:-3] drops the ": [" that ends the match.
        self.hostname = self._HOSTNAME_RE.findall(prefix)[0][:-3]
        # [2:-4] drops the leading "] " and the trailing " - -".
        self.userip = self._USERIP_RE.findall(prefix)[0][2:-4]
        self.request = fields[1]
        # fields[2] looks like " 200 1234 ": element 0 of the split is the
        # empty string in front of the leading space.
        status = fields[2].split(" ")
        self.httpresponse = status[1]
        self.bytes = status[2]
        self.referer = fields[3]
        self.useragent = fields[5]

    def get_date(self):
        """Return the first 11 characters of the date field (the day part)."""
        return self.date[:11]

    def get_time(self):
        """Return characters 12-19 of the date field (the time of day)."""
        return self.date[12:20]

    def get_referer(self):
        """Return the referer cut down to "http://host/", or "Unknown".

        "Unknown" is returned when the referer does not start with
        "http://" followed somewhere by a slash.
        """
        found = self._REFERER_RE.match(self.referer)
        if found is None:
            return "Unknown"
        return found.group(0)

    def get_request(self):
        """Return the requested path, including its surrounding whitespace.

        The match starts at the whitespace in front of the first "/" and
        runs to the last whitespace in the request.  When the request holds
        no such path the raw request text is returned, so that one odd line
        cannot abort a whole report.
        """
        found = self._REQUEST_RE.search(self.request)
        if found is None:
            return self.request
        return found.group(0)

    def get_useragent(self):
        """Return the user-agent string exactly as logged."""
        return self.useragent


class Log:
    """Hold the parsed entries of a whole log, or of a subset of one.

    Every report method returns a list of (name, hit_count) tuples ordered
    from the highest count to the lowest; names with equal counts are in
    descending name order.
    """

    def __init__(self):
        self.entries = []

    def add(self, entry):
        """Parse the raw log line `entry` and append it to the log."""
        self.entries.append(LogEntry(entry))

    def _ranked(self, keys):
        """Count each value in `keys`; return (key, count) pairs, biggest first."""
        counts = {}
        for key in keys:
            counts[key] = counts.get(key, 0) + 1
        # Sorting (count, key) tuples orders by count first, then by key.
        ranked = sorted([(count, key) for key, count in counts.items()],
                        reverse=True)
        return [(key, count) for count, key in ranked]

    def get_ips(self, resolve_name):
        """Return the client addresses that hit us and their hit counts.

        When `resolve_name` is true each distinct address is looked up once
        with socket.gethostbyaddr() and reported under the host name found;
        an address whose lookup fails is reported unchanged.
        """
        known_hosts = {}
        hosts = []
        for entry in self.entries:
            address = entry.userip
            if not resolve_name:
                hosts.append(address)
                continue
            if address not in known_hosts:
                print("Reverse looking up " + address)
                try:
                    known_hosts[address] = socket.gethostbyaddr(address)[0]
                except Exception:
                    # Best effort: keep the bare address when the lookup
                    # fails.  Not a bare "except", so Ctrl-C still
                    # interrupts a long run of lookups.
                    known_hosts[address] = address
            # Always read the cache, so an address seen before is credited
            # to its own host and not to whatever the previous entry used.
            hosts.append(known_hosts[address])
        return self._ranked(hosts)

    def get_hits_by_date(self):
        """Return (date, hits) pairs, ordered by hit count (not by date)."""
        return self._ranked([entry.get_date() for entry in self.entries])

    def get_total_hit_count(self):
        """Return the total number of hits."""
        return len(self.entries)

    def get_most_hit(self, num):
        """Return the `num` most requested pages with their hit counts.

        Fewer than `num` pairs are returned when the log holds fewer
        distinct pages; an empty log, or a `num` of zero or less, gives an
        empty list.
        """
        ranked = self._ranked([entry.get_request() for entry in self.entries])
        return ranked[:max(num, 0)]

    def get_user_agents(self):
        """Return the user agents seen and their hit counts."""
        return self._ranked([entry.get_useragent() for entry in self.entries])


class html_table:
    """Create a fairly basic HTML table from a list of (name, value) tuples."""

    def __init__(self, title, list, bgcolor = '008800', width = '100%'):
        """Remember the table settings.

        title   -- heading shown in the first row; inserted as-is, so it
                   may contain markup
        list    -- sequence of (name, value) pairs, one per table row; a
                   shallow copy is kept
        bgcolor -- background colour of the heading row, hex without "#"
        width   -- value for the table's WIDTH attribute
        """
        self.title   = title
        self.list    = list[:]
        self.bgcolor = bgcolor
        self.width   = width

    def _cell_text(self, value):
        """Return repr(value) with &, < and > replaced by HTML entities."""
        # The cells carry text taken from log files (user agents, request
        # paths), so markup characters must not reach the page unescaped.
        # "&" is replaced first so the entities added next stay intact.
        text = repr(value)
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def print_table(self):
        """Return the table as one HTML string (nothing is printed).

        Each name and value is shown as its repr().  Rows alternate between
        no background and a light grey one, starting with no background.
        """
        parts = [
            '<TABLE BORDER="0" CELLPADDING="0" CELLSPACING="0" WIDTH="' + self.width + '">',
            '<TR><TD WIDTH="100%" colspan="2" bgcolor="#' + self.bgcolor + '">',
            # Tags are closed in nesting order and the heading row is ended
            # before the data rows begin.
            '<FONT COLOR="FFFFFF"><B><BIG>' + self.title + '</BIG></B></FONT></TD></TR>',
        ]

        shaded = 0
        for name, value in self.list:
            if shaded:
                cell = '<TD bgcolor="#efefef">'
            else:
                cell = '<TD>'
            parts.append('<TR>' + cell + self._cell_text(name) + '</TD>'
                         + cell + self._cell_text(value) + '</TD></TR>')
            shaded = not shaded

        parts.append('</TABLE>')
        return ''.join(parts)
            


# Command-line entry point
def main():
    """Parse the command line, read the log files and write the HTML report.

    Options: -r turns on reverse lookup of client addresses, -f names the
    output file (default "output.html").  The remaining arguments are
    handed to fileinput as the log files to read.  Log lines that cannot be
    parsed are skipped and counted.

    Exits with status 2 on a usage error, when the output file cannot be
    opened, or when an input file cannot be read.
    """
    if len(sys.argv) == 1:
        usage()
        sys.exit(2)

    try:
        opts, args = getopt.getopt(sys.argv[1:], "rf:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Defaults, overridden by the options below.
    reverse_lookup = 0
    file_name = "output.html"
    for option, value in opts:
        if option == "-r":
            reverse_lookup = 1
        elif option == "-f":
            file_name = value

    try:
        f = open(file_name, 'w')
    except IOError:
        print("Can't open output file!")
        sys.exit(2)

    # From here on the output file is open; the finally clause closes it on
    # every way out, including the sys.exit() below.
    try:
        mylog = Log()
        skipped = 0
        try:
            for line in fileinput.input(args):
                if fileinput.isfirstline():
                    # Two spaces: matches what the old two-item print wrote.
                    print("Processing  " + fileinput.filename())
                try:
                    mylog.add(line)
                except IndexError:
                    # LogEntry raises IndexError for a line that does not
                    # have the expected layout (a blank line, for example).
                    skipped = skipped + 1
        except IOError:
            print("Can not process file " + str(fileinput.filename()))
            sys.exit(2)
        print("Read " + repr(fileinput.lineno()) + " lines")
        if skipped:
            print("Skipped " + repr(skipped) + " malformed lines")

        print("Outputting to " + file_name)

        f.write( '<html><head><title>Logs for wienand.org</title></head><body>' )
        f.write( '<center><font color="#FF0000" size="+2">Logs for wienand.org</font></center>' )
        f.write( '<br><br>' )
        f.write( '<P>Total hits : <b><font color="#FF0000">' )
        f.write( repr(mylog.get_total_hit_count()) )
        f.write( '</B></font><br><br></p>' )
        most_hits = html_table("20 Most Hit Pages", mylog.get_most_hit(20)).print_table()
        f.write( most_hits )
        f.write( '<BR><TABLE WIDTH="100%"><TR><TD WIDTH="50%" ROWSPAN="2">' )
        iphits = html_table( "IP Addresses that hit us", mylog.get_ips(reverse_lookup), "FF0000" ).print_table()
        f.write( iphits )
        f.write( '</TD><TD WIDTH="50%" VALIGN="TOP">' )
        datehits = html_table("Hits by Date", mylog.get_hits_by_date(), "0000FF" ).print_table()
        f.write( datehits )
        f.write( '</TD></TR><TR><TD VALIGN="TOP">' )
        useragents = html_table( "User Agents" , mylog.get_user_agents() , "800080" ).print_table()
        f.write( useragents )
        f.write( '</TD></TR></TABLE>' )
        f.write( '<BR>' )
        f.write( '</body></html>' )
    finally:
        f.close()
    
def usage():
    """Print the command-line synopsis and option summary to standard output."""
    # A parenthesised single-argument print is valid, and prints the same
    # text, on both Python 2 and Python 3.
    print("""
logalyser [-r] [-f output.html] file [file2 ... filen]
   -r      : do reverse lookup on IP addresses
   (WARNING: this might take a while ...)
   -f file : output to this file (default output.html)
   file [file2 ... filen] : fasthost.co.uk input log files
""")
        
# Build the report only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
    

