#!/usr/bin/python
#
#Author: Gleeda <jamie.levy@gmail.com>
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version
#2 of the License, or (at your option) any later version.
#
# getmalwaredomains.py
# 
# collects domain/IP/Date/Reverse Lookup information 
#  from malwaredomainlist.com
import os, sys 
import getopt
import httplib
import sqlite3

# Default path of the SQLite database file. main() rebinds this global when
# the -d/--database option is given; init() and process_domains() read it.
DBNAME  = "malwaredomains.db"
# NOTE(review): not referenced by any code visible in this file; presumably
# intended as a wait/timeout value in seconds -- confirm before relying on it.
MAXWAIT = (60*10) # ten minutes 

def usage():
    """Print the command-line help text to standard output."""
    help_lines = (
        'getmalwaredomains.py:',
        '  - collects domain/IP/Date/Reverse Lookup information from malwaredomainlist.com',
        '\t-h, --help     : print help message',
        '\t-d, --database : sqlite3 db to store domain info (default malwaredomains.db)',
        '\t-b, --bulk     : bulk upload up to given number of pages',
        '\t-u, --update   : process entries from update link',
        '\t-a, --add      : add a particular domain\n',
    )
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    for line in help_lines:
        print(line)

class malwaredomains:
    """Fetch listing pages from www.malwaredomainlist.com over plain HTTP.

    Every public method returns the raw response body when that body
    contains the 'Date (UTC)' table header, and None otherwise (after
    printing a diagnostic line).
    """

    HOST = 'www.malwaredomainlist.com'
    # Text whose presence in the body marks a page carrying the listing table.
    TABLE_MARKER = 'Date (UTC)'

    @staticmethod
    def _search_path(domain):
        """Return the request path of the search page for *domain*.

        The search term is percent-encoded so characters such as spaces,
        '&' or '#' cannot corrupt the query string; ordinary host names are
        left unchanged by the encoding.
        """
        try:
            from urllib import quote  # Python 2
        except ImportError:
            from urllib.parse import quote  # Python 3
        return '/mdl.php?search=' + quote(domain, safe='') + '&colsearch=All&quantity=50'

    def _fetch(self, path, url):
        """GET *path* from HOST and return the body if it holds the table.

        *url* is only used in the "Page exists" message. Any error raised
        while connecting or reading is reported and turned into a None
        return, and the connection is always closed.
        """
        conn = None
        try:
            conn = httplib.HTTPConnection(self.HOST)
            conn.request('GET', path)
            response = conn.getresponse().read()
            if response.find(self.TABLE_MARKER) != -1:
                print("Page exists: %s" % (url,))
                return response
            print("Unable to connect!")
        except Exception as e:
            print("Error connecting: %s" % (e,))
        finally:
            if conn is not None:
                conn.close()

        return None

    def adddomain(self, d):
        """Fetch the search-result page for the domain *d* (up to 50 rows)."""
        path = self._search_path(d)
        return self._fetch(path, 'http://' + self.HOST + path)

    def getupdates(self):
        """Fetch the update page (/update.php)."""
        return self._fetch('/update.php',
                           'http://www.malwaredomainlist.com/update.php')

    def getdomains(self, page):
        """Fetch listing page number *page* (100 rows, newest first).

        *page* may be a string or an integer.
        """
        path = ('/mdl.php?inactive=&sort=Date&search=&colsearch=All'
                '&ascordesc=DESC&quantity=100&page=' + str(page))
        return self._fetch(path, 'http://www.malwaredomainlist.com/mdl.php')

class Domains:
    """Extract domain records from the HTML of a malwaredomainlist.com page."""

    def __init__(self, data):
        """Keep the raw page text in *data* and set up the column layout."""
        self.data = data
        # Position of each cell within a table row -> logical column name.
        self.column_names = {0 : 'Date', 1 : 'Domain', 2 : 'IP',  3 : 'Rlookup', 4 : 'Description', 5 : 'Registrant', 6 : 'ASN', 7 : 'COUNTRY'}

    def process_entries(self, str):
        """Return the cell text without <nobr>/<wbr> markup or outer whitespace.

        The parameter keeps its historical name (which shadows the builtin)
        so existing keyword callers continue to work.
        """
        text = str
        for tag in ('<nobr>', '</nobr>', '<wbr>'):
            text = text.replace(tag, '')
        # Strip whitespace only: stripping the *characters* of "<wbr>" would
        # eat leading/trailing 'w', 'b' and 'r' of real host names.
        return text.strip()

    def process_column(self, column, ncol):
        """Parse one table cell.

        *column* is the row text starting right after a '<td'; *ncol* is the
        cell's position in the row. Returns a one-item dict keyed 'dates',
        'domains', 'ips' or 'rlookups', or None when the cell is malformed,
        holds an image, or belongs to a column that is not stored.
        """
        tag_end = column.find('>')
        if tag_end == -1:
            return None
        column = column[tag_end + 1:]

        cell_end = column.find('</td>')
        if cell_end == -1:
            return None

        cell = column[:cell_end]
        if 'img src' in cell:
            return None

        name = self.column_names.get(ncol)
        if name == 'Date':
            # A date cell must contain both a '/' and a ':'.
            if '/' in cell and ':' in cell:
                return {'dates': self.process_entries(cell).replace('_', ' ')}
            return None
        if name == 'Domain':
            return {'domains': self.process_entries(cell).lstrip('.')}
        if name == 'Rlookup':
            return {'rlookups': self.process_entries(cell).rstrip('.')}
        if name == 'IP':
            return {'ips': self.process_entries(cell)}

        return None

    def process_row(self, row):
        """Parse the cells of one row.

        *row* is the table text starting after a row's opening tag. Returns
        a dict merging the results of process_column for every cell, or None
        when no closing '</tr>' is found.
        """
        end_row = row.find('</tr>')
        if end_row == -1:
            return None

        row = row[:end_row]
        row_info = {}
        ncol = 0

        pos = row.find('<td')
        while pos != -1:
            pos += 3  # step over '<td'
            info = self.process_column(row[pos:], ncol)
            if info is not None:
                row_info.update(info)
            # Column positions wrap around after the last known column.
            ncol = (ncol + 1) % len(self.column_names)
            pos = row.find('<td', pos)

        return row_info

    def extract(self):
        """Return a list of row dicts parsed from the listing table.

        The table is taken to run from the 'Date (UTC)' header text to the
        next '</table>'. The first '<tr' found after the header text is
        skipped, and rows that are unterminated or yield no recognised cell
        are left out of the result.
        """
        data = self.data or ''
        table_data = ''
        start_table = data.find('Date (UTC)')
        if start_table != -1:
            end_table = data.find('</table>', start_table)
            if end_table != -1:
                table_data = data[start_table:end_table]

        domain_info = []
        nrow = 0

        pos = table_data.find('<tr')
        while pos != -1:
            pos += 4  # step over '<tr' plus one more character
            if nrow > 0:
                row_info = self.process_row(table_data[pos:])
                if row_info:
                    domain_info.append(row_info)
            nrow += 1
            pos = table_data.find('<tr', pos)

        return domain_info

def init():
    """Make sure the SQLite database DBNAME exists and has a domains table.

    The table is created only if it is missing, so an existing database is
    left untouched while an empty or table-less file becomes usable.
    "Success." or "Failed." is printed only when the database file did not
    exist before the call.
    """
    already_present = os.path.isfile(DBNAME)

    conn = sqlite3.connect(DBNAME)
    try:
        conn.executescript('''
        CREATE TABLE IF NOT EXISTS domains (
                id          INTEGER PRIMARY KEY,
                domain      TEXT,
                ip          TEXT,
                rlookup     TEXT,
                date        TEXT
            );
        ''')
        conn.commit()
    finally:
        # Always release the database handle, even if the script fails.
        conn.close()

    if already_present:
        return

    if os.path.isfile(DBNAME):
        print("Success.")
    else:
        print("Failed.")

def process_domains(domain_info):
    """Insert parsed rows into the domains table of the DBNAME database.

    *domain_info* is a list of dicts with the keys 'domains', 'ips',
    'rlookups' and 'dates'. A row is inserted only when no identical row
    (all four values) is already stored; each row is reported on stdout.
    Entries that are None or lack one of the four keys are reported and
    skipped instead of aborting the whole batch.
    """
    required = ('domains', 'ips', 'rlookups', 'dates')

    conn = sqlite3.connect(DBNAME)
    conn.text_factory = str
    try:
        cur = conn.cursor()

        for info in domain_info or []:
            if not info or any(key not in info for key in required):
                print("SKIPPED incomplete entry: %r" % (info,))
                continue

            values = (info['domains'], info['ips'], info['rlookups'], info['dates'])
            cur.execute("SELECT COUNT(*) FROM domains where domain=? AND ip=? AND rlookup=? AND date=?",
                        values)
            count = cur.fetchone()[0]
            if count < 1:
                cur.execute("INSERT INTO domains VALUES(null, ?, ?, ?, ?)", values)
                print("[domain] %s \n\t[IP]  %s" % (info['domains'], info['ips']))
            else:
                print("NOT ADDED " + info['domains'] + '\n\twith IP: ' + info['ips'] + ' already in database, not added')

        conn.commit()
    finally:
        # Close the handle even when a query fails part-way through.
        conn.close()


def _store_page(data):
    """Parse one fetched page and store its rows; return False if *data* is None."""
    if data is None:
        print("Cannot find url!")
        return False
    process_domains(Domains(data).extract())
    return True


def main():
    """Command-line entry point.

    Parses the options from sys.argv, creates the database if needed and
    then runs, in this order, whichever modes were requested: single-domain
    lookup (-a), update page (-u) and bulk page download (-b N, pages 0..N
    inclusive). Exits with status 2 on bad options or -h, and -1 when no
    mode is selected.
    """
    global DBNAME

    try:
        # "update" must be listed here, otherwise --update is rejected even
        # though it is documented and handled below.
        opts, args = getopt.getopt(sys.argv[1:], "hb:a:d:u",
                                   ["help", "bulk=", "add=", "database=", "update"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    bulk = False
    update = False
    domain = None
    pages = 0

    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit(2)
        elif o in ("-b", "--bulk"):
            try:
                pages = int(a)
            except ValueError:
                print("Invalid page count for --bulk: %s" % (a,))
                sys.exit(2)
            bulk = True
        elif o in ("-a", "--add"):
            domain = a
        elif o in ("-d", "--database"):
            DBNAME = a
        elif o in ("-u", "--update"):
            update = True
        else:
            # getopt only returns declared options, so this is a safety net.
            print("unhandled option: %s" % (o,))
            sys.exit(2)

    if not bulk and domain is None and not update:
        print('You must choose either bulk, update or single addition mode!\n')
        usage()
        sys.exit(-1)

    init()

    md = malwaredomains()

    if domain is not None:
        _store_page(md.adddomain(domain))
    if update:
        _store_page(md.getupdates())
    if bulk:
        # A negative page count yields an empty range, i.e. nothing to do.
        for i in range(pages + 1):
            # Stop at the first page that cannot be fetched.
            if not _store_page(md.getdomains(str(i))):
                return

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()