diff --git a/.gitignore b/.gitignore
index e314318..4ab6c9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@
 venv
 maltrieve.out
 archive
+grequests
diff --git a/MultiPartForm.py b/MultiPartForm.py
deleted file mode 100644
index 0af6d96..0000000
--- a/MultiPartForm.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import itertools
-import mimetools
-import mimetypes
-import urllib
-import urllib2
-
-
-class MultiPartForm(object):
-    """Accumulate the data to be used when posting a form."""
-
-    def __init__(self):
-        self.form_fields = []
-        self.files = []
-        self.boundary = mimetools.choose_boundary()
-        return
-
-    def get_content_type(self):
-        return 'multipart/form-data; boundary=%s' % self.boundary
-
-    def add_field(self, name, value):
-        """Add a simple field to the form data."""
-        self.form_fields.append((name, value))
-        return
-
-    def add_file(self, fieldname, filename, fileHandle, mimetype=None):
-        """Add a file to be uploaded."""
-        body = fileHandle.read()
-        if mimetype is None:
-            mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
-        self.files.append((fieldname, filename, mimetype, body))
-        return
-
-    def __str__(self):
-        """Return a string representing the form data, including attached files."""
-        # Build a list of lists, each containing "lines" of the
-        # request. Each part is separated by a boundary string.
-        # Once the list is built, return a string where each
-        # line is separated by '\r\n'.
-        parts = []
-        part_boundary = '--' + self.boundary
-
-        # Add the form fields
-        parts.extend(
-            [ part_boundary,
-              'Content-Disposition: form-data; name="%s"' % name,
-              '',
-              value,
-            ]
-            for name, value in self.form_fields
-            )
-
-        # Add the files to upload
-        parts.extend(
-            [ part_boundary,
-              'Content-Disposition: file; name="%s"; filename="%s"' % \
-                  (field_name, filename),
-              'Content-Type: %s' % content_type,
-              '',
-              body,
-            ]
-            for field_name, filename, content_type, body in self.files
-            )
-
-        # Flatten the list and add closing boundary marker,
-        # then return CR+LF separated data
-        flattened = list(itertools.chain(*parts))
-        flattened.append('--' + self.boundary + '--')
-        flattened.append('')
-        return '\r\n'.join(flattened)
diff --git a/README.md b/README.md
index 91a9511..e24a854 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@ Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mw
 * [Malc0de](http://malc0de.com/rss)
 * [Malware Black List](http://www.malwareblacklist.com/mbl.xml)
 * [Malware Domain List](http://www.malwaredomainlist.com/hostslist/mdl.xml)
-* [Sacour.cn](http://www.sacour.cn)
 * [VX Vault](http://vxvault.siri-urz.net/URL_List.php)
 * [URLqery](http://urlquery.net/)
 * [CleanMX](http://support.clean-mx.de/clean-mx/xmlviruses.php?)
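A note on the MultiPartForm.py deletion above: requests assembles multipart/form-data bodies itself whenever it is handed a files= mapping, which is presumably why the hand-rolled helper could be dropped; the upload functions later in this diff already rely on that. A minimal sketch of the same kind of upload done directly with requests (the sample path is a placeholder, not taken from the diff; the endpoint matches the one upload_vxcage posts to):

import requests

sample_path = '/tmp/sample.bin'  # placeholder path for illustration only
with open(sample_path, 'rb') as fh:
    files = {'file': ('sample.bin', fh)}
    # requests picks the boundary and builds the multipart body on its own
    response = requests.post('http://localhost:8080/malware/add',
                             headers={'User-agent': 'Maltrieve'},
                             files=files)
print response.status_code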
diff --git a/maltrieve.py b/maltrieve.py
index 621a710..f3d5f1f 100755
--- a/maltrieve.py
+++ b/maltrieve.py
@@ -21,6 +21,7 @@
 import argparse
 import datetime
 import feedparser
+import grequests
 import hashlib
 import json
 import logging
@@ -37,49 +38,8 @@
 from bs4 import BeautifulSoup
 
 
-def get_malware(q, dumpdir):
-    while True:
-        url = q.get()
-        logging.info("Fetched URL %s from queue", url)
-        logging.info("%s items remaining in queue", q.qsize())
-        try:
-            logging.info("Requesting %s" % url)
-            mal_req = requests.get(url, proxies=cfg['proxy'], timeout=10)
-        except requests.ConnectionError as e:
-            logging.info("Could not connect to %s: %s" % (url, e))
-            break
-        except requests.Timeout as e:
-            logging.info("Timeout waiting for %s: %s" % (url, e))
-            break
-        mal = mal_req.content
-        if mal:
-            # TODO: store these in the JSON DB
-            if 'logheaders' in cfg:
-                logging.info("Returned headers for %s: %r" % (url, mal_req.headers))
-            md5 = hashlib.md5(mal).hexdigest()
-            # Is this a big race condition problem?
-            if md5 not in hashes:
-                logging.info("Found file %s at URL %s", md5, url)
-                if not os.path.isdir(dumpdir):
-                    try:
-                        logging.info("Creating dumpdir %s", dumpdir)
-                        os.makedirs(dumpdir)
-                    except OSError as exception:
-                        if exception.errno != errno.EEXIST:
-                            raise
-                with open(os.path.join(dumpdir, md5), 'wb') as f:
-                    f.write(mal)
-                logging.info("Stored %s in %s", md5, dumpdir)
-                print "URL %s stored as %s" % (url, md5)
-                if 'vxcage' in cfg:
-                    store_vxcage(os.path.join(dumpdir, md5))
-                if 'cuckoo' in cfg:
-                    submit_cuckoo(os.path.join(dumpdir, md5))
-                hashes.add(md5)
-        q.task_done()
-
-
-def store_vxcage(filepath):
+# TODO: use response, not filepath
+def upload_vxcage(filepath):
     if os.path.exists(filepath):
         files = {'file': (os.path.basename(filepath), open(filepath, 'rb'))}
         url = 'http://localhost:8080/malware/add'
@@ -90,7 +50,7 @@
             response_data = response.json()
             logging.info("Submitted %s to VxCage, response was %s" % (os.path.basename(filepath),
                          response_data["message"]))
-            logging.info("Deleting file as it has been uploaded to VxCage")
+            logging.info("Deleting file %s as it has been uploaded to VxCage" % filepath)
             try:
                 os.remove(filepath)
             except:
@@ -99,7 +59,8 @@
             logging.info("Exception caught from VxCage")
 
 
-def submit_cuckoo(filepath):
+# TODO: use response, not filepath
+def upload_cuckoo(filepath):
     if os.path.exists(filepath):
         files = {'file': (os.path.basename(filepath), open(filepath, 'rb'))}
         url = 'http://localhost:8090/tasks/create/file'
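The import grequests added above is what the rest of this diff builds on: requests are created unsent with grequests.get() and then fired concurrently on gevent greenlets with grequests.map(), replacing the old worker-thread queue. A minimal sketch of that pattern, separate from the diff and using placeholder URLs:

import grequests

urls = ['http://example.com/a', 'http://example.com/b']  # placeholder URLs
reqs = [grequests.get(u, timeout=10) for u in urls]      # built, but not sent yet
responses = grequests.map(reqs)                          # sent concurrently; a failed request comes back as None
for r in responses:
    if r is not None and r.status_code == 200:
        print r.url, len(r.content)

The exception_handler() defined in the next hunk has the (request, exception) callback shape that newer grequests releases accept through map()'s exception_handler argument, though this revision does not yet pass it in.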
@@ -112,39 +73,96 @@
-def get_xml_list(feed_url, q):
+def upload_viper(filepath, source_url):
+    if os.path.exists(filepath):
+        files = {'file': (os.path.basename(filepath), open(filepath, 'rb'))}
+        url = 'http://localhost:8080/file/add'
+        headers = {'User-agent': 'Maltrieve'}
+        try:
+            # Note that this request does NOT go through proxies
+            response = requests.post(url, headers=headers, files=files)
+            response_data = response.json()
+            logging.info("Submitted %s to Viper, response was %s" % (os.path.basename(filepath),
+                         response_data["message"]))
+            logging.info("Deleting file as it has been uploaded to Viper")
+            try:
+                os.remove(filepath)
+            except:
+                logging.info("Exception when attempting to delete file: %s", filepath)
+        except:
+            logging.info("Exception caught from Viper")
+
+
+def exception_handler(request, exception):
+    logging.info("Request for %s failed: %s" % (request, exception))
 
-    feed = feedparser.parse(feed_url)
+
+def save_malware(response, directory):
+    url = response.url
+    data = response.content
+    md5 = hashlib.md5(data).hexdigest()
+    logging.info("%s hashes to %s" % (url, md5))
+    if not os.path.isdir(directory):
+        try:
+            os.makedirs(directory)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+    with open(os.path.join(directory, md5), 'wb') as f:
+        f.write(data)
+    logging.info("Saved %s" % md5)
+    return md5
+
+
+def process_xml_list_desc(response):
+    feed = feedparser.parse(response)
+    urls = set()
     for entry in feed.entries:
         desc = entry.description
         url = desc.split(' ')[1].rstrip(',')
+        if url == '':
+            continue
         if url == '-':
             url = desc.split(' ')[4].rstrip(',')
         url = re.sub('&amp;', '&', url)
         if not re.match('http', url):
             url = 'http://' + url
-        push_malware_url(url, q)
+        urls.add(url)
+    return urls
 
 
-def push_malware_url(url, q):
-    url = url.strip()
-    if url not in pasturls:
-        logging.info('Adding new URL to queue: %s', url)
-        pasturls.add(url)
-        q.put(url)
-    else:
-        logging.info('Skipping previously processed URL: %s', url)
+def process_xml_list_title(response):
+    feed = feedparser.parse(response)
+    urls = set([re.sub('&amp;', '&', entry.title) for entry in feed.entries])
+    return urls
+
+
+def process_simple_list(response):
+    urls = set([re.sub('&amp;', '&', line.strip()) for line in response.split('\n') if line.startswith('http')])
+    return urls
+
+
+def process_urlquery(response):
+    soup = BeautifulSoup(response)
+    urls = set()
+    for t in soup.find_all("table", class_="test"):
+        for a in t.find_all("a"):
+            urls.add('http://' + re.sub('&amp;', '&', a.text))
+    return urls
+
+
+def chunker(seq, size):
+    return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))
 
 
 def main():
     global hashes
     hashes = set()
-    global pasturls
-    pasturls = set()
+    past_urls = set()
 
-    malq = Queue()
-    NUMTHREADS = 5
     now = datetime.datetime.now()
 
     parser = argparse.ArgumentParser()
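The chunker() helper added above is what keeps the rewritten download loop from opening every connection at once: the next hunk maps over the pending requests 32 at a time. A standalone illustration of the generator with throwaway values:

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))

for batch in chunker(range(10), 4):
    print batch
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8, 9]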
@@ -221,64 +239,57 @@
     if os.path.exists('urls.json'):
         with open('urls.json', 'rb') as urlfile:
-            pasturls = json.load(urlfile)
+            past_urls = set(json.load(urlfile))
     elif os.path.exists('urls.obj'):
         with open('urls.obj', 'rb') as urlfile:
-            pasturls = pickle.load(urlfile)
-
-    for i in range(NUMTHREADS):
-        worker = Thread(target=get_malware, args=(malq, cfg['dumpdir'],))
-        worker.setDaemon(True)
-        worker.start()
-
-    # TODO: refactor so we're just appending to the queue here
-    get_xml_list('http://www.malwaredomainlist.com/hostslist/mdl.xml', malq)
-    get_xml_list('http://malc0de.com/rss', malq)
-    get_xml_list('http://www.malwareblacklist.com/mbl.xml', malq)
-
-    # TODO: wrap these in functions?
-    for url in requests.get('http://vxvault.siri-urz.net/URL_List.php', proxies=cfg['proxy']).text:
-        if re.match('http', url):
-            push_malware_url(url, malq)
-
-    sacour_text = requests.get('http://www.sacour.cn/list/%d-%d/%d%d%d.htm' %
-                               (now.year, now.month, now.year, now.month,
-                                now.day), proxies=cfg['proxy']).text
-    if sacour_text:
-        sacour_soup = BeautifulSoup(sacour_text)
-        for url in sacour_soup.stripped_strings:
-            if re.match("^http", url):
-                push_malware_url(url, malq)
-
-    urlquery_text = requests.get('http://urlquery.net/', proxies=cfg['proxy']).text
-    if urlquery_text:
-        urlquery_soup = BeautifulSoup(urlquery_text)
-        for t in urlquery_soup.find_all("table", class_="test"):
-            for a in t.find_all("a"):
-                push_malware_url(a['title'], malq)
-
-    # TODO: this doesn't use proxies
-    cleanmx_feed = feedparser.parse('http://support.clean-mx.de/clean-mx/rss?scope=viruses&limit=0%2C64')
-    for entry in cleanmx_feed.entries:
-        push_malware_url(entry.title, malq)
-
-    joxean_text = requests.get('http://malwareurls.joxeankoret.com/normal.txt',
-                               proxies=cfg['proxy']).text
-    joxean_lines = joxean_text.splitlines()
-    for url in joxean_lines:
-        if not re.match("^#", url):
-            push_malware_url(url, malq)
+            past_urls = pickle.load(urlfile)
+
+    source_urls = {'http://www.malwaredomainlist.com/hostslist/mdl.xml': process_xml_list_desc,
+                   'http://malc0de.com/rss/': process_xml_list_desc,
+                   # 'http://www.malwareblacklist.com/mbl.xml',  # removed for now
+                   'http://vxvault.siri-urz.net/URL_List.php': process_simple_list,
+                   'http://urlquery.net/': process_urlquery,
+                   'http://support.clean-mx.de/clean-mx/rss?scope=viruses&limit=0%2C64': process_xml_list_title,
+                   'http://malwareurls.joxeankoret.com/normal.txt': process_simple_list}
+    headers = {'User-Agent': 'maltrieve'}
+
+    reqs = [grequests.get(url, timeout=60, headers=headers, proxies=cfg['proxy']) for url in source_urls]
+    source_lists = grequests.map(reqs)
+
+    print "Completed source processing"
 
     cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage')
     cfg['cuckoo'] = args.cuckoo or config.has_option('Maltrieve', 'cuckoo')
     cfg['logheaders'] = config.get('Maltrieve', 'logheaders')
 
-    malq.join()
-
-    if pasturls:
+    malware_urls = set()
+    for response in source_lists:
+        if hasattr(response, 'status_code') and response.status_code == 200:
+            malware_urls.update(source_urls[response.url](response.text))
+
+    malware_urls -= past_urls
+    reqs = [grequests.get(url, headers=headers, proxies=cfg['proxy']) for url in malware_urls]
+    for chunk in chunker(reqs, 32):
+        malware_downloads = grequests.map(chunk)
+        for each in malware_downloads:
+            if not each or each.status_code != 200:
+                continue
+            md5 = save_malware(each, cfg['dumpdir'])
+            if 'vxcage' in cfg:
+                upload_vxcage(os.path.join(cfg['dumpdir'], md5))
+            if 'cuckoo' in cfg:
+                upload_cuckoo(os.path.join(cfg['dumpdir'], md5))
+            if 'viper' in cfg:
+                upload_viper(os.path.join(cfg['dumpdir'], md5), each.url)
+            past_urls.add(each.url)
+
+    print "Completed downloads"
+
+    if past_urls:
         logging.info('Dumping past URLs to file')
         with open('urls.json', 'w') as urlfile:
-            json.dump(pasturls, urlfile)
+            json.dump(list(past_urls), urlfile)
 
     if hashes:
         with open('hashes.json', 'w') as hashfile:
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..412371e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+argparse==1.2.1
+beautifulsoup4==4.3.2
+feedparser==5.1.3
+gevent==1.0.1
+greenlet==0.4.2
+grequests==0.2.0
+requests==2.3.0
+wsgiref==0.1.2
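The requirements.txt added above pins the libraries the rewrite depends on (gevent and greenlet are what grequests runs on), typically installed with pip install -r requirements.txt. The rewritten main() itself is driven by a dispatch table: each source URL maps to the parser that understands its format, and the parser is looked up by the URL of the response that comes back. A minimal sketch of that lookup in isolation, assuming process_simple_list from this diff is importable and using a made-up feed body:

# made-up body of a fetched source list, for illustration only
fake_body = "http://example.com/evil.exe\nhttp://example.org/bad.dll\n# a comment line\n"

source_urls = {'http://vxvault.siri-urz.net/URL_List.php': process_simple_list}

# main() uses response.url as the key; here it is hard-coded
handler = source_urls['http://vxvault.siri-urz.net/URL_List.php']
print handler(fake_body)  # a set holding the two http URLs, order not guaranteed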