From 86f6caf91ebf1d4d4afec047054f8ef444d51c42 Mon Sep 17 00:00:00 2001 From: george Date: Mon, 22 Dec 2014 18:24:48 +0100 Subject: [PATCH 01/97] modifying maltrieve to allow it to upload source and domain to Crits --- maltrieve.cfg | 5 ++++- maltrieve.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/maltrieve.cfg b/maltrieve.cfg index add59cf..fd85165 100644 --- a/maltrieve.cfg +++ b/maltrieve.cfg @@ -7,7 +7,10 @@ User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 #vxcage = http://127.0.0.1:8080 - +#crits = http://127.0.0.1:8000 +#crits_user = user +#crits_key = +#crits_source = maltrieve # Filter Lists are based on mime type NO SPACE BETWEEN , #black_list = text/html,text/plain diff --git a/maltrieve.py b/maltrieve.py index fb6dca9..5dd6d18 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -38,6 +38,41 @@ from threading import Thread from Queue import Queue from bs4 import BeautifulSoup +from pycrits import pycrits + +def upload_crits(response, md5): + if response: + url_tag = urlparse(response.url) + files = {'file': (md5, response.content)} + url = "{0}/api/v1/samples".format(config.get('Maltrieve', 'crits')) + headers = {'User-agent': 'Maltrieve'} + + ip_data = { + 'api_key': cfg['crits_key'], + 'username': cfg['crits_user'], + 'source': cfg['crits_source'], + 'domain': url_tag.netloc + } + + # submit domain / IP + try: + # Note that this request does NOT go through proxies + response = requests.post(url, headers=headers, data=ip_data) + response_data = response.json() + logging.info("Submitted domain info for %s to Crits, response was %s" % (md5, + response_data["message"])) + except: + logging.info("Exception caught from Crits") + + # submit sample + try: + # Note that this request does NOT go through proxies + response = requests.post(url, headers=headers, files=files, data=tags) + response_data = response.json() + logging.info("Submitted sample info for %s to Crits, response was %s" % (md5, + response_data["message"])) + except: + logging.info("Exception caught from Crits") def upload_vxcage(response, md5): @@ -121,6 +156,9 @@ def save_malware(response, directory, black_list, white_list): if cfg['viper']: upload_viper(response, md5) stored = True + if cfg['crits']: + upload_crits(response, md5) + stored = True # else save to disk if not stored: with open(os.path.join(directory, md5), 'wb') as f: @@ -186,14 +224,18 @@ def main(): help="Define dump directory for retrieved files") parser.add_argument("-l", "--logfile", help="Define file for logging progress") - parser.add_argument("-x", "--vxcage", - help="Dump the file to a VxCage instance", + parser.add_arguement("-r", "--crits", + help="Dump the file to a Crits instance.", action="store_true", default=False) parser.add_argument("-v", "--viper", help="Dump the file to a Viper instance", action="store_true", default=False) + parser.add_argument("-x", "--vxcage", + help="Dump the file to a VxCage instance", + action="store_true", default=False) parser.add_argument("-c", "--cuckoo", - help="Enable cuckoo analysis", action="store_true", default=False) + help="Enable cuckoo analysis", + action="store_true", default=False) global cfg cfg = dict() @@ -236,7 +278,14 @@ def main(): cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage') cfg['cuckoo'] = args.cuckoo or config.has_option('Maltrieve', 'cuckoo') cfg['viper'] = args.viper or config.has_option('Maltrieve', 'viper') - cfg['logheaders'] = config.get('Maltrieve', 'logheaders') + cfg['logheaders'] = config.get('Maltrieve', 'logheaders') + + # See if crits is configured. If so add config options for User/API + cfg['crits'] = args.crits or config.has_option('Maltrieve', 'crits') + if cfg['crits']: + cfg['crits_user'] = config.get('Maltrieve', 'crits_user') + cfg['crits_key'] = config.get('Maltrieve', 'crits_key') + cfg['crits_source'] = config.get('Maltrieve', 'crits_source') black_list = [] if config.has_option('Maltrieve', 'black_list'): From 39fe87c4671a62ec24d8bf6ff15c9bafd769e197 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 6 Jan 2015 20:18:00 -0600 Subject: [PATCH 02/97] Set open file descriptor limits to fix #66 --- maltrieve.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/maltrieve.py b/maltrieve.py index fb6dca9..16f01ba 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -29,6 +29,7 @@ import pickle import re import requests +import resource import tempfile import sys import ConfigParser @@ -173,6 +174,8 @@ def chunker(seq, size): def main(): + resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 2048)) + global hashes hashes = set() past_urls = set() From 1f2648a832aeed2873860d63818aa2cf0be80a75 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 6 Jan 2015 20:19:09 -0600 Subject: [PATCH 03/97] Minor PEP8 whitespace fixes --- maltrieve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 16f01ba..51cfea7 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -45,7 +45,7 @@ def upload_vxcage(response, md5): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} - tags = {'tags':url_tag.netloc + ',Maltrieve'} + tags = {'tags': url_tag.netloc + ',Maltrieve'} url = "{0}/malware/add".format(config.get('Maltrieve', 'vxcage')) headers = {'User-agent': 'Maltrieve'} try: @@ -76,7 +76,7 @@ def upload_viper(response, md5): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} - tags = {'tags':url_tag.netloc + ',Maltrieve'} + tags = {'tags': url_tag.netloc + ',Maltrieve'} url = "{0}/file/add".format(config.get('Maltrieve', 'viper')) headers = {'User-agent': 'Maltrieve'} try: @@ -175,7 +175,7 @@ def chunker(seq, size): def main(): resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 2048)) - + global hashes hashes = set() past_urls = set() From 804842e5d7dfeffa87d6817296cec3bc21be72b2 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 9 Feb 2015 20:15:06 -0600 Subject: [PATCH 04/97] Convert old-style format strings to Python 3 style --- maltrieve.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index f1edaa3..56478f4 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -51,8 +51,7 @@ def upload_vxcage(response, md5): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted %s to VxCage, response was %s" % (md5, - response_data["message"])) + logging.info("Submitted {md5} to VxCage, response was {msg}".format(md5=md5, msg=response_data["message"])) except: logging.info("Exception caught from VxCage") @@ -66,7 +65,7 @@ def upload_cuckoo(response, md5): #try: response = requests.post(url, headers=headers, data=data) response_data = response.json() - logging.info("Submitted %s to Cuckoo, task ID %s", md5, response_data["task_id"]) + logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) #except: #logging.info("Exception caught from Cuckoo") @@ -82,8 +81,7 @@ def upload_viper(response, md5): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted %s to Viper, response was %s" % (md5, - response_data["message"])) + logging.info("Submitted {md5} to Viper, response was {msg}".format(md5=md5, msg=response_data["message"])) except: logging.info("Exception caught from Viper") @@ -97,18 +95,18 @@ def save_malware(response, directory, black_list, white_list): data = response.content mime_type = magic.from_buffer(data, mime=True) if mime_type in black_list: - logging.info('%s in ignore list for %s', mime_type, url) + logging.info('{mtype} in ignore list for {url}'.format(mtype=mime_type, url=url)) return if white_list: if mime_type in white_list: pass else: - logging.info('%s not in whitelist for %s', mime_type, url) + logging.info('{mtype} not in whitelist for {url}'.format(mtype=mime_type, url=url)) return # Hash and log md5 = hashlib.md5(data).hexdigest() - logging.info("%s hashes to %s" % (url, md5)) + logging.info("{url} hashes to {md5}".format(url=url, md5=md5)) # Assume that if viper or vxcage then we dont need to write to file as well. stored = False @@ -133,7 +131,7 @@ def save_malware(response, directory, black_list, white_list): store_path = os.path.join(directory, md5) with open(store_path, 'wb') as f: f.write(data) - logging.info("Saved %s to dump dir" % md5) + logging.info("Saved {md5} to dump dir".format(md5=md5)) return True @@ -243,8 +241,8 @@ def main(): if cfg['proxy']: logging.info('Using proxy %s', cfg['proxy']) my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg['proxy']).text - logging.info('External sites see %s', my_ip) - print "External sites see %s" % my_ip + logging.info('External sites see {ip}'.format(ip=my_ip)) + print 'External sites see {ip}'.format(ip=my_ip) cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage') cfg['cuckoo'] = args.cuckoo or config.has_option('Maltrieve', 'cuckoo') @@ -274,13 +272,12 @@ def main(): try: d = tempfile.mkdtemp(dir=cfg['dumpdir']) except Exception as e: - logging.error('Could not open %s for writing (%s), using default', - cfg['dumpdir'], e) + logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=cfg['dumpdir'], exception=e)) cfg['dumpdir'] = '/tmp/malware' else: os.rmdir(d) - logging.info('Using %s as dump directory', cfg['dumpdir']) + logging.info('Using {dir} as dump directory'.format(dir=cfg['dumpdir'])) if os.path.exists('hashes.json'): with open('hashes.json', 'rb') as hashfile: From aa40915bad49c53cda3e842b6b08d335a53c46a7 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Mon, 9 Feb 2015 20:15:32 -0600 Subject: [PATCH 05/97] Remove unused (?) function --- maltrieve.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 56478f4..6da5338 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -86,10 +86,6 @@ def upload_viper(response, md5): logging.info("Exception caught from Viper") -def exception_handler(request, exception): - logging.info("Request for %s failed: %s" % (request, exception)) - - def save_malware(response, directory, black_list, white_list): url = response.url data = response.content From 4abd1bebb952e46afedd73094adb7ee403d563c8 Mon Sep 17 00:00:00 2001 From: webstergd Date: Sun, 15 Feb 2015 18:57:31 +0100 Subject: [PATCH 06/97] bug fixes and updated to a working version --- maltrieve.cfg | 4 +-- maltrieve.py | 91 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/maltrieve.cfg b/maltrieve.cfg index fd85165..f4703df 100644 --- a/maltrieve.cfg +++ b/maltrieve.cfg @@ -7,8 +7,8 @@ User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 #vxcage = http://127.0.0.1:8080 -#crits = http://127.0.0.1:8000 -#crits_user = user +#crits = https://127.0.0.1 +#crits_user = maltrieve #crits_key = #crits_source = maltrieve diff --git a/maltrieve.py b/maltrieve.py index 5dd6d18..711357a 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -38,48 +38,93 @@ from threading import Thread from Queue import Queue from bs4 import BeautifulSoup -from pycrits import pycrits -def upload_crits(response, md5): +def upload_crits(response, md5, mime_type): if response: url_tag = urlparse(response.url) - files = {'file': (md5, response.content)} - url = "{0}/api/v1/samples".format(config.get('Maltrieve', 'crits')) + files = {'filedata': (md5, response.content)} headers = {'User-agent': 'Maltrieve'} - - ip_data = { + zip_files = ['application/zip', 'application/gzip', 'application/x-7z-compressed'] + rar_files = ['application/x-rar-compressed'] + domain_response_data = False + sample_response_data = False + + # submit domain / IP + # TODO: identify if it is a domain or IP and submit accordingly + url = "{0}/api/v1/domains/".format(config.get('Maltrieve', 'crits')) + domain_data = { 'api_key': cfg['crits_key'], 'username': cfg['crits_user'], 'source': cfg['crits_source'], 'domain': url_tag.netloc } - - # submit domain / IP try: # Note that this request does NOT go through proxies - response = requests.post(url, headers=headers, data=ip_data) - response_data = response.json() + domain_response = requests.post(url, headers=headers, data=domain_data, verify=False) + domain_response_data = domain_response.json() logging.info("Submitted domain info for %s to Crits, response was %s" % (md5, - response_data["message"])) + domain_response_data["message"])) except: - logging.info("Exception caught from Crits") - - # submit sample + logging.info("Exception caught from Crits when submitting domain") + + # Submit sample + url = "{0}/api/v1/samples/".format(config.get('Maltrieve', 'crits')) + if mime_type in zip_files: + file_type = 'zip' + elif mime_type in rar_files: + file_type = 'rar' + else: + file_type = 'raw' + sample_data = { + 'api_key': cfg['crits_key'], + 'username': cfg['crits_user'], + 'source': cfg['crits_source'], + 'upload_type': 'file', + 'md5': md5, + 'file_format': file_type # must be type zip, rar, or raw + } try: # Note that this request does NOT go through proxies - response = requests.post(url, headers=headers, files=files, data=tags) - response_data = response.json() + sample_response = requests.post(url, headers=headers, files=files, data=sample_data, verify=False) + sample_response_data = sample_response.json() logging.info("Submitted sample info for %s to Crits, response was %s" % (md5, - response_data["message"])) + sample_response_data["message"])) except: - logging.info("Exception caught from Crits") + logging.info("Exception caught from Crits when submitting sample") + + # Create a relationship for the sample and domain + url = "{0}/api/v1/relationships/".format(config.get('Maltrieve', 'crits')) + if (sample_response_data['return_code'] == 0 and + domain_response_data['return_code'] == 0): + relationship_data = { + 'api_key': cfg['crits_key'], + 'username': cfg['crits_user'], + 'source': cfg['crits_source'], + 'right_type': domain_response_data['type'], + 'right_id': domain_response_data['id'], + 'left_type': sample_response_data['type'], + 'left_id': sample_response_data['id'], + 'rel_type': 'Downloaded_From', + 'rel_confidence': 'high', + 'rel_date': datetime.datetime.now() + } + try: + # Note that this request does NOT go through proxies + relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) + relationship_response_data = relationship_response.json() + logging.info("Submitted relationship info for %s to Crits, response was %s" % (md5, + relationship_response_data["message"])) + except: + logging.info("Exception caught from Crits when submitting relationship") + else: + logging.info("Relationship submission skipped. \n Domain message was %s\n Sample message was %s" % (domain_response_data["message"], sample_response_data["message"]) def upload_vxcage(response, md5): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} - tags = {'tags':url_tag.netloc + ',Maltrieve'} + tags = {'tags': url_tag.netloc + ',Maltrieve'} url = "{0}/malware/add".format(config.get('Maltrieve', 'vxcage')) headers = {'User-agent': 'Maltrieve'} try: @@ -110,7 +155,7 @@ def upload_viper(response, md5): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} - tags = {'tags':url_tag.netloc + ',Maltrieve'} + tags = {'tags': url_tag.netloc + ',Maltrieve'} url = "{0}/file/add".format(config.get('Maltrieve', 'viper')) headers = {'User-agent': 'Maltrieve'} try: @@ -157,7 +202,7 @@ def save_malware(response, directory, black_list, white_list): upload_viper(response, md5) stored = True if cfg['crits']: - upload_crits(response, md5) + upload_crits(response, md5, mime_type) stored = True # else save to disk if not stored: @@ -224,7 +269,7 @@ def main(): help="Define dump directory for retrieved files") parser.add_argument("-l", "--logfile", help="Define file for logging progress") - parser.add_arguement("-r", "--crits", + parser.add_argument("-r", "--crits", help="Dump the file to a Crits instance.", action="store_true", default=False) parser.add_argument("-v", "--viper", @@ -272,7 +317,7 @@ def main(): if cfg['proxy']: logging.info('Using proxy %s', cfg['proxy']) - my_ip = requests.get('http://whatthehellismyip.com/?ipraw').text + my_ip = requests.get('http://api.ipify.org', proxies=cfg['proxy']).text logging.info('External sites see %s', my_ip) cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage') From 72529da049466821384e49fcb3feeaa2908666e1 Mon Sep 17 00:00:00 2001 From: webstergd Date: Mon, 2 Mar 2015 20:13:54 +0100 Subject: [PATCH 07/97] quick fix: forgot ) at end of logging function. --- maltrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index eaef160..c84d37f 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -117,7 +117,7 @@ def upload_crits(response, md5, mime_type): except: logging.info("Exception caught from Crits when submitting relationship") else: - logging.info("Relationship submission skipped. \n Domain message was %s\n Sample message was %s" % (domain_response_data["message"], sample_response_data["message"]) + logging.info("Relationship submission skipped. \n Domain message was %s\n Sample message was %s" % (domain_response_data["message"], sample_response_data["message"])) def upload_vxcage(response, md5): From d2e114001af0bcc9a08ec9b26081bc38d1441811 Mon Sep 17 00:00:00 2001 From: webstergd Date: Tue, 3 Mar 2015 00:03:57 +0100 Subject: [PATCH 08/97] adjusted status code checks. --- maltrieve.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index c84d37f..b1977f5 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -46,8 +46,6 @@ def upload_crits(response, md5, mime_type): headers = {'User-agent': 'Maltrieve'} zip_files = ['application/zip', 'application/gzip', 'application/x-7z-compressed'] rar_files = ['application/x-rar-compressed'] - domain_response_data = False - sample_response_data = False # submit domain / IP # TODO: identify if it is a domain or IP and submit accordingly @@ -61,9 +59,10 @@ def upload_crits(response, md5, mime_type): try: # Note that this request does NOT go through proxies domain_response = requests.post(url, headers=headers, data=domain_data, verify=False) - domain_response_data = domain_response.json() - logging.info("Submitted domain info for %s to Crits, response was %s" % (md5, - domain_response_data["message"])) + if domain_response.status_code == requests.codes.ok: + domain_response_data = domain_response.json() + logging.info("Submitted domain info for %s to Crits, response was %s" % (md5, + domain_response_data["message"])) except: logging.info("Exception caught from Crits when submitting domain") @@ -86,16 +85,17 @@ def upload_crits(response, md5, mime_type): try: # Note that this request does NOT go through proxies sample_response = requests.post(url, headers=headers, files=files, data=sample_data, verify=False) - sample_response_data = sample_response.json() - logging.info("Submitted sample info for %s to Crits, response was %s" % (md5, + if sample_response.status_code == requests.codes.ok: + sample_response_data = sample_response.json() + logging.info("Submitted sample info for %s to Crits, response was %s" % (md5, sample_response_data["message"])) except: logging.info("Exception caught from Crits when submitting sample") # Create a relationship for the sample and domain url = "{0}/api/v1/relationships/".format(config.get('Maltrieve', 'crits')) - if (sample_response_data['return_code'] == 0 and - domain_response_data['return_code'] == 0): + if (domain_response.status_code == requests.codes.ok and + sample_response.status_code == requests.codes.ok): relationship_data = { 'api_key': cfg['crits_key'], 'username': cfg['crits_user'], @@ -111,13 +111,14 @@ def upload_crits(response, md5, mime_type): try: # Note that this request does NOT go through proxies relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) - relationship_response_data = relationship_response.json() - logging.info("Submitted relationship info for %s to Crits, response was %s" % (md5, - relationship_response_data["message"])) + if relationship_response.status_code == requests.codes.ok: + relationship_response_data = relationship_response.json() + logging.info("Submitted relationship info for %s to Crits, response was %s" % (md5, + relationship_response_data["message"])) except: logging.info("Exception caught from Crits when submitting relationship") else: - logging.info("Relationship submission skipped. \n Domain message was %s\n Sample message was %s" % (domain_response_data["message"], sample_response_data["message"])) + logging.info("Relationship submission skipped. \n Domain was %s\n Sample response was %s\n Domain response was %s\n" % (url_tag.netloc, sample_response.status_code, domain_response.status_code)) def upload_vxcage(response, md5): From fc24b771d7b7083378be000278a7d61b2b1e834d Mon Sep 17 00:00:00 2001 From: webstergd Date: Sun, 8 Mar 2015 20:00:31 +0100 Subject: [PATCH 09/97] Adjusted status checks to ensure that a connection was successfully made to Crits and that it processed the information. --- maltrieve.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index b1977f5..fe87b5f 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -46,6 +46,8 @@ def upload_crits(response, md5, mime_type): headers = {'User-agent': 'Maltrieve'} zip_files = ['application/zip', 'application/gzip', 'application/x-7z-compressed'] rar_files = ['application/x-rar-compressed'] + inserted_domain = False + inserted_sample = False # submit domain / IP # TODO: identify if it is a domain or IP and submit accordingly @@ -63,6 +65,8 @@ def upload_crits(response, md5, mime_type): domain_response_data = domain_response.json() logging.info("Submitted domain info for %s to Crits, response was %s" % (md5, domain_response_data["message"])) + if domain_response_data['return_code'] == 0: + inserted_domain = True except: logging.info("Exception caught from Crits when submitting domain") @@ -89,13 +93,14 @@ def upload_crits(response, md5, mime_type): sample_response_data = sample_response.json() logging.info("Submitted sample info for %s to Crits, response was %s" % (md5, sample_response_data["message"])) + if sample_response_data['return_code'] == 0: + inserted_sample = True except: logging.info("Exception caught from Crits when submitting sample") # Create a relationship for the sample and domain url = "{0}/api/v1/relationships/".format(config.get('Maltrieve', 'crits')) - if (domain_response.status_code == requests.codes.ok and - sample_response.status_code == requests.codes.ok): + if (inserted_sample and inserted_domain): relationship_data = { 'api_key': cfg['crits_key'], 'username': cfg['crits_user'], From fcf2f6ba1f30d271f2aeed9def0faf49defbf32b Mon Sep 17 00:00:00 2001 From: george Date: Fri, 13 Mar 2015 13:25:49 +0100 Subject: [PATCH 10/97] Error handling relationship does not call variable before assignment now. --- maltrieve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index fe87b5f..36cb559 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -121,9 +121,9 @@ def upload_crits(response, md5, mime_type): logging.info("Submitted relationship info for %s to Crits, response was %s" % (md5, relationship_response_data["message"])) except: - logging.info("Exception caught from Crits when submitting relationship") + logging.info("Relationship submission skipped. \n Domain was %s\n Sample response was %s\n Domain response was %s\n" % (url_tag.netloc, sample_response.status_code, domain_response.status_code)) else: - logging.info("Relationship submission skipped. \n Domain was %s\n Sample response was %s\n Domain response was %s\n" % (url_tag.netloc, sample_response.status_code, domain_response.status_code)) + logging.info("Skipping adding relationship. CRITs could not process domain or sample.") def upload_vxcage(response, md5): From aa6a1c1c5b7a4553654323d617fe08f06c9ad5c8 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Sun, 15 Mar 2015 22:10:04 -0500 Subject: [PATCH 11/97] Update based on v0.6 setuptools change --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0f38d6e..793bad7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -36,12 +36,12 @@ RUN apt-get update && \ WORKDIR /home RUN git clone https://github.com/technoskald/maltrieve.git && \ cd maltrieve && \ - pip install -r requirements.txt && \ + python setup.py install && \ chown -R maltrieve:maltrieve /home/maltrieve USER maltrieve ENV HOME /home/maltrieve ENV USER maltrieve WORKDIR /home/maltrieve -CMD ["./maltrieve.py"] +CMD ["maltrieve"] From 5b364e069b2d42cf6e6edb8e25ca6ca0fb1f9d0f Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Sun, 15 Mar 2015 22:22:12 -0500 Subject: [PATCH 12/97] Upgrade system files and pip --- docker/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 793bad7..02346ff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -19,17 +19,17 @@ MAINTAINER Michael Boman USER root RUN apt-get update && \ - apt-get install -y --no-install-recommends \ + apt-get dist-upgrade -y +RUN apt-get install -y --no-install-recommends \ gcc \ git \ libpython2.7-stdlib \ python2.7 \ python2.7-dev \ python-pip \ - python-setuptools && \ - - rm -rf /var/lib/apt/lists/* && \ - + python-setuptools +RUN rm -rf /var/lib/apt/lists/* && \ + pip install --upgrade pip && \ groupadd -r maltrieve && \ useradd -r -g maltrieve -d /home/maltrieve -s /sbin/nologin -c "Maltrieve User" maltrieve From 339e69a8d2d0161286ac93e593d230418a9b11b1 Mon Sep 17 00:00:00 2001 From: webstergd Date: Tue, 17 Mar 2015 12:49:57 +0100 Subject: [PATCH 13/97] Added comment in readme file for CRITs. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d4348ea..59c1dad 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ optional arguments: Define file for logging progress -x, --vxcage Dump the files to a VxCage instance -v, --viper Dump the files to a Viper instance + -r, --crits Dump the file and domain to a CRITs instance -c, --cuckoo Enable Cuckoo analysis -s, --sort_mime Sort files by MIME type From 52b824630bbafd09363695c0a03eb33ce2e4ef43 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 17 Mar 2015 15:22:27 -0500 Subject: [PATCH 14/97] Remove unused imports --- maltrieve.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 6bb62bf..5f4b62e 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -35,8 +35,6 @@ import magic from urlparse import urlparse -from threading import Thread -from Queue import Queue from bs4 import BeautifulSoup From 0b156eaf715025197e6ce2d8ff5ffb16091b70b4 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:01:39 -0500 Subject: [PATCH 15/97] Remove unused function --- maltrieve.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 5f4b62e..3dd45bf 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -86,10 +86,6 @@ def upload_viper(response, md5): logging.info("Exception caught from Viper") -def exception_handler(request, exception): - logging.info("Request for %s failed: %s" % (request, exception)) - - def save_malware(response, directory, black_list, white_list): url = response.url data = response.content From 435074e53f57a31f2460889a95eb622acb97b870 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:06:47 -0500 Subject: [PATCH 16/97] Name format vars for repo servers --- maltrieve.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 3517954..5dbef83 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -45,7 +45,7 @@ def upload_vxcage(response, md5): url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{0}/malware/add".format(config.get('Maltrieve', 'vxcage')) + url = "{srv}/malware/add".format(srv=config.get('Maltrieve', 'vxcage')) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies @@ -60,7 +60,7 @@ def upload_vxcage(response, md5): def upload_cuckoo(response, md5): if response: data = {'url': response.url} - url = "{0}/tasks/create/url".format(config.get('Maltrieve', 'cuckoo')) + url = "{srv}/tasks/create/url".format(srv=config.get('Maltrieve', 'cuckoo')) headers = {'User-agent': 'Maltrieve'} #try: response = requests.post(url, headers=headers, data=data) @@ -75,7 +75,7 @@ def upload_viper(response, md5): url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{0}/file/add".format(config.get('Maltrieve', 'viper')) + url = "{srv}/file/add".format(srv=config.get('Maltrieve', 'viper')) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies From a6db1f95c8b82a186210e6db60c1ffefc4187dff Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:07:36 -0500 Subject: [PATCH 17/97] Remove unused variable --- maltrieve.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 920abf9..956a308 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -177,8 +177,6 @@ def main(): hashes = set() past_urls = set() - now = datetime.datetime.now() - parser = argparse.ArgumentParser() parser.add_argument("-p", "--proxy", help="Define HTTP proxy as address:port") From 5253d85394e71b2a4be15c83197523695357c617 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:15:03 -0500 Subject: [PATCH 18/97] Remove declaration of hashes as a global --- maltrieve.py | 1 - 1 file changed, 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 956a308..44e1f9b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -173,7 +173,6 @@ def chunker(seq, size): def main(): - global hashes hashes = set() past_urls = set() From 91fb9ebacd625d154eeba0a2647052eb74b92d39 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:15:42 -0500 Subject: [PATCH 19/97] Remove another unused import --- maltrieve.py | 1 - 1 file changed, 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 44e1f9b..9fa18c4 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -19,7 +19,6 @@ # along with this program. If not, see Date: Thu, 19 Mar 2015 14:26:17 -0500 Subject: [PATCH 20/97] Update sources list to fix #101 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4348ea..ed4d3ce 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mwcrawler). It retrieves malware directly from the sources as listed at a number of sites. Currently we crawl the following: * [Malc0de](http://malc0de.com/rss) -* [Malware Black List](http://www.malwareblacklist.com/mbl.xml) * [Malware Domain List](http://www.malwaredomainlist.com/hostslist/mdl.xml) +* [Malware URLs](http://malwareurls.joxeankoret.com/normal.txt) * [VX Vault](http://vxvault.siri-urz.net/URL_List.php) * [URLquery](http://urlquery.net/) * [CleanMX](http://support.clean-mx.de/clean-mx/xmlviruses.php?) @@ -88,4 +88,4 @@ Aside from pull requests, non-developers can open issues on [Github](https://git * Suggestions of additional sources for malware lists * Descriptions of how you use it and ways we can improve it for you -Check the [contributing guide](./CONTRIBUTING.md) for details. +Check the [contributing guide](./CONTRIBUTING.md) for details. From fa7db85bad7abe971447a78ce796f6eb327d6917 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:29:39 -0500 Subject: [PATCH 21/97] Add MBL to backlog --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed4d3ce..2a8c4db 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mw These lists will be implemented if/when they return to activity. +* [Malware Blacklist](http://www.malwareblacklist.com/showMDL.php) * [NovCon Minotaur](http://minotauranalysis.com/malwarelist-urls.aspx) Other improvements include: @@ -41,7 +42,7 @@ Maltrieve requires the following dependencies: * [python-magic](https://pypi.python.org/pypi/python-magic/) * [Requests](http://www.python-requests.org) -With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just `pip install .` or `python setup.py install`. You may need to prepend that with ```sudo``` if not running in a virtual environment, but using such an environment is highly encouraged. +With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just `python setup.py install` or `pip install .` (preferably the former). That command may need to be prepended with ```sudo``` if not running in a virtual environment, but we highly encourage using such an environment. ## Usage From 5bc2a270a1c09ade5438bc429a1cda360c50f884 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:33:33 -0500 Subject: [PATCH 22/97] Update URLs --- docker/Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 02346ff..341ade6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,8 @@ # # This Docker image encapsulates Maltrieve, a tool to retrieve malware # directly from the source for security researchers. -# which was created by Kyle Maxwell (technoskald) and is -# available at https://github.com/technoskald/maltrieve. +# which was created by Kyle Maxwell (krmaxwell) and is +# available at https://github.com/krmaxwell/maltrieve. # # The file below is based on ideas from Spenser Reinhardt's Dockerfile # (https://registry.hub.docker.com/u/sreinhardt/honeynet/dockerfile) @@ -12,7 +12,7 @@ # # sudo docker run --rm -it technoskald/maltrieve bash # -# then run ./maltrieve.py with the desired parameters. +# then run maltrieve.py with the desired parameters. FROM ubuntu:14.04 MAINTAINER Michael Boman @@ -34,7 +34,7 @@ RUN rm -rf /var/lib/apt/lists/* && \ useradd -r -g maltrieve -d /home/maltrieve -s /sbin/nologin -c "Maltrieve User" maltrieve WORKDIR /home -RUN git clone https://github.com/technoskald/maltrieve.git && \ +RUN git clone https://github.com/krmaxwell/maltrieve.git && \ cd maltrieve && \ python setup.py install && \ chown -R maltrieve:maltrieve /home/maltrieve @@ -44,4 +44,3 @@ ENV HOME /home/maltrieve ENV USER maltrieve WORKDIR /home/maltrieve CMD ["maltrieve"] - From 5db8762e9669488e0bca546b0c558c7103085cec Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:38:20 -0500 Subject: [PATCH 23/97] Add 'In Progress' badge --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8b5e23d..e1c423c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ [![Stories in Ready](https://badge.waffle.io/krmaxwell/maltrieve.png?label=ready&title=Ready)](https://waffle.io/krmaxwell/maltrieve) +[![Stories in In Progress](https://badge.waffle.io/krmaxwell/maltrieve.png?label=in%20progress&title=In%20Progress)](https://waffle.io/krmaxwell/maltrieve) ``` _______ _______ _______ ______ _____ _______ _ _ _______ | | | |_____| | | |_____/ | |______ \ / |______ @@ -89,4 +90,4 @@ Aside from pull requests, non-developers can open issues on [Github](https://git * Suggestions of additional sources for malware lists * Descriptions of how you use it and ways we can improve it for you -Check the [contributing guide](./CONTRIBUTING.md) for details. +Check the [contributing guide](./CONTRIBUTING.md) for details. From b0476918d8af636a2a638b8f4d3e3d9f03b306d4 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 14:43:31 -0500 Subject: [PATCH 24/97] Add link to Docker Hub --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4348ea..ef73284 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ Maltrieve requires the following dependencies: With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just `pip install .` or `python setup.py install`. You may need to prepend that with ```sudo``` if not running in a virtual environment, but using such an environment is highly encouraged. +Alternately, avoid all of that by using the [Docker image](https://registry.hub.docker.com/u/technoskald/maltrieve/) + ## Usage __Basic execution:__ `maltrieve` (if installed normally) or ```python maltrieve.py``` (if just downloaded and run) @@ -88,4 +90,4 @@ Aside from pull requests, non-developers can open issues on [Github](https://git * Suggestions of additional sources for malware lists * Descriptions of how you use it and ways we can improve it for you -Check the [contributing guide](./CONTRIBUTING.md) for details. +Check the [contributing guide](./CONTRIBUTING.md) for details. From 6276610218fd9b96091372009f4dba8f038514d4 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 17:02:42 -0500 Subject: [PATCH 25/97] Implement class for config info --- maltrieve.py | 145 ++++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 71 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 9fa18c4..73b097c 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -37,6 +37,76 @@ from bs4 import BeautifulSoup +class config: + + def __init__(self, args, filename='maltrieve.cfg'): + self.configp = ConfigParser.ConfigParser() + self.configp.read(filename) + + if args.logfile or self.configp.get('Maltrieve', 'logfile'): + if args.logfile: + self.logfile = args.logfile + else: + self.logfile = self.configp.get('Maltrieve', 'logfile') + logging.basicConfig(filename=self.logfile, level=logging.DEBUG, + format='%(asctime)s %(thread)d %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + else: + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(thread)d %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + if args.proxy: + self.proxy = {'http': args.proxy} + elif self.configp.has_option('Maltrieve', 'proxy'): + self.proxy = {'http': self.configp.get('Maltrieve', 'proxy')} + else: + self.proxy = None + + if self.configp.has_option('Maltrieve', 'User-Agent'): + self.useragent = {'User-Agent': self.configp.get('Maltrieve', 'User-Agent')} + else: + self.useragent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)" + + self.sort_mime = args.sort_mime + + if self.configp.has_option('Maltrieve', 'black_list'): + self.black_list = self.configp.get('Maltrieve', 'black_list').strip().split(',') + else: + self.black_list = [] + + if self.configp.has_option('Maltrieve', 'white_list'): + self.white_list = self.configp.get('Maltrieve', 'white_list').strip().split(',') + else: + self.white_list = False + + # make sure we can open the directory for writing + if args.dumpdir: + self.dumpdir = args.dumpdir + elif self.configp.get('Maltrieve', 'dumpdir'): + self.dumpdir = self.configp.get('Maltrieve', 'dumpdir') + else: + self.dumpdir = '/tmp/malware' + + # Create the dir + if not os.path.exists(self.dumpdir): + os.makedirs(self.dumpdir) + + try: + d = tempfile.mkdtemp(dir=self.dumpdir) + except Exception as e: + logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=self.dumpdir, exception=e)) + self.dumpdir = '/tmp/malware' + else: + os.rmdir(d) + + logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + + self.vxcage = args.vxcage or self.configp.has_option('Maltrieve', 'vxcage') + self.cuckoo = args.cuckoo or self.configp.has_option('Maltrieve', 'cuckoo') + self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') + self.logheaders = self.configp.get('Maltrieve', 'logheaders') + + def upload_vxcage(response, md5): if response: url_tag = urlparse(response.url) @@ -193,40 +263,8 @@ def main(): parser.add_argument("-s", "--sort_mime", help="Sort files by MIME type", action="store_true", default=False) - global cfg - cfg = dict() args = parser.parse_args() - - global config - config = ConfigParser.ConfigParser() - config.read('maltrieve.cfg') - - if args.logfile or config.get('Maltrieve', 'logfile'): - if args.logfile: - cfg['logfile'] = args.logfile - else: - cfg['logfile'] = config.get('Maltrieve', 'logfile') - logging.basicConfig(filename=cfg['logfile'], level=logging.DEBUG, - format='%(asctime)s %(thread)d %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - else: - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(thread)d %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - - if args.proxy: - cfg['proxy'] = {'http': args.proxy} - elif config.has_option('Maltrieve', 'proxy'): - cfg['proxy'] = {'http': config.get('Maltrieve', 'proxy')} - else: - cfg['proxy'] = None - - if config.has_option('Maltrieve', 'User-Agent'): - cfg['User-Agent'] = {'User-Agent': config.get('Maltrieve', 'User-Agent')} - else: - cfg['User-Agent'] = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)" - - cfg['sort_mime'] = args.sort_mime + cfg = config(args, 'maltrieve.cfg') if cfg['proxy']: logging.info('Using proxy %s', cfg['proxy']) @@ -234,41 +272,6 @@ def main(): logging.info('External sites see {ip}'.format(ip=my_ip)) print 'External sites see {ip}'.format(ip=my_ip) - cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage') - cfg['cuckoo'] = args.cuckoo or config.has_option('Maltrieve', 'cuckoo') - cfg['viper'] = args.viper or config.has_option('Maltrieve', 'viper') - cfg['logheaders'] = config.get('Maltrieve', 'logheaders') - - black_list = [] - if config.has_option('Maltrieve', 'black_list'): - black_list = config.get('Maltrieve', 'black_list').strip().split(',') - - white_list = False - if config.has_option('Maltrieve', 'white_list'): - white_list = config.get('Maltrieve', 'white_list').strip().split(',') - - # make sure we can open the directory for writing - if args.dumpdir: - cfg['dumpdir'] = args.dumpdir - elif config.get('Maltrieve', 'dumpdir'): - cfg['dumpdir'] = config.get('Maltrieve', 'dumpdir') - else: - cfg['dumpdir'] = '/tmp/malware' - - # Create the dir - if not os.path.exists(cfg['dumpdir']): - os.makedirs(cfg['dumpdir']) - - try: - d = tempfile.mkdtemp(dir=cfg['dumpdir']) - except Exception as e: - logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=cfg['dumpdir'], exception=e)) - cfg['dumpdir'] = '/tmp/malware' - else: - os.rmdir(d) - - logging.info('Using {dir} as dump directory'.format(dir=cfg['dumpdir'])) - if os.path.exists('hashes.json'): with open('hashes.json', 'rb') as hashfile: hashes = json.load(hashfile) @@ -302,7 +305,7 @@ def main(): print "Completed source processing" - headers['User-Agent'] = cfg['User-Agent'] + headers['User-Agent'] = cfg.useragent malware_urls = set() for response in source_lists: if hasattr(response, 'status_code') and response.status_code == 200: @@ -311,13 +314,13 @@ def main(): print "Downloading samples, check log for details" malware_urls -= past_urls - reqs = [grequests.get(url, headers=headers, proxies=cfg['proxy']) for url in malware_urls] + reqs = [grequests.get(url, headers=headers, proxies=cfg.proxy) for url in malware_urls] for chunk in chunker(reqs, 32): malware_downloads = grequests.map(chunk) for each in malware_downloads: if not each or each.status_code != 200: continue - md5 = save_malware(each, cfg['dumpdir'], black_list, white_list) + md5 = save_malware(each, cfg.dumpdir, cfg.black_list, cfg.white_list) if not md5: continue past_urls.add(each.url) From 3f6fdcc3c8d139dbab3c5153efc95f65b8f30251 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 24 Mar 2015 22:25:12 -0500 Subject: [PATCH 26/97] Implement pre-commit --- .pre-commit-config.yaml | 17 +++++++++++++++++ requirements.txt | 1 + 2 files changed, 18 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..73b9138 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +- repo: git://github.com/pre-commit/pre-commit-hooks + sha: master + hooks: + - id: autopep8-wrapper + args: ['-i', '--ignore=E501'] + - id: check-json + - id: check-yaml + - id: end-of-file-fixer + - id: flake8 + args: [--max-line-length=140] + - id: trailing-whitespace + +- repo: git://github.com/ivanlei/pre-commit-python-sorter + sha: master + hooks: + - id: python-import-sorter + args: [--silent-overwrite, --force_single_line] diff --git a/requirements.txt b/requirements.txt index cfe22a1..195cd73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ grequests==0.2.0 python-magic==0.4.6 requests==2.3.0 wsgiref==0.1.2 +pre-commit From 5d77628b203dc2f694a7a6b3257f07caf674d028 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 19 Mar 2015 17:02:42 -0500 Subject: [PATCH 27/97] Implement class for config info --- maltrieve.py | 145 ++++++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 71 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 9fa18c4..73b097c 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -37,6 +37,76 @@ from bs4 import BeautifulSoup +class config: + + def __init__(self, args, filename='maltrieve.cfg'): + self.configp = ConfigParser.ConfigParser() + self.configp.read(filename) + + if args.logfile or self.configp.get('Maltrieve', 'logfile'): + if args.logfile: + self.logfile = args.logfile + else: + self.logfile = self.configp.get('Maltrieve', 'logfile') + logging.basicConfig(filename=self.logfile, level=logging.DEBUG, + format='%(asctime)s %(thread)d %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + else: + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(thread)d %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + if args.proxy: + self.proxy = {'http': args.proxy} + elif self.configp.has_option('Maltrieve', 'proxy'): + self.proxy = {'http': self.configp.get('Maltrieve', 'proxy')} + else: + self.proxy = None + + if self.configp.has_option('Maltrieve', 'User-Agent'): + self.useragent = {'User-Agent': self.configp.get('Maltrieve', 'User-Agent')} + else: + self.useragent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)" + + self.sort_mime = args.sort_mime + + if self.configp.has_option('Maltrieve', 'black_list'): + self.black_list = self.configp.get('Maltrieve', 'black_list').strip().split(',') + else: + self.black_list = [] + + if self.configp.has_option('Maltrieve', 'white_list'): + self.white_list = self.configp.get('Maltrieve', 'white_list').strip().split(',') + else: + self.white_list = False + + # make sure we can open the directory for writing + if args.dumpdir: + self.dumpdir = args.dumpdir + elif self.configp.get('Maltrieve', 'dumpdir'): + self.dumpdir = self.configp.get('Maltrieve', 'dumpdir') + else: + self.dumpdir = '/tmp/malware' + + # Create the dir + if not os.path.exists(self.dumpdir): + os.makedirs(self.dumpdir) + + try: + d = tempfile.mkdtemp(dir=self.dumpdir) + except Exception as e: + logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=self.dumpdir, exception=e)) + self.dumpdir = '/tmp/malware' + else: + os.rmdir(d) + + logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + + self.vxcage = args.vxcage or self.configp.has_option('Maltrieve', 'vxcage') + self.cuckoo = args.cuckoo or self.configp.has_option('Maltrieve', 'cuckoo') + self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') + self.logheaders = self.configp.get('Maltrieve', 'logheaders') + + def upload_vxcage(response, md5): if response: url_tag = urlparse(response.url) @@ -193,40 +263,8 @@ def main(): parser.add_argument("-s", "--sort_mime", help="Sort files by MIME type", action="store_true", default=False) - global cfg - cfg = dict() args = parser.parse_args() - - global config - config = ConfigParser.ConfigParser() - config.read('maltrieve.cfg') - - if args.logfile or config.get('Maltrieve', 'logfile'): - if args.logfile: - cfg['logfile'] = args.logfile - else: - cfg['logfile'] = config.get('Maltrieve', 'logfile') - logging.basicConfig(filename=cfg['logfile'], level=logging.DEBUG, - format='%(asctime)s %(thread)d %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - else: - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(thread)d %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - - if args.proxy: - cfg['proxy'] = {'http': args.proxy} - elif config.has_option('Maltrieve', 'proxy'): - cfg['proxy'] = {'http': config.get('Maltrieve', 'proxy')} - else: - cfg['proxy'] = None - - if config.has_option('Maltrieve', 'User-Agent'): - cfg['User-Agent'] = {'User-Agent': config.get('Maltrieve', 'User-Agent')} - else: - cfg['User-Agent'] = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)" - - cfg['sort_mime'] = args.sort_mime + cfg = config(args, 'maltrieve.cfg') if cfg['proxy']: logging.info('Using proxy %s', cfg['proxy']) @@ -234,41 +272,6 @@ def main(): logging.info('External sites see {ip}'.format(ip=my_ip)) print 'External sites see {ip}'.format(ip=my_ip) - cfg['vxcage'] = args.vxcage or config.has_option('Maltrieve', 'vxcage') - cfg['cuckoo'] = args.cuckoo or config.has_option('Maltrieve', 'cuckoo') - cfg['viper'] = args.viper or config.has_option('Maltrieve', 'viper') - cfg['logheaders'] = config.get('Maltrieve', 'logheaders') - - black_list = [] - if config.has_option('Maltrieve', 'black_list'): - black_list = config.get('Maltrieve', 'black_list').strip().split(',') - - white_list = False - if config.has_option('Maltrieve', 'white_list'): - white_list = config.get('Maltrieve', 'white_list').strip().split(',') - - # make sure we can open the directory for writing - if args.dumpdir: - cfg['dumpdir'] = args.dumpdir - elif config.get('Maltrieve', 'dumpdir'): - cfg['dumpdir'] = config.get('Maltrieve', 'dumpdir') - else: - cfg['dumpdir'] = '/tmp/malware' - - # Create the dir - if not os.path.exists(cfg['dumpdir']): - os.makedirs(cfg['dumpdir']) - - try: - d = tempfile.mkdtemp(dir=cfg['dumpdir']) - except Exception as e: - logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=cfg['dumpdir'], exception=e)) - cfg['dumpdir'] = '/tmp/malware' - else: - os.rmdir(d) - - logging.info('Using {dir} as dump directory'.format(dir=cfg['dumpdir'])) - if os.path.exists('hashes.json'): with open('hashes.json', 'rb') as hashfile: hashes = json.load(hashfile) @@ -302,7 +305,7 @@ def main(): print "Completed source processing" - headers['User-Agent'] = cfg['User-Agent'] + headers['User-Agent'] = cfg.useragent malware_urls = set() for response in source_lists: if hasattr(response, 'status_code') and response.status_code == 200: @@ -311,13 +314,13 @@ def main(): print "Downloading samples, check log for details" malware_urls -= past_urls - reqs = [grequests.get(url, headers=headers, proxies=cfg['proxy']) for url in malware_urls] + reqs = [grequests.get(url, headers=headers, proxies=cfg.proxy) for url in malware_urls] for chunk in chunker(reqs, 32): malware_downloads = grequests.map(chunk) for each in malware_downloads: if not each or each.status_code != 200: continue - md5 = save_malware(each, cfg['dumpdir'], black_list, white_list) + md5 = save_malware(each, cfg.dumpdir, cfg.black_list, cfg.white_list) if not md5: continue past_urls.add(each.url) From ff50838f8b67e8edfd42bcb3b1b67c19b3c0ed1c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 11:32:38 -0500 Subject: [PATCH 28/97] Use cfg for upload functions --- maltrieve.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 73b097c..90c08aa 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -107,12 +107,12 @@ def __init__(self, args, filename='maltrieve.cfg'): self.logheaders = self.configp.get('Maltrieve', 'logheaders') -def upload_vxcage(response, md5): +def upload_vxcage(response, md5, cfg): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{srv}/malware/add".format(srv=config.get('Maltrieve', 'vxcage')) + url = "{srv}/malware/add".format(cfg.vxcage) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies @@ -124,25 +124,25 @@ def upload_vxcage(response, md5): # This gives cuckoo the URL instead of the file. -def upload_cuckoo(response, md5): +def upload_cuckoo(response, md5, cfg): if response: data = {'url': response.url} - url = "{srv}/tasks/create/url".format(srv=config.get('Maltrieve', 'cuckoo')) + url = "{srv}/tasks/create/url".format(srv=cfg.cuckoo) headers = {'User-agent': 'Maltrieve'} - #try: - response = requests.post(url, headers=headers, data=data) - response_data = response.json() - logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) - #except: - #logging.info("Exception caught from Cuckoo") + try: + response = requests.post(url, headers=headers, data=data) + response_data = response.json() + logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) + except: + logging.info("Exception caught from Cuckoo") -def upload_viper(response, md5): +def upload_viper(response, md5, cfg): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{srv}/file/add".format(srv=config.get('Maltrieve', 'viper')) + url = "{srv}/file/add".format(srv=cfg.viper) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies @@ -153,15 +153,15 @@ def upload_viper(response, md5): logging.info("Exception caught from Viper") -def save_malware(response, directory, black_list, white_list): +def save_malware(response, cfg): url = response.url data = response.content mime_type = magic.from_buffer(data, mime=True) - if mime_type in black_list: + if mime_type in cfg.black_list: logging.info('{mtype} in ignore list for {url}'.format(mtype=mime_type, url=url)) return - if white_list: - if mime_type in white_list: + if cfg.white_list: + if mime_type in cfg.white_list: pass else: logging.info('{mtype} not in whitelist for {url}'.format(mtype=mime_type, url=url)) @@ -175,12 +175,12 @@ def save_malware(response, directory, black_list, white_list): stored = False # Submit to external services if cfg['vxcage']: - upload_vxcage(response, md5) + upload_vxcage(response, md5, cfg) stored = True if cfg['cuckoo']: - upload_cuckoo(response, md5) + upload_cuckoo(response, md5, cfg) if cfg['viper']: - upload_viper(response, md5) + upload_viper(response, md5, cfg) stored = True # else save to disk if not stored: @@ -320,7 +320,7 @@ def main(): for each in malware_downloads: if not each or each.status_code != 200: continue - md5 = save_malware(each, cfg.dumpdir, cfg.black_list, cfg.white_list) + md5 = save_malware(each, cfg) if not md5: continue past_urls.add(each.url) From cb4ac53c0cdf5003732c55862c1e34a7fc4886df Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 11:38:03 -0500 Subject: [PATCH 29/97] Fix directory references --- maltrieve.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 90c08aa..6409d4b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -187,11 +187,11 @@ def save_malware(response, cfg): if cfg['sort_mime']: # set folder per mime_type sort_folder = mime_type.replace('/', '_') - if not os.path.exists(os.path.join(directory, sort_folder)): - os.makedirs(os.path.join(directory, sort_folder)) - store_path = os.path.join(directory, sort_folder, md5) + if not os.path.exists(os.path.join(cfg.dumpdir, sort_folder)): + os.makedirs(os.path.join(cfg.dumpdir, sort_folder)) + store_path = os.path.join(cfg.dumpdir, sort_folder, md5) else: - store_path = os.path.join(directory, md5) + store_path = os.path.join(cfg.dumpdir, md5) with open(store_path, 'wb') as f: f.write(data) logging.info("Saved {md5} to dump dir".format(md5=md5)) From 4294ce05e666b4e6ed4f8e6200f1ba5b4a02053c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 11:45:04 -0500 Subject: [PATCH 30/97] Add TODO statements and fix proxy reference --- maltrieve.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 6409d4b..09a0618 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -19,26 +19,28 @@ # along with this program. If not, see Date: Wed, 25 Mar 2015 12:11:04 -0500 Subject: [PATCH 31/97] Specify exceptions for saving malware --- maltrieve.py | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 09a0618..515a9cb 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -92,15 +92,19 @@ def __init__(self, args, filename='maltrieve.cfg'): # Create the dir if not os.path.exists(self.dumpdir): - os.makedirs(self.dumpdir) + try: + os.makedirs(self.dumpdir) + except IOError: + logging.error('Could not create {dir}, using default'.format(dir=self.dumpdir)) + self.dumpdir = '/tmp/malware' try: - d = tempfile.mkdtemp(dir=self.dumpdir) - except Exception as e: - logging.error('Could not open {dir} for writing ({exception}), using default'.format(dir=self.dumpdir, exception=e)) + f = tempfile.mkstemp(dir=self.dumpdir) + except IOError: + logging.error('Could not open {dir} for writing, using default'.format(dir=self.dumpdir)) self.dumpdir = '/tmp/malware' else: - os.rmdir(d) + os.remove(f) logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) @@ -123,8 +127,11 @@ def upload_vxcage(response, md5, cfg): response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() logging.info("Submitted {md5} to VxCage, response was {msg}".format(md5=md5, msg=response_data["message"])) - except: - logging.info("Exception caught from VxCage") + except requests.exceptions.ConnectionError: + logging.info("Could not connect to VxCage, will attempt local storage") + return False + else: + return True # This gives cuckoo the URL instead of the file. @@ -137,8 +144,11 @@ def upload_cuckoo(response, md5, cfg): response = requests.post(url, headers=headers, data=data) response_data = response.json() logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) - except: - logging.info("Exception caught from Cuckoo") + except requests.exceptions.ConnectionError: + logging.info("Could not connect to Cuckoo, will attempt local storage") + return False + else: + return True def upload_viper(response, md5, cfg): @@ -153,8 +163,11 @@ def upload_viper(response, md5, cfg): response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() logging.info("Submitted {md5} to Viper, response was {msg}".format(md5=md5, msg=response_data["message"])) - except: - logging.info("Exception caught from Viper") + except requests.exceptions.ConnectionError: + logging.info("Could not connect to Viper, will attempt local storage") + return False + else: + return True def save_malware(response, cfg): @@ -180,13 +193,11 @@ def save_malware(response, cfg): # Submit to external services # TODO: merge these if cfg['vxcage']: - upload_vxcage(response, md5, cfg) - stored = True + stored = upload_vxcage(response, md5, cfg) or stored if cfg['cuckoo']: - upload_cuckoo(response, md5, cfg) + stored = upload_cuckoo(response, md5, cfg) or stored if cfg['viper']: - upload_viper(response, md5, cfg) - stored = True + stored = upload_viper(response, md5, cfg) or stored # else save to disk if not stored: if cfg['sort_mime']: From b9f52213d0e7ed180f92b601444849f578b547ab Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 12:35:24 -0500 Subject: [PATCH 32/97] Close temporary file correctly --- maltrieve.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 515a9cb..64a0b39 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -99,12 +99,13 @@ def __init__(self, args, filename='maltrieve.cfg'): self.dumpdir = '/tmp/malware' try: - f = tempfile.mkstemp(dir=self.dumpdir) + fd, temp_path = tempfile.mkstemp(dir=self.dumpdir) except IOError: logging.error('Could not open {dir} for writing, using default'.format(dir=self.dumpdir)) self.dumpdir = '/tmp/malware' else: - os.remove(f) + os.close(fd) + os.remove(temp_path) logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) From dfacc636db9308d0c6b37432e6c8777b0472a94c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 12:43:28 -0500 Subject: [PATCH 33/97] Use class member reference, not dict keys --- maltrieve.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 64a0b39..d6a261a 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -193,15 +193,15 @@ def save_malware(response, cfg): stored = False # Submit to external services # TODO: merge these - if cfg['vxcage']: + if cfg.vxcage: stored = upload_vxcage(response, md5, cfg) or stored - if cfg['cuckoo']: + if cfg.cuckoo: stored = upload_cuckoo(response, md5, cfg) or stored - if cfg['viper']: + if cfg.viper: stored = upload_viper(response, md5, cfg) or stored # else save to disk if not stored: - if cfg['sort_mime']: + if cfg.sort_mime: # set folder per mime_type sort_folder = mime_type.replace('/', '_') if not os.path.exists(os.path.join(cfg.dumpdir, sort_folder)): @@ -283,9 +283,9 @@ def main(): args = parser.parse_args() cfg = config(args, 'maltrieve.cfg') - if cfg['proxy']: - logging.info('Using proxy %s', cfg['proxy']) - my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg['proxy']).text + if cfg.proxy: + logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) + my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg.proxy).text logging.info('External sites see {ip}'.format(ip=my_ip)) print 'External sites see {ip}'.format(ip=my_ip) From f271127056d2d1e4b0e744f55e00564254ff020c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 14:20:27 -0500 Subject: [PATCH 34/97] Use pytest and ignore JSON files --- .gitignore | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d9189b9..6156808 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ maltrieve.out archive grequests *.bak +*.json diff --git a/setup.py b/setup.py index c79387f..18deb3e 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,8 @@ 'python-magic==0.4.6', 'requests==2.3.0', 'wsgiref==0.1.2', - 'precommit' + 'precommit', + 'pytest' ], package_dir={'maltrieve': 'src'}, packages=['maltrieve'], From 146b7ca92e5b466e36a03fa6e4a73f0ca0ed55e9 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 14:22:53 -0500 Subject: [PATCH 35/97] How did I miss this earlier --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18deb3e..9e54b7a 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ 'python-magic==0.4.6', 'requests==2.3.0', 'wsgiref==0.1.2', - 'precommit', + 'pre-commit', 'pytest' ], package_dir={'maltrieve': 'src'}, From 874ab3b8e7a4aeb6f7eed40ff3fc32f936dad987 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 14:43:34 -0500 Subject: [PATCH 36/97] Getting test infrastructure running --- maltrieve-test.cfg | 14 ++++++++++++++ maltrieve.py | 14 +++++++++----- test.py | 6 ++++++ 3 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 maltrieve-test.cfg create mode 100644 test.py diff --git a/maltrieve-test.cfg b/maltrieve-test.cfg new file mode 100644 index 0000000..add59cf --- /dev/null +++ b/maltrieve-test.cfg @@ -0,0 +1,14 @@ +[Maltrieve] +dumpdir = archive +logfile = maltrieve.log +logheaders = true +User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) + +#viper = http://127.0.0.1:8080 +#cuckoo = http://127.0.0.1:8090 +#vxcage = http://127.0.0.1:8080 + + +# Filter Lists are based on mime type NO SPACE BETWEEN , +#black_list = text/html,text/plain +#white_list = application/pdf,application/x-dosexec diff --git a/maltrieve.py b/maltrieve.py index d6a261a..235e432 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -258,10 +258,7 @@ def chunker(seq, size): return (seq[pos:pos + size] for pos in xrange(0, len(seq), size)) -def main(): - hashes = set() - past_urls = set() - +def setup_args(args): parser = argparse.ArgumentParser() parser.add_argument("-p", "--proxy", help="Define HTTP proxy as address:port") @@ -280,7 +277,14 @@ def main(): parser.add_argument("-s", "--sort_mime", help="Sort files by MIME type", action="store_true", default=False) - args = parser.parse_args() + return parser.parse_args(args) + + +def main(): + hashes = set() + past_urls = set() + + args = setup_args(sys.argv) cfg = config(args, 'maltrieve.cfg') if cfg.proxy: diff --git a/test.py b/test.py new file mode 100644 index 0000000..57958ad --- /dev/null +++ b/test.py @@ -0,0 +1,6 @@ +import maltrieve + + +def test_args(): + args = maltrieve.setup_args(['-l', 'testlog']) + assert args.logfile == 'testlog' From 4124d416911aaec3593a7040143cb8ed5cf0f211 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 14:46:14 -0500 Subject: [PATCH 37/97] Configure CircleCI manually to fix #98 --- circle.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 circle.yml diff --git a/circle.yml b/circle.yml new file mode 100644 index 0000000..814ac6e --- /dev/null +++ b/circle.yml @@ -0,0 +1,3 @@ +test: + override: + py.test test.py From 70ab9a56f8db087564826acc01599e3b339cff6a Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 14:47:10 -0500 Subject: [PATCH 38/97] Override must be a list --- circle.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/circle.yml b/circle.yml index 814ac6e..42637df 100644 --- a/circle.yml +++ b/circle.yml @@ -1,3 +1,3 @@ test: override: - py.test test.py + - py.test test.py From f978bd8367d0cecd304b1668ca30ee8548e13579 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 15:01:34 -0500 Subject: [PATCH 39/97] Test argument parsing --- test.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index 57958ad..91cac0d 100644 --- a/test.py +++ b/test.py @@ -1,6 +1,16 @@ import maltrieve -def test_args(): - args = maltrieve.setup_args(['-l', 'testlog']) +def test_basic_args(): + args = maltrieve.setup_args(['-l', 'testlog', '-p', '127.0.0.1:8080', '-d', '/opt/']) assert args.logfile == 'testlog' + assert args.proxy == '127.0.0.1:8080' + assert args.dumpdir == '/opt/' + + +def test_saving_args(): + args = maltrieve.setup_args(['-v', '-x', '-c', '-s']) + assert args.viper + assert args.vxcage + assert args.cuckoo + assert args.sort_mime From f795a6c87c35b7d142e2add206c987f2dc3f9fbb Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 15:05:28 -0500 Subject: [PATCH 40/97] Trying to test parse_simple_list --- test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test.py b/test.py index 91cac0d..654b51b 100644 --- a/test.py +++ b/test.py @@ -14,3 +14,11 @@ def test_saving_args(): assert args.vxcage assert args.cuckoo assert args.sort_mime + + +def test_parse_simple_list(): + sources = 'http://example.org/mylist \ + http://example.com/yourlist \ + http://example.org/mylist' + assert maltrieve.process_simple_list(sources) == \ + set('http://example.org/mylist', 'http://example.com/yourlist') From 7165c808b4cf233afc3d65767f05d2ab3887eb0c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 15:38:45 -0500 Subject: [PATCH 41/97] Test simple list --- test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test.py b/test.py index 654b51b..4fde8bc 100644 --- a/test.py +++ b/test.py @@ -1,4 +1,5 @@ import maltrieve +import requests def test_basic_args(): @@ -17,8 +18,6 @@ def test_saving_args(): def test_parse_simple_list(): - sources = 'http://example.org/mylist \ - http://example.com/yourlist \ - http://example.org/mylist' - assert maltrieve.process_simple_list(sources) == \ - set('http://example.org/mylist', 'http://example.com/yourlist') + source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text + assert maltrieve.process_simple_list(source) == \ + set(['http://example.org/mylist', 'http://example.com/yourlist']) From 4cbbec6bed4bee8f2e4a2ad0c94cd1fb6915e41b Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 15:53:31 -0500 Subject: [PATCH 42/97] Test list processing functions --- test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test.py b/test.py index 4fde8bc..374a5ad 100644 --- a/test.py +++ b/test.py @@ -21,3 +21,15 @@ def test_parse_simple_list(): source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text assert maltrieve.process_simple_list(source) == \ set(['http://example.org/mylist', 'http://example.com/yourlist']) + + +def test_parse_xml_list(): + source = requests.get('http://xwell.org/assets/maltrieve-test-list.xml').text + assert maltrieve.process_xml_list_title(source) == \ + set(['http://example.org/mylist', 'http://example.com/yourlist']) + + +def test_parse_xml_desc(): + source = requests.get('http://xwell.org/assets/maltrieve-test-desc.xml').text + assert maltrieve.process_xml_list_desc(source) == \ + set(['http://example.org/mylist', 'http://example.com/yourlist']) From 9c9dfdeddba6dbaee558fc70b1ed19e9df7bfd92 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 16:14:03 -0500 Subject: [PATCH 43/97] Configure Coveralls.io --- circle.yml | 3 ++- setup.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/circle.yml b/circle.yml index 42637df..f18f164 100644 --- a/circle.yml +++ b/circle.yml @@ -1,3 +1,4 @@ test: override: - - py.test test.py + - coverage run --source=maltrieve py.test test.py + - coveralls diff --git a/setup.py b/setup.py index 9e54b7a..d4a2f6a 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ 'requests==2.3.0', 'wsgiref==0.1.2', 'pre-commit', - 'pytest' + 'pytest', + 'coveralls' ], package_dir={'maltrieve': 'src'}, packages=['maltrieve'], From a8362b61fadf80e3aeff5a3922d70ad4554b88b9 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 16:29:22 -0500 Subject: [PATCH 44/97] Use pytest-cov plugin to generate initial coverage metrics --- circle.yml | 2 +- setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/circle.yml b/circle.yml index f18f164..a325f62 100644 --- a/circle.yml +++ b/circle.yml @@ -1,4 +1,4 @@ test: override: - - coverage run --source=maltrieve py.test test.py + - py.test --cov maltrieve test.py - coveralls diff --git a/setup.py b/setup.py index d4a2f6a..5911a1e 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'wsgiref==0.1.2', 'pre-commit', 'pytest', + 'pytest-cov', 'coveralls' ], package_dir={'maltrieve': 'src'}, From 0255397cf9b73a54632b43257dac939eef106897 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 16:43:06 -0500 Subject: [PATCH 45/97] Reorganize imports --- maltrieve.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 18af4e0..c8a65f4 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -26,16 +26,15 @@ import os import pickle import re -import requests import resource -import tempfile import sys -from urlparse import urlparse - +import tempfile import feedparser import grequests import magic +import requests from bs4 import BeautifulSoup +from urlparse import urlparse class config: From 9f5f3c564d34e2c2886b75169f056c6487441641 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 17:29:02 -0500 Subject: [PATCH 46/97] Test load_hashes() --- maltrieve.py | 20 +++++++++++++------- test.py | 5 +++++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index c8a65f4..8dd3e9d 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -29,12 +29,13 @@ import resource import sys import tempfile +from urlparse import urlparse + import feedparser import grequests import magic import requests from bs4 import BeautifulSoup -from urlparse import urlparse class config: @@ -280,6 +281,15 @@ def setup_args(args): return parser.parse_args(args) +def load_hashes(filename="hashes.json"): + if os.path.exists(filename): + with open(filename, 'rb') as hashfile: + hashes = set(json.load(hashfile)) + else: + hashes = set() + return hashes + + def main(): resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 2048)) hashes = set() @@ -288,18 +298,14 @@ def main(): args = setup_args(sys.argv) cfg = config(args, 'maltrieve.cfg') + # TODO: move this inside config.__init__() if cfg.proxy: logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg.proxy).text logging.info('External sites see {ip}'.format(ip=my_ip)) print 'External sites see {ip}'.format(ip=my_ip) - if os.path.exists('hashes.json'): - with open('hashes.json', 'rb') as hashfile: - hashes = json.load(hashfile) - elif os.path.exists('hashes.obj'): - with open('hashes.obj', 'rb') as hashfile: - hashes = pickle.load(hashfile) + hashes = load_hashes('hashes.json') if os.path.exists('urls.json'): try: diff --git a/test.py b/test.py index 374a5ad..5a54b5f 100644 --- a/test.py +++ b/test.py @@ -33,3 +33,8 @@ def test_parse_xml_desc(): source = requests.get('http://xwell.org/assets/maltrieve-test-desc.xml').text assert maltrieve.process_xml_list_desc(source) == \ set(['http://example.org/mylist', 'http://example.com/yourlist']) + + +def test_load_hashes(): + assert maltrieve.load_hashes('test-hashes.json') == \ + set(['d41d8cd98f00b204e9800998ecf8427e']) From 8ed202304e566e2103502d92da8f45caf83e0208 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 17:39:32 -0500 Subject: [PATCH 47/97] Test save_hashes() --- maltrieve.py | 9 ++++++--- test.py | 8 +++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 8dd3e9d..e95667f 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -290,6 +290,11 @@ def load_hashes(filename="hashes.json"): return hashes +def save_hashes(hashes, filename='hashes.json'): + with open(filename, 'w') as hashfile: + json.dump(list(hashes), hashfile) + + def main(): resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 2048)) hashes = set() @@ -362,9 +367,7 @@ def main(): with open('urls.json', 'w') as urlfile: json.dump(list(past_urls), urlfile) - if hashes: - with open('hashes.json', 'w') as hashfile: - json.dump(hashes, hashfile) + save_hashes(hashes, 'hashes.json') if __name__ == "__main__": diff --git a/test.py b/test.py index 5a54b5f..78caa51 100644 --- a/test.py +++ b/test.py @@ -36,5 +36,11 @@ def test_parse_xml_desc(): def test_load_hashes(): - assert maltrieve.load_hashes('test-hashes.json') == \ + assert maltrieve.load_hashes('test-load-hashes.json') == \ set(['d41d8cd98f00b204e9800998ecf8427e']) + + +def test_save_hashes(): + hashes = set(['d41d8cd98f00b204e9800998ecf8427e']) + maltrieve.save_hashes(hashes, 'test-save-hashes.json') + test_load_hashes() From 457dbb93dc10996bec016f6674f3fde5fb309dd9 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 17:56:37 -0500 Subject: [PATCH 48/97] Testing save/load URLs/hashes --- maltrieve.py | 53 ++++++++++++++++++++++++++++------------------------ test.py | 17 ++++++++++++++--- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index e95667f..3885c82 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -24,7 +24,6 @@ import json import logging import os -import pickle import re import resource import sys @@ -116,6 +115,13 @@ def __init__(self, args, filename='maltrieve.cfg'): self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') self.logheaders = self.configp.get('Maltrieve', 'logheaders') + def check_proxy(self): + if self.proxy: + logging.info('Using proxy {proxy}'.format(proxy=self.proxy)) + my_ip = requests.get('http://ipinfo.io/ip', proxies=self.proxy).text + logging.info('External sites see {ip}'.format(ip=my_ip)) + print 'External sites see {ip}'.format(ip=my_ip) + def upload_vxcage(response, md5, cfg): if response: @@ -285,16 +291,35 @@ def load_hashes(filename="hashes.json"): if os.path.exists(filename): with open(filename, 'rb') as hashfile: hashes = set(json.load(hashfile)) + logging.info('Loaded hashes from {f}'.format(f=filename)) else: hashes = set() return hashes def save_hashes(hashes, filename='hashes.json'): + logging.info('Dumping hashes to {f}'.format(f=filename)) with open(filename, 'w') as hashfile: json.dump(list(hashes), hashfile) +def load_urls(filename='urls.json'): + if os.path.exists(filename): + try: + with open(filename, 'rb') as urlfile: + urls = set(json.load(urlfile)) + logging.info('Loaded urls from {f}'.format(f=filename)) + except ValueError: + urls = set() + return urls + + +def save_urls(urls, filename='urls.json'): + logging.info('Dumping past URLs to {f}'.format(f=filename)) + with open('urls.json', 'w') as urlfile: + json.dump(list(urls), urlfile) + + def main(): resource.setrlimit(resource.RLIMIT_NOFILE, (2048, 2048)) hashes = set() @@ -302,25 +327,10 @@ def main(): args = setup_args(sys.argv) cfg = config(args, 'maltrieve.cfg') - - # TODO: move this inside config.__init__() - if cfg.proxy: - logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) - my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg.proxy).text - logging.info('External sites see {ip}'.format(ip=my_ip)) - print 'External sites see {ip}'.format(ip=my_ip) + cfg.check_proxy() hashes = load_hashes('hashes.json') - - if os.path.exists('urls.json'): - try: - with open('urls.json', 'rb') as urlfile: - past_urls = set(json.load(urlfile)) - except ValueError: - pass - elif os.path.exists('urls.obj'): - with open('urls.obj', 'rb') as urlfile: - past_urls = pickle.load(urlfile) + past_urls = load_urls('urls.json') print "Processing source URLs" @@ -361,12 +371,7 @@ def main(): print "Completed downloads" - # TODO: move to functions - if past_urls: - logging.info('Dumping past URLs to file') - with open('urls.json', 'w') as urlfile: - json.dump(list(past_urls), urlfile) - + save_urls(past_urls, 'urls.json') save_hashes(hashes, 'hashes.json') diff --git a/test.py b/test.py index 78caa51..fe82bbe 100644 --- a/test.py +++ b/test.py @@ -35,12 +35,23 @@ def test_parse_xml_desc(): set(['http://example.org/mylist', 'http://example.com/yourlist']) -def test_load_hashes(): - assert maltrieve.load_hashes('test-load-hashes.json') == \ +def test_load_hashes(hashfile='test-load-hashes.json'): + assert maltrieve.load_hashes(hashfile) == \ set(['d41d8cd98f00b204e9800998ecf8427e']) def test_save_hashes(): hashes = set(['d41d8cd98f00b204e9800998ecf8427e']) maltrieve.save_hashes(hashes, 'test-save-hashes.json') - test_load_hashes() + assert test_load_hashes('test-save-hashes.json') + + +def test_load_urls(urlfile='test-load-urls.json'): + assert maltrieve.load_urls(urlfile) == \ + set(['http://example.com/badurl']) + + +def test_save_urls(): + urls = set(['http://example.com/badurl']) + maltrieve.save_urls(urls, 'test-save-urls.json') + assert test_load_urls('test-save-urls.json') From 551871d0ce26ac92e4fe226281a30c38c96410fe Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 17:58:33 -0500 Subject: [PATCH 49/97] Add CircleCI and Coveralls badges --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5fc607f..b9f8f51 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ [![Stories in Ready](https://badge.waffle.io/krmaxwell/maltrieve.png?label=ready&title=Ready)](https://waffle.io/krmaxwell/maltrieve) [![Stories in In Progress](https://badge.waffle.io/krmaxwell/maltrieve.png?label=in%20progress&title=In%20Progress)](https://waffle.io/krmaxwell/maltrieve) +[![Circle CI](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev.svg?style=svg)](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev) +[![Coverage Status](https://coveralls.io/repos/krmaxwell/maltrieve/badge.svg?branch=dev)](https://coveralls.io/r/krmaxwell/maltrieve?branch=dev) ``` _______ _______ _______ ______ _____ _______ _ _ _______ | | | |_____| | | |_____/ | |______ \ / |______ From 98558ddf9caa96febd886f4d1b08fdd32d5190e0 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 18:04:47 -0500 Subject: [PATCH 50/97] Don't use assert when calling other tests --- test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test.py b/test.py index fe82bbe..042d04a 100644 --- a/test.py +++ b/test.py @@ -43,7 +43,7 @@ def test_load_hashes(hashfile='test-load-hashes.json'): def test_save_hashes(): hashes = set(['d41d8cd98f00b204e9800998ecf8427e']) maltrieve.save_hashes(hashes, 'test-save-hashes.json') - assert test_load_hashes('test-save-hashes.json') + test_load_hashes('test-save-hashes.json') def test_load_urls(urlfile='test-load-urls.json'): @@ -54,4 +54,4 @@ def test_load_urls(urlfile='test-load-urls.json'): def test_save_urls(): urls = set(['http://example.com/badurl']) maltrieve.save_urls(urls, 'test-save-urls.json') - assert test_load_urls('test-save-urls.json') + test_load_urls('test-save-urls.json') From 6151b08ffba3142672c6df44c7a708f83cec569c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 18:08:01 -0500 Subject: [PATCH 51/97] Initialize empty set if file doesn't exist. Testing found this! --- maltrieve.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/maltrieve.py b/maltrieve.py index 3885c82..9bb39fe 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -311,6 +311,8 @@ def load_urls(filename='urls.json'): logging.info('Loaded urls from {f}'.format(f=filename)) except ValueError: urls = set() + else: + urls = set() return urls From 440c35a8b674fc5198a159727db803751ddaa159 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 18:10:49 -0500 Subject: [PATCH 52/97] Add test data --- maltrieve.py | 2 +- test-hashes.json | 1 + test-load-hashes.json | 1 + test-save-hashes.json | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 test-hashes.json create mode 100644 test-load-hashes.json create mode 100644 test-save-hashes.json diff --git a/maltrieve.py b/maltrieve.py index 9bb39fe..0509f61 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -309,7 +309,7 @@ def load_urls(filename='urls.json'): with open(filename, 'rb') as urlfile: urls = set(json.load(urlfile)) logging.info('Loaded urls from {f}'.format(f=filename)) - except ValueError: + except ValueError: # this usually happens when the file is empty urls = set() else: urls = set() diff --git a/test-hashes.json b/test-hashes.json new file mode 100644 index 0000000..5bf1d3a --- /dev/null +++ b/test-hashes.json @@ -0,0 +1 @@ +["d41d8cd98f00b204e9800998ecf8427e"] diff --git a/test-load-hashes.json b/test-load-hashes.json new file mode 100644 index 0000000..5bf1d3a --- /dev/null +++ b/test-load-hashes.json @@ -0,0 +1 @@ +["d41d8cd98f00b204e9800998ecf8427e"] diff --git a/test-save-hashes.json b/test-save-hashes.json new file mode 100644 index 0000000..5bf1d3a --- /dev/null +++ b/test-save-hashes.json @@ -0,0 +1 @@ +["d41d8cd98f00b204e9800998ecf8427e"] From 7bbe6572eb6d7a6aac1410e413042bd69ffac17d Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 18:16:12 -0500 Subject: [PATCH 53/97] Add test data --- test-save-urls.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 test-save-urls.json diff --git a/test-save-urls.json b/test-save-urls.json new file mode 100644 index 0000000..4b0b27a --- /dev/null +++ b/test-save-urls.json @@ -0,0 +1 @@ +["http://example.com/badurl"] From 72fd4ff84d24e381b994c3c0a25ccc6b1fdccf18 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 18:21:33 -0500 Subject: [PATCH 54/97] Resolve remaining test issues --- maltrieve.py | 6 +++--- test-load-urls.json | 1 + test-save-hashes.json | 4 +++- test-save-urls.json | 4 +++- 4 files changed, 10 insertions(+), 5 deletions(-) create mode 100644 test-load-urls.json diff --git a/maltrieve.py b/maltrieve.py index 0509f61..4520cce 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -300,7 +300,7 @@ def load_hashes(filename="hashes.json"): def save_hashes(hashes, filename='hashes.json'): logging.info('Dumping hashes to {f}'.format(f=filename)) with open(filename, 'w') as hashfile: - json.dump(list(hashes), hashfile) + json.dump(list(hashes), hashfile, indent=2) def load_urls(filename='urls.json'): @@ -318,8 +318,8 @@ def load_urls(filename='urls.json'): def save_urls(urls, filename='urls.json'): logging.info('Dumping past URLs to {f}'.format(f=filename)) - with open('urls.json', 'w') as urlfile: - json.dump(list(urls), urlfile) + with open(filename, 'w') as urlfile: + json.dump(list(urls), urlfile, indent=2) def main(): diff --git a/test-load-urls.json b/test-load-urls.json new file mode 100644 index 0000000..4b0b27a --- /dev/null +++ b/test-load-urls.json @@ -0,0 +1 @@ +["http://example.com/badurl"] diff --git a/test-save-hashes.json b/test-save-hashes.json index 5bf1d3a..7775811 100644 --- a/test-save-hashes.json +++ b/test-save-hashes.json @@ -1 +1,3 @@ -["d41d8cd98f00b204e9800998ecf8427e"] +[ + "d41d8cd98f00b204e9800998ecf8427e" +] diff --git a/test-save-urls.json b/test-save-urls.json index 4b0b27a..fada56b 100644 --- a/test-save-urls.json +++ b/test-save-urls.json @@ -1 +1,3 @@ -["http://example.com/badurl"] +[ + "http://example.com/badurl" +] From d385810ee2352d869ad6e7b35e7b3de64a51d0b9 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 21:09:18 -0500 Subject: [PATCH 55/97] Don't pass argv[0] --- maltrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index c8a65f4..2ce5adb 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -285,7 +285,7 @@ def main(): hashes = set() past_urls = set() - args = setup_args(sys.argv) + args = setup_args(sys.argv[1:]) cfg = config(args, 'maltrieve.cfg') if cfg.proxy: From 638dda95750ce1ae079bf4935b40c7e5439c1475 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 21:09:51 -0500 Subject: [PATCH 56/97] Import moved by autopep8 --- maltrieve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 2ce5adb..98aa2d8 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -29,12 +29,13 @@ import resource import sys import tempfile +from urlparse import urlparse + import feedparser import grequests import magic import requests from bs4 import BeautifulSoup -from urlparse import urlparse class config: From 57ad2f5992dc9b555d201aac8649dcc3428598d1 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 23:01:07 -0500 Subject: [PATCH 57/97] Use dev branch (for now) and pip install (always) --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 341ade6..acfa806 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -36,7 +36,8 @@ RUN rm -rf /var/lib/apt/lists/* && \ WORKDIR /home RUN git clone https://github.com/krmaxwell/maltrieve.git && \ cd maltrieve && \ - python setup.py install && \ + git checkout dev && \ + pip install -e . && \ chown -R maltrieve:maltrieve /home/maltrieve USER maltrieve From 44703e7c6d40d2335980715822506feb7879486f Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 25 Mar 2015 23:34:42 -0500 Subject: [PATCH 58/97] Use a dedicated directory and run the proper command --- docker/Dockerfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index acfa806..195affd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,9 +10,7 @@ # # To run this image after installing Docker, use a command like this: # -# sudo docker run --rm -it technoskald/maltrieve bash -# -# then run maltrieve.py with the desired parameters. +# sudo docker run --rm -it technoskald/maltrieve FROM ubuntu:14.04 MAINTAINER Michael Boman @@ -40,8 +38,12 @@ RUN git clone https://github.com/krmaxwell/maltrieve.git && \ pip install -e . && \ chown -R maltrieve:maltrieve /home/maltrieve +RUN mkdir /archive && \ + chown maltrieve:maltrieve /archive + USER maltrieve ENV HOME /home/maltrieve ENV USER maltrieve WORKDIR /home/maltrieve -CMD ["maltrieve"] +ENTRYPOINT ["maltrieve"] +CMD ["-d", "/archive/samples", "-l", "/archive/maltrieve.log"] From eea3bb7ffcfb0b7cdee66249ec9ded3e139fd7d5 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 26 Mar 2015 18:48:40 -0500 Subject: [PATCH 59/97] Configure CRITs integration --- maltrieve.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 37cbc7d..05082cc 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -110,12 +110,18 @@ def __init__(self, args, filename='maltrieve.cfg'): os.remove(temp_path) logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + self.logheaders = self.configp.get('Maltrieve', 'logheaders') # TODO: Merge these self.vxcage = args.vxcage or self.configp.has_option('Maltrieve', 'vxcage') self.cuckoo = args.cuckoo or self.configp.has_option('Maltrieve', 'cuckoo') self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') - self.logheaders = self.configp.get('Maltrieve', 'logheaders') + + # CRITs + self.crits = args.crits or self.configp.has_option('Maltrieve', 'crits') + self.crits_user = self.configp.has_option('Maltrieve', 'crits_user') + self.crits_key = self.configp.has_option('Maltrieve', 'crits_key') + self.crits_source = self.configp.has_option('Maltrieve', 'crits_source') def upload_crits(response, md5, cfg): @@ -143,12 +149,12 @@ def upload_crits(response, md5, cfg): domain_response = requests.post(url, headers=headers, data=domain_data, verify=False) if domain_response.status_code == requests.codes.ok: domain_response_data = domain_response.json() - logging.info("Submitted domain info for {md5} to Crits, response was {msg}".format(md5=md5, + logging.info("Submitted domain info for {md5} to CRITs, response was {msg}".format(md5=md5, msg=domain_response_data["message"])) if domain_response_data['return_code'] == 0: inserted_domain = True except: - logging.info("Exception caught from CRITs when submitting domain") + logging.info("Exception caught from CRITs when submitting domain: {code}".format(code=domain_response.status_code)) # Submit sample url = "{srv}/api/v1/samples/".format(srv=cfg.crits) @@ -159,9 +165,9 @@ def upload_crits(response, md5, cfg): else: file_type = 'raw' sample_data = { - 'api_key': cfg['crits_key'], - 'username': cfg['crits_user'], - 'source': cfg['crits_source'], + 'api_key': cfg.crits_key, + 'username': cfg.crits_user, + 'source': cfg.crits_source, 'upload_type': 'file', 'md5': md5, 'file_format': file_type # must be type zip, rar, or raw @@ -175,15 +181,15 @@ def upload_crits(response, md5, cfg): if sample_response_data['return_code'] == 0: inserted_sample = True except: - logging.info("Exception caught from Crits when submitting sample") + logging.info("Exception caught from CRITs when submitting sample: {code}".format(code=sample_response.status_code)) # Create a relationship for the sample and domain - url = "{srv}/api/v1/relationships/".format(src=cfg.crits) + url = "{srv}/api/v1/relationships/".format(srv=cfg.crits) if (inserted_sample and inserted_domain): relationship_data = { - 'api_key': cfg['crits_key'], - 'username': cfg['crits_user'], - 'source': cfg['crits_source'], + 'api_key': cfg.crits_key, + 'username': cfg.crits_user, + 'source': cfg.crits_source, 'right_type': domain_response_data['type'], 'right_id': domain_response_data['id'], 'left_type': sample_response_data['type'], @@ -379,7 +385,7 @@ def main(): hashes = set() past_urls = set() - args = setup_args(sys.argv) + args = setup_args(sys.argv[1:]) cfg = config(args, 'maltrieve.cfg') if cfg.proxy: From 128f04c7d04be6c87b1cfcdcd9a0480937e81560 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Fri, 27 Mar 2015 15:35:18 -0500 Subject: [PATCH 60/97] Configure CRITs correctly --- maltrieve.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 05082cc..a4e192f 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -118,10 +118,10 @@ def __init__(self, args, filename='maltrieve.cfg'): self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') # CRITs - self.crits = args.crits or self.configp.has_option('Maltrieve', 'crits') - self.crits_user = self.configp.has_option('Maltrieve', 'crits_user') - self.crits_key = self.configp.has_option('Maltrieve', 'crits_key') - self.crits_source = self.configp.has_option('Maltrieve', 'crits_source') + self.crits = args.crits or self.configp.get('Maltrieve', 'crits') + self.crits_user = self.configp.get('Maltrieve', 'crits_user') + self.crits_key = self.configp.get('Maltrieve', 'crits_key') + self.crits_source = self.configp.get('Maltrieve', 'crits_source') def upload_crits(response, md5, cfg): @@ -146,15 +146,20 @@ def upload_crits(response, md5, cfg): } try: # Note that this request does NOT go through proxies - domain_response = requests.post(url, headers=headers, data=domain_data, verify=False) + logging.debug("Domain submission: {url}|{data}".format(url=url, data=domain_data)) + domain_response = requests.post(url, headers=headers, data=domain_data) + # pylint says "Instance of LookupDict has no 'ok' member" if domain_response.status_code == requests.codes.ok: domain_response_data = domain_response.json() - logging.info("Submitted domain info for {md5} to CRITs, response was {msg}".format(md5=md5, - msg=domain_response_data["message"])) if domain_response_data['return_code'] == 0: inserted_domain = True + else: + logging.info("Submitted domain info for {md5} to CRITs, response was {data}".format(md5=md5, + data=domain_response_data)) + else: + logging.info("Submission of {url} failed: {code}".format(url=url, code=domain_response.status_code)) except: - logging.info("Exception caught from CRITs when submitting domain: {code}".format(code=domain_response.status_code)) + logging.info("Exception caught from CRITs when submitting domain: {response}".format(code=domain_response)) # Submit sample url = "{srv}/api/v1/samples/".format(srv=cfg.crits) @@ -175,6 +180,7 @@ def upload_crits(response, md5, cfg): try: # Note that this request does NOT go through proxies sample_response = requests.post(url, headers=headers, files=files, data=sample_data, verify=False) + # pylint says "Instance of LookupDict has no 'ok' member" if sample_response.status_code == requests.codes.ok: sample_response_data = sample_response.json() logging.info("Submitted sample info for {md5} to CRITs".format(md5=md5)) @@ -201,6 +207,7 @@ def upload_crits(response, md5, cfg): try: # Note that this request does NOT go through proxies relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) + # pylint says "Instance of LookupDict has no 'ok' member" if relationship_response.status_code == requests.codes.ok: logging.info("Submitted relationship info for {md5} to CRITs".format(md5=md5)) except: From ed5eda950bc1fa7c95c9787333c90e2e95953f42 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 20:31:19 -0500 Subject: [PATCH 61/97] Add timeout when downloading samples per discussion in #58 --- maltrieve.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index c8a65f4..01f7d69 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -29,12 +29,13 @@ import resource import sys import tempfile +from urlparse import urlparse + import feedparser import grequests import magic import requests from bs4 import BeautifulSoup -from urlparse import urlparse class config: @@ -337,7 +338,7 @@ def main(): print "Downloading samples, check log for details" malware_urls -= past_urls - reqs = [grequests.get(url, headers=headers, proxies=cfg.proxy) for url in malware_urls] + reqs = [grequests.get(url, timeout=60, headers=headers, proxies=cfg.proxy) for url in malware_urls] for chunk in chunker(reqs, 32): malware_downloads = grequests.map(chunk) for each in malware_downloads: From c92e06036c3d0926066f2fa1e8a61c60e965e5b2 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 20:39:57 -0500 Subject: [PATCH 62/97] Fix the args passing... again... wtf. --- maltrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 01f7d69..1cae087 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -287,7 +287,7 @@ def main(): past_urls = set() args = setup_args(sys.argv) - cfg = config(args, 'maltrieve.cfg') + cfg = config(args[1:], 'maltrieve.cfg') if cfg.proxy: logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) From cf2955f70fc038643a69ef475c0f6314c40cdecb Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 21:26:59 -0500 Subject: [PATCH 63/97] Remove janky old code, wut --- src/MultiPartForm.py | 69 --------- src/__init__.py | 2 - src/maltrieve.py | 277 ------------------------------------ src/maltrievecategorizer.sh | 43 ------ src/malutil.py | 45 ------ 5 files changed, 436 deletions(-) delete mode 100644 src/MultiPartForm.py delete mode 100644 src/__init__.py delete mode 100755 src/maltrieve.py delete mode 100644 src/maltrievecategorizer.sh delete mode 100644 src/malutil.py diff --git a/src/MultiPartForm.py b/src/MultiPartForm.py deleted file mode 100644 index 0af6d96..0000000 --- a/src/MultiPartForm.py +++ /dev/null @@ -1,69 +0,0 @@ -import itertools -import mimetools -import mimetypes -import urllib -import urllib2 - - -class MultiPartForm(object): - """Accumulate the data to be used when posting a form.""" - - def __init__(self): - self.form_fields = [] - self.files = [] - self.boundary = mimetools.choose_boundary() - return - - def get_content_type(self): - return 'multipart/form-data; boundary=%s' % self.boundary - - def add_field(self, name, value): - """Add a simple field to the form data.""" - self.form_fields.append((name, value)) - return - - def add_file(self, fieldname, filename, fileHandle, mimetype=None): - """Add a file to be uploaded.""" - body = fileHandle.read() - if mimetype is None: - mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' - self.files.append((fieldname, filename, mimetype, body)) - return - - def __str__(self): - """Return a string representing the form data, including attached files.""" - # Build a list of lists, each containing "lines" of the - # request. Each part is separated by a boundary string. - # Once the list is built, return a string where each - # line is separated by '\r\n'. - parts = [] - part_boundary = '--' + self.boundary - - # Add the form fields - parts.extend( - [ part_boundary, - 'Content-Disposition: form-data; name="%s"' % name, - '', - value, - ] - for name, value in self.form_fields - ) - - # Add the files to upload - parts.extend( - [ part_boundary, - 'Content-Disposition: file; name="%s"; filename="%s"' % \ - (field_name, filename), - 'Content-Type: %s' % content_type, - '', - body, - ] - for field_name, filename, content_type, body in self.files - ) - - # Flatten the list and add closing boundary marker, - # then return CR+LF separated data - flattened = list(itertools.chain(*parts)) - flattened.append('--' + self.boundary + '--') - flattened.append('') - return '\r\n'.join(flattened) diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index a3591fe..0000000 --- a/src/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -''' This file is part of maltrieve. See LICENSE for license details. ''' - diff --git a/src/maltrieve.py b/src/maltrieve.py deleted file mode 100755 index 76a30b4..0000000 --- a/src/maltrieve.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright 2013 Kyle Maxwell -# Includes code from mwcrawler, (c) 2012 Ricardo Dias. Used under license. - -# Maltrieve - retrieve malware from the source - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see Date: Tue, 31 Mar 2015 21:33:13 -0500 Subject: [PATCH 64/97] Empty file needed for pip install --- src/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/__init__.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 From c396b250c132b8bd94a710baee99b4e3f7cc219c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 21:36:38 -0500 Subject: [PATCH 65/97] Add code health badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b9f8f51..965f2c1 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Stories in In Progress](https://badge.waffle.io/krmaxwell/maltrieve.png?label=in%20progress&title=In%20Progress)](https://waffle.io/krmaxwell/maltrieve) [![Circle CI](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev.svg?style=svg)](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev) [![Coverage Status](https://coveralls.io/repos/krmaxwell/maltrieve/badge.svg?branch=dev)](https://coveralls.io/r/krmaxwell/maltrieve?branch=dev) +[![Code Health](https://landscape.io/github/krmaxwell/maltrieve/dev/landscape.svg?style=flat)](https://landscape.io/github/krmaxwell/maltrieve/dev) + ``` _______ _______ _______ ______ _____ _______ _ _ _______ | | | |_____| | | |_____/ | |______ \ / |______ From 8ff176aec9af67e5050d74be2d508a08245290fc Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 21:41:48 -0500 Subject: [PATCH 66/97] How many times am I gonna have to fix this --- maltrieve.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index c8a65f4..98aa2d8 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -29,12 +29,13 @@ import resource import sys import tempfile +from urlparse import urlparse + import feedparser import grequests import magic import requests from bs4 import BeautifulSoup -from urlparse import urlparse class config: @@ -285,7 +286,7 @@ def main(): hashes = set() past_urls = set() - args = setup_args(sys.argv) + args = setup_args(sys.argv[1:]) cfg = config(args, 'maltrieve.cfg') if cfg.proxy: From 8ce615d696b8a9790d848d67d7019c50a1e36ae5 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 21:51:25 -0500 Subject: [PATCH 67/97] Fix error with out-of-scope variable --- maltrieve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index a4e192f..009700d 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -159,7 +159,7 @@ def upload_crits(response, md5, cfg): else: logging.info("Submission of {url} failed: {code}".format(url=url, code=domain_response.status_code)) except: - logging.info("Exception caught from CRITs when submitting domain: {response}".format(code=domain_response)) + logging.info("Exception caught from CRITs when submitting domain") # Submit sample url = "{srv}/api/v1/samples/".format(srv=cfg.crits) @@ -222,7 +222,7 @@ def upload_vxcage(response, md5, cfg): url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{srv}/malware/add".format(cfg.vxcage) + url = "{srv}/malware/add".format(srv=cfg.vxcage) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies From 77fb3294b40e79379b4b7edd2809beb5c3c543fc Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 22:11:22 -0500 Subject: [PATCH 68/97] Fix exception logic --- .pre-commit-config.yaml | 2 +- maltrieve.py | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 73b9138..d87e290 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ - id: check-yaml - id: end-of-file-fixer - id: flake8 - args: [--max-line-length=140] + args: [--max-line-length=256] - id: trailing-whitespace - repo: git://github.com/ivanlei/pre-commit-python-sorter diff --git a/maltrieve.py b/maltrieve.py index 5f1b3b6..42a5795 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -154,8 +154,9 @@ def upload_crits(response, md5, cfg): if domain_response_data['return_code'] == 0: inserted_domain = True else: - logging.info("Submitted domain info for {md5} to CRITs, response was {data}".format(md5=md5, - data=domain_response_data)) + logging.info("Submitted domain info {dom} for {md5} to CRITs, response was {data}".format(dom=domain_data['domain'], + md5=md5, + data=domain_response_data)) else: logging.info("Submission of {url} failed: {code}".format(url=url, code=domain_response.status_code)) except: @@ -183,11 +184,15 @@ def upload_crits(response, md5, cfg): # pylint says "Instance of LookupDict has no 'ok' member" if sample_response.status_code == requests.codes.ok: sample_response_data = sample_response.json() - logging.info("Submitted sample info for {md5} to CRITs".format(md5=md5)) if sample_response_data['return_code'] == 0: inserted_sample = True + else: + logging.info("Submitted sample {md5} to CRITs, response was {data}".format(md5=md5, + data=sample_response_data)) + else: + logging.info("Submission of {md5} failed: {code}".format(md5=md5, code=sample_response.status_code)) except: - logging.info("Exception caught from CRITs when submitting sample: {code}".format(code=sample_response.status_code)) + logging.info("Exception caught from CRITs when submitting sample") # Create a relationship for the sample and domain url = "{srv}/api/v1/relationships/".format(srv=cfg.crits) @@ -208,13 +213,14 @@ def upload_crits(response, md5, cfg): # Note that this request does NOT go through proxies relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) # pylint says "Instance of LookupDict has no 'ok' member" - if relationship_response.status_code == requests.codes.ok: - logging.info("Submitted relationship info for {md5} to CRITs".format(md5=md5)) + if relationship_response.status_code != requests.codes.ok: + logging.info("Submitted relationship info for {md5} to CRITs, response was {data}".format(md5=md5, data=domain_response_data)) except: # TODO: need informative but still shorter message logging.info("Relationship submission skipped.") + return True else: - logging.info("Skipping adding relationship. CRITs could not process domain or sample.") + return False def upload_vxcage(response, md5, cfg): From 116fa0b6ba55631cb3da8dfaea9daa3768833904 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:30:05 -0500 Subject: [PATCH 69/97] Declare class properly --- maltrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 42a5795..71b6009 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -39,7 +39,7 @@ from bs4 import BeautifulSoup -class config: +class config(object): """ Class for holding global configuration setup """ From f5e4792a6a049c8487f12be2f58e4951af61727d Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:45:21 -0500 Subject: [PATCH 70/97] Push string interpolation to logger for aggregation efficiency --- maltrieve.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 71b6009..5033c2b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -97,19 +97,19 @@ def __init__(self, args, filename='maltrieve.cfg'): try: os.makedirs(self.dumpdir) except IOError: - logging.error('Could not create {dir}, using default'.format(dir=self.dumpdir)) + logging.error('Could not create %s, using default', self.dumpdir) self.dumpdir = '/tmp/malware' try: fd, temp_path = tempfile.mkstemp(dir=self.dumpdir) except IOError: - logging.error('Could not open {dir} for writing, using default'.format(dir=self.dumpdir)) + logging.error('Could not open %s for writing, using default', self.dumpdir) self.dumpdir = '/tmp/malware' else: os.close(fd) os.remove(temp_path) - logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + logging.info('Using %s as dump directory', self.dumpdir) self.logheaders = self.configp.get('Maltrieve', 'logheaders') # TODO: Merge these @@ -146,7 +146,7 @@ def upload_crits(response, md5, cfg): } try: # Note that this request does NOT go through proxies - logging.debug("Domain submission: {url}|{data}".format(url=url, data=domain_data)) + logging.debug("Domain submission: %s|%r", url, domain_data) domain_response = requests.post(url, headers=headers, data=domain_data) # pylint says "Instance of LookupDict has no 'ok' member" if domain_response.status_code == requests.codes.ok: @@ -154,11 +154,10 @@ def upload_crits(response, md5, cfg): if domain_response_data['return_code'] == 0: inserted_domain = True else: - logging.info("Submitted domain info {dom} for {md5} to CRITs, response was {data}".format(dom=domain_data['domain'], - md5=md5, - data=domain_response_data)) + logging.info("Submitted domain info %s for %s to CRITs, response was %s", + domain_data['domain'], md5, domain_response_data) else: - logging.info("Submission of {url} failed: {code}".format(url=url, code=domain_response.status_code)) + logging.info("Submission of %s failed: %d", url, domain_response.status_code) except: logging.info("Exception caught from CRITs when submitting domain") @@ -187,10 +186,9 @@ def upload_crits(response, md5, cfg): if sample_response_data['return_code'] == 0: inserted_sample = True else: - logging.info("Submitted sample {md5} to CRITs, response was {data}".format(md5=md5, - data=sample_response_data)) + logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) else: - logging.info("Submission of {md5} failed: {code}".format(md5=md5, code=sample_response.status_code)) + logging.info("Submission of %s failed: %d}", md5, sample_response.status_code) except: logging.info("Exception caught from CRITs when submitting sample") @@ -214,7 +212,8 @@ def upload_crits(response, md5, cfg): relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) # pylint says "Instance of LookupDict has no 'ok' member" if relationship_response.status_code != requests.codes.ok: - logging.info("Submitted relationship info for {md5} to CRITs, response was {data}".format(md5=md5, data=domain_response_data)) + logging.info("Submitted relationship info for %s to CRITs, response was %r", + md5, domain_response_data) except: # TODO: need informative but still shorter message logging.info("Relationship submission skipped.") @@ -234,7 +233,7 @@ def upload_vxcage(response, md5, cfg): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to VxCage, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to VxCage, response was %d", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to VxCage, will attempt local storage") return False @@ -251,7 +250,7 @@ def upload_cuckoo(response, md5, cfg): try: response = requests.post(url, headers=headers, data=data) response_data = response.json() - logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) + logging.info("Submitted %s to Cuckoo, task ID %d", md5, response_data["task_id"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Cuckoo, will attempt local storage") return False @@ -270,7 +269,7 @@ def upload_viper(response, md5, cfg): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to Viper, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to Viper, response was %s", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Viper, will attempt local storage") return False @@ -283,18 +282,18 @@ def save_malware(response, cfg): data = response.content mime_type = magic.from_buffer(data, mime=True) if mime_type in cfg.black_list: - logging.info('{mtype} in ignore list for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s in ignore list for %s', mime_type, url) return if cfg.white_list: if mime_type in cfg.white_list: pass else: - logging.info('{mtype} not in whitelist for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s not in whitelist for %s', mime_type, url) return # Hash and log md5 = hashlib.md5(data).hexdigest() - logging.info("{url} hashes to {md5}".format(url=url, md5=md5)) + logging.info("%s hashes to %s", url, md5) # Assume that external repo means we don't need to write to file as well. stored = False @@ -321,7 +320,7 @@ def save_malware(response, cfg): store_path = os.path.join(cfg.dumpdir, md5) with open(store_path, 'wb') as f: f.write(data) - logging.info("Saved {md5} to dump dir".format(md5=md5)) + logging.info("Saved %s to dump dir", md5) return True @@ -402,9 +401,9 @@ def main(): cfg = config(args, 'maltrieve.cfg') if cfg.proxy: - logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) + logging.info('Using proxy %s', cfg.proxy) my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg.proxy).text - logging.info('External sites see {ip}'.format(ip=my_ip)) + logging.info('External sites see %s', my_ip) print 'External sites see {ip}'.format(ip=my_ip) if os.path.exists('hashes.json'): From efa48e7b4b73bb4af885f6d61f08bd836541916d Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:54:28 -0500 Subject: [PATCH 71/97] Declare exceptions in upload_crits() --- maltrieve.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 5033c2b..4251f9b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -158,8 +158,12 @@ def upload_crits(response, md5, cfg): domain_data['domain'], md5, domain_response_data) else: logging.info("Submission of %s failed: %d", url, domain_response.status_code) - except: - logging.info("Exception caught from CRITs when submitting domain") + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting domain %s", domain_data['domain']) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting domain %s", domain_data['domain']) + except requests.HTTPError: + logging.info("HTTP error when submitting domain %s to CRITs", domain_data['domain']) # Submit sample url = "{srv}/api/v1/samples/".format(srv=cfg.crits) @@ -188,9 +192,13 @@ def upload_crits(response, md5, cfg): else: logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) else: - logging.info("Submission of %s failed: %d}", md5, sample_response.status_code) - except: - logging.info("Exception caught from CRITs when submitting sample") + logging.info("Submission of sample %s failed: %d}", md5, sample_response.status_code) + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting sample %s to CRITs", md5) # Create a relationship for the sample and domain url = "{srv}/api/v1/relationships/".format(srv=cfg.crits) @@ -214,10 +222,13 @@ def upload_crits(response, md5, cfg): if relationship_response.status_code != requests.codes.ok: logging.info("Submitted relationship info for %s to CRITs, response was %r", md5, domain_response_data) - except: - # TODO: need informative but still shorter message - logging.info("Relationship submission skipped.") - return True + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting relationship for sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting relationship for sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting relationship for sample %s to CRITs", md5) + return True else: return False From 7954163ffb86a8996fd64efc35d7c16665eb63b3 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:56:05 -0500 Subject: [PATCH 72/97] Fix version requirements --- setup.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 5911a1e..aaafd38 100644 --- a/setup.py +++ b/setup.py @@ -2,21 +2,21 @@ from distutils.core import setup setup(name='maltrieve', - version='0.6', + version='0.7', description="A tool to retrieve malware directly from the source for security researchers.", author='Kyle Maxwell', author_email='krmaxwell@gmail.com', url='http://maltrieve.org', install_requires=[ - 'argparse==1.2.1', - 'beautifulsoup4==4.3.2', - 'feedparser==5.1.3', - 'gevent==1.0.1', - 'greenlet==0.4.2', - 'grequests==0.2.0', - 'python-magic==0.4.6', - 'requests==2.3.0', - 'wsgiref==0.1.2', + 'argparse>=1.2.1', + 'beautifulsoup4>=4.3.2', + 'feedparser>=5.1.3', + 'gevent>=1.0.1', + 'greenlet>=0.4.2', + 'grequests>=0.2.0', + 'python-magic>=0.4.6', + 'requests>=2.3.0', + 'wsgiref>=0.1.2', 'pre-commit', 'pytest', 'pytest-cov', From 1b934a0c23c01bb7a40e8e1982c20b9152f4208b Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:30:05 -0500 Subject: [PATCH 73/97] Declare class properly --- maltrieve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maltrieve.py b/maltrieve.py index 42a5795..71b6009 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -39,7 +39,7 @@ from bs4 import BeautifulSoup -class config: +class config(object): """ Class for holding global configuration setup """ From 51f978ed1f8ed42ee7d9e306ab5586d1f2d48fd2 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:45:21 -0500 Subject: [PATCH 74/97] Push string interpolation to logger for aggregation efficiency --- maltrieve.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 71b6009..5033c2b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -97,19 +97,19 @@ def __init__(self, args, filename='maltrieve.cfg'): try: os.makedirs(self.dumpdir) except IOError: - logging.error('Could not create {dir}, using default'.format(dir=self.dumpdir)) + logging.error('Could not create %s, using default', self.dumpdir) self.dumpdir = '/tmp/malware' try: fd, temp_path = tempfile.mkstemp(dir=self.dumpdir) except IOError: - logging.error('Could not open {dir} for writing, using default'.format(dir=self.dumpdir)) + logging.error('Could not open %s for writing, using default', self.dumpdir) self.dumpdir = '/tmp/malware' else: os.close(fd) os.remove(temp_path) - logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + logging.info('Using %s as dump directory', self.dumpdir) self.logheaders = self.configp.get('Maltrieve', 'logheaders') # TODO: Merge these @@ -146,7 +146,7 @@ def upload_crits(response, md5, cfg): } try: # Note that this request does NOT go through proxies - logging.debug("Domain submission: {url}|{data}".format(url=url, data=domain_data)) + logging.debug("Domain submission: %s|%r", url, domain_data) domain_response = requests.post(url, headers=headers, data=domain_data) # pylint says "Instance of LookupDict has no 'ok' member" if domain_response.status_code == requests.codes.ok: @@ -154,11 +154,10 @@ def upload_crits(response, md5, cfg): if domain_response_data['return_code'] == 0: inserted_domain = True else: - logging.info("Submitted domain info {dom} for {md5} to CRITs, response was {data}".format(dom=domain_data['domain'], - md5=md5, - data=domain_response_data)) + logging.info("Submitted domain info %s for %s to CRITs, response was %s", + domain_data['domain'], md5, domain_response_data) else: - logging.info("Submission of {url} failed: {code}".format(url=url, code=domain_response.status_code)) + logging.info("Submission of %s failed: %d", url, domain_response.status_code) except: logging.info("Exception caught from CRITs when submitting domain") @@ -187,10 +186,9 @@ def upload_crits(response, md5, cfg): if sample_response_data['return_code'] == 0: inserted_sample = True else: - logging.info("Submitted sample {md5} to CRITs, response was {data}".format(md5=md5, - data=sample_response_data)) + logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) else: - logging.info("Submission of {md5} failed: {code}".format(md5=md5, code=sample_response.status_code)) + logging.info("Submission of %s failed: %d}", md5, sample_response.status_code) except: logging.info("Exception caught from CRITs when submitting sample") @@ -214,7 +212,8 @@ def upload_crits(response, md5, cfg): relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) # pylint says "Instance of LookupDict has no 'ok' member" if relationship_response.status_code != requests.codes.ok: - logging.info("Submitted relationship info for {md5} to CRITs, response was {data}".format(md5=md5, data=domain_response_data)) + logging.info("Submitted relationship info for %s to CRITs, response was %r", + md5, domain_response_data) except: # TODO: need informative but still shorter message logging.info("Relationship submission skipped.") @@ -234,7 +233,7 @@ def upload_vxcage(response, md5, cfg): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to VxCage, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to VxCage, response was %d", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to VxCage, will attempt local storage") return False @@ -251,7 +250,7 @@ def upload_cuckoo(response, md5, cfg): try: response = requests.post(url, headers=headers, data=data) response_data = response.json() - logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) + logging.info("Submitted %s to Cuckoo, task ID %d", md5, response_data["task_id"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Cuckoo, will attempt local storage") return False @@ -270,7 +269,7 @@ def upload_viper(response, md5, cfg): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to Viper, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to Viper, response was %s", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Viper, will attempt local storage") return False @@ -283,18 +282,18 @@ def save_malware(response, cfg): data = response.content mime_type = magic.from_buffer(data, mime=True) if mime_type in cfg.black_list: - logging.info('{mtype} in ignore list for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s in ignore list for %s', mime_type, url) return if cfg.white_list: if mime_type in cfg.white_list: pass else: - logging.info('{mtype} not in whitelist for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s not in whitelist for %s', mime_type, url) return # Hash and log md5 = hashlib.md5(data).hexdigest() - logging.info("{url} hashes to {md5}".format(url=url, md5=md5)) + logging.info("%s hashes to %s", url, md5) # Assume that external repo means we don't need to write to file as well. stored = False @@ -321,7 +320,7 @@ def save_malware(response, cfg): store_path = os.path.join(cfg.dumpdir, md5) with open(store_path, 'wb') as f: f.write(data) - logging.info("Saved {md5} to dump dir".format(md5=md5)) + logging.info("Saved %s to dump dir", md5) return True @@ -402,9 +401,9 @@ def main(): cfg = config(args, 'maltrieve.cfg') if cfg.proxy: - logging.info('Using proxy {proxy}'.format(proxy=cfg.proxy)) + logging.info('Using proxy %s', cfg.proxy) my_ip = requests.get('http://ipinfo.io/ip', proxies=cfg.proxy).text - logging.info('External sites see {ip}'.format(ip=my_ip)) + logging.info('External sites see %s', my_ip) print 'External sites see {ip}'.format(ip=my_ip) if os.path.exists('hashes.json'): From f5bc534198e8e8859cd0ee658d556f698c460744 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:54:28 -0500 Subject: [PATCH 75/97] Declare exceptions in upload_crits() --- maltrieve.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 5033c2b..4251f9b 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -158,8 +158,12 @@ def upload_crits(response, md5, cfg): domain_data['domain'], md5, domain_response_data) else: logging.info("Submission of %s failed: %d", url, domain_response.status_code) - except: - logging.info("Exception caught from CRITs when submitting domain") + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting domain %s", domain_data['domain']) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting domain %s", domain_data['domain']) + except requests.HTTPError: + logging.info("HTTP error when submitting domain %s to CRITs", domain_data['domain']) # Submit sample url = "{srv}/api/v1/samples/".format(srv=cfg.crits) @@ -188,9 +192,13 @@ def upload_crits(response, md5, cfg): else: logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) else: - logging.info("Submission of %s failed: %d}", md5, sample_response.status_code) - except: - logging.info("Exception caught from CRITs when submitting sample") + logging.info("Submission of sample %s failed: %d}", md5, sample_response.status_code) + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting sample %s to CRITs", md5) # Create a relationship for the sample and domain url = "{srv}/api/v1/relationships/".format(srv=cfg.crits) @@ -214,10 +222,13 @@ def upload_crits(response, md5, cfg): if relationship_response.status_code != requests.codes.ok: logging.info("Submitted relationship info for %s to CRITs, response was %r", md5, domain_response_data) - except: - # TODO: need informative but still shorter message - logging.info("Relationship submission skipped.") - return True + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting relationship for sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting relationship for sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting relationship for sample %s to CRITs", md5) + return True else: return False From be30f0620cc1d4c5b29027bc808a1d3985beb2ee Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Tue, 31 Mar 2015 23:56:05 -0500 Subject: [PATCH 76/97] Fix version requirements --- setup.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 5911a1e..aaafd38 100644 --- a/setup.py +++ b/setup.py @@ -2,21 +2,21 @@ from distutils.core import setup setup(name='maltrieve', - version='0.6', + version='0.7', description="A tool to retrieve malware directly from the source for security researchers.", author='Kyle Maxwell', author_email='krmaxwell@gmail.com', url='http://maltrieve.org', install_requires=[ - 'argparse==1.2.1', - 'beautifulsoup4==4.3.2', - 'feedparser==5.1.3', - 'gevent==1.0.1', - 'greenlet==0.4.2', - 'grequests==0.2.0', - 'python-magic==0.4.6', - 'requests==2.3.0', - 'wsgiref==0.1.2', + 'argparse>=1.2.1', + 'beautifulsoup4>=4.3.2', + 'feedparser>=5.1.3', + 'gevent>=1.0.1', + 'greenlet>=0.4.2', + 'grequests>=0.2.0', + 'python-magic>=0.4.6', + 'requests>=2.3.0', + 'wsgiref>=0.1.2', 'pre-commit', 'pytest', 'pytest-cov', From 800a8f18314b7b71134b306e44b768f3bf963b63 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:07:35 -0500 Subject: [PATCH 77/97] Check for crits options before grabbing them --- maltrieve.py | 11 ++++++---- screenlog.0 | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 screenlog.0 diff --git a/maltrieve.py b/maltrieve.py index 4251f9b..8da8ec6 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -118,10 +118,13 @@ def __init__(self, args, filename='maltrieve.cfg'): self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') # CRITs - self.crits = args.crits or self.configp.get('Maltrieve', 'crits') - self.crits_user = self.configp.get('Maltrieve', 'crits_user') - self.crits_key = self.configp.get('Maltrieve', 'crits_key') - self.crits_source = self.configp.get('Maltrieve', 'crits_source') + if args.crits or self.configp.has_option('Maltrieve', 'crits'): + self.crits = args.crits or self.configp.get('Maltrieve', 'crits') + self.crits_user = self.configp.get('Maltrieve', 'crits_user') + self.crits_key = self.configp.get('Maltrieve', 'crits_key') + self.crits_source = self.configp.get('Maltrieve', 'crits_source') + else: + self.crits = False def upload_crits(response, md5, cfg): diff --git a/screenlog.0 b/screenlog.0 new file mode 100644 index 0000000..2dd5131 --- /dev/null +++ b/screenlog.0 @@ -0,0 +1,59 @@ +kmaxwell@newton:~/src/maltrieve(webstergd-master)$ activate +(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ pippip install -e . +Obtaining file:///home/kmaxwell/src/maltrieve + Running setup.py (path:/home/kmaxwell/src/maltrieve/setup.py) egg_info for package from file:///home/kmaxwell/src/maltrieve + +Requirement already satisfied (use --upgrade to upgrade): argparse==1.2.1 in /usr/lib/python2.7 (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): beautifulsoup4==4.3.2 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): feedparser==5.1.3 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): gevent==1.0.1 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): greenlet==0.4.2 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): grequests==0.2.0 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): python-magic==0.4.6 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): requests==2.3.0 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): wsgiref==0.1.2 in /usr/lib/python2.7 (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): pre-commit in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): pytest in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): pytest-cov in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): coveralls in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): aspy.yaml in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): cached-property in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): jsonschema in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): nodeenv>=0.11.1 in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): ordereddict in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): pyyaml in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): simplejson in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): virtualenv in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): py>=1.4.25 in ./venv/lib/python2.7/site-packages (from pytest->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): coverage>=3.7.1,<4.0a1 in ./venv/lib/python2.7/site-packages (from pytest-cov->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): cov-core>=1.14.0 in ./venv/lib/python2.7/site-packages (from pytest-cov->maltrieve==0.6) +Requirement already satisfied (use --upgrade to upgrade): docopt>=0.6.1 in ./venv/lib/python2.7/site-packages (from coveralls->maltrieve==0.6) +Installing collected packages: maltrieve + Running setup.py develop for maltrieve + + Creating /home/kmaxwell/src/maltrieve/venv/lib/python2.7/site-packages/maltrieve.egg-link (link to .) + maltrieve 0.6 is already the active version in easy-install.pth + Installing maltrieve script to /home/kmaxwell/src/maltrieve/venv/bin + + Installed /home/kmaxwell/src/maltrieve +Successfully installed maltrieve +Cleaning up... +(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ mpymaltrieve +Traceback (most recent call last): + File "/home/kmaxwell/src/maltrieve/venv/bin/maltrieve", line 9, in + load_entry_point('maltrieve==0.6', 'console_scripts', 'maltrieve')() + File "/home/kmaxwell/src/maltrieve/maltrieve.py", line 402, in main + cfg = config(args, 'maltrieve.cfg') + File "/home/kmaxwell/src/maltrieve/maltrieve.py", line 121, in __init__ + self.crits = args.crits or self.configp.get('Maltrieve', 'crits') + File "/usr/lib/python2.7/ConfigParser.py", line 618, in get + raise NoOptionError(option, section) +ConfigParser.NoOptionError: No option 'crits' in section: 'Maltrieve' +(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ ls *.cfg +maltrieve.cfg maltrieve-test.cfg +(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ ls *.cfgmaltrieve +Processing source URLs +Completed source processing +Downloading samples, check log for details +Completed downloads +(venv)kmaxwell@newton:~/src/maltrieve(cleanup)$ \ No newline at end of file From f8705de6b5c3de33f26699810d7b8fa3a741bb97 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:13:25 -0500 Subject: [PATCH 78/97] oopsie --- screenlog.0 | 59 ----------------------------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 screenlog.0 diff --git a/screenlog.0 b/screenlog.0 deleted file mode 100644 index 2dd5131..0000000 --- a/screenlog.0 +++ /dev/null @@ -1,59 +0,0 @@ -kmaxwell@newton:~/src/maltrieve(webstergd-master)$ activate -(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ pippip install -e . -Obtaining file:///home/kmaxwell/src/maltrieve - Running setup.py (path:/home/kmaxwell/src/maltrieve/setup.py) egg_info for package from file:///home/kmaxwell/src/maltrieve - -Requirement already satisfied (use --upgrade to upgrade): argparse==1.2.1 in /usr/lib/python2.7 (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): beautifulsoup4==4.3.2 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): feedparser==5.1.3 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): gevent==1.0.1 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): greenlet==0.4.2 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): grequests==0.2.0 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): python-magic==0.4.6 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): requests==2.3.0 in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): wsgiref==0.1.2 in /usr/lib/python2.7 (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): pre-commit in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): pytest in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): pytest-cov in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): coveralls in ./venv/lib/python2.7/site-packages (from maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): aspy.yaml in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): cached-property in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): jsonschema in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): nodeenv>=0.11.1 in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): ordereddict in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): pyyaml in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): simplejson in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): virtualenv in ./venv/lib/python2.7/site-packages (from pre-commit->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): py>=1.4.25 in ./venv/lib/python2.7/site-packages (from pytest->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): coverage>=3.7.1,<4.0a1 in ./venv/lib/python2.7/site-packages (from pytest-cov->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): cov-core>=1.14.0 in ./venv/lib/python2.7/site-packages (from pytest-cov->maltrieve==0.6) -Requirement already satisfied (use --upgrade to upgrade): docopt>=0.6.1 in ./venv/lib/python2.7/site-packages (from coveralls->maltrieve==0.6) -Installing collected packages: maltrieve - Running setup.py develop for maltrieve - - Creating /home/kmaxwell/src/maltrieve/venv/lib/python2.7/site-packages/maltrieve.egg-link (link to .) - maltrieve 0.6 is already the active version in easy-install.pth - Installing maltrieve script to /home/kmaxwell/src/maltrieve/venv/bin - - Installed /home/kmaxwell/src/maltrieve -Successfully installed maltrieve -Cleaning up... -(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ mpymaltrieve -Traceback (most recent call last): - File "/home/kmaxwell/src/maltrieve/venv/bin/maltrieve", line 9, in - load_entry_point('maltrieve==0.6', 'console_scripts', 'maltrieve')() - File "/home/kmaxwell/src/maltrieve/maltrieve.py", line 402, in main - cfg = config(args, 'maltrieve.cfg') - File "/home/kmaxwell/src/maltrieve/maltrieve.py", line 121, in __init__ - self.crits = args.crits or self.configp.get('Maltrieve', 'crits') - File "/usr/lib/python2.7/ConfigParser.py", line 618, in get - raise NoOptionError(option, section) -ConfigParser.NoOptionError: No option 'crits' in section: 'Maltrieve' -(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ ls *.cfg -maltrieve.cfg maltrieve-test.cfg -(venv)kmaxwell@newton:~/src/maltrieve(webstergd-master)$ ls *.cfgmaltrieve -Processing source URLs -Completed source processing -Downloading samples, check log for details -Completed downloads -(venv)kmaxwell@newton:~/src/maltrieve(cleanup)$ \ No newline at end of file From 0e54bc69198e9c97f73f4314c0359ba5ce31e2f3 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:14:10 -0500 Subject: [PATCH 79/97] Ignore logs from screen --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6156808..fe6fae2 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ archive grequests *.bak *.json +screenlog* From 7a8b889f197127db6c4658d60f301bb825d938c5 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:20:47 -0500 Subject: [PATCH 80/97] Allow specification of input file --- maltrieve.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/maltrieve.py b/maltrieve.py index 8da8ec6..97d4f6d 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -84,6 +84,9 @@ def __init__(self, args, filename='maltrieve.cfg'): else: self.white_list = False + if args.inputfile: + self.inputfile = args.inputfile + # make sure we can open the directory for writing if args.dumpdir: self.dumpdir = args.dumpdir @@ -387,6 +390,7 @@ def setup_args(args): help="Define HTTP proxy as address:port") parser.add_argument("-d", "--dumpdir", help="Define dump directory for retrieved files") + parser.add_argument("-i", "--inputfile", help="File of URLs to process") parser.add_argument("-l", "--logfile", help="Define file for logging progress") parser.add_argument("-r", "--crits", From 590aa8fe83d6edbec38e37a410b8edd4347a6a7a Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:26:35 -0500 Subject: [PATCH 81/97] Process input file --- maltrieve.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/maltrieve.py b/maltrieve.py index 97d4f6d..f496e61 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -464,6 +464,11 @@ def main(): if hasattr(response, 'status_code') and response.status_code == 200: malware_urls.update(source_urls[response.url](response.text)) + if cfg.inputfile: + with open(cfg.inputfile, 'rb') as f: + moar_urls = list(f) + malware_urls.update(moar_urls) + print "Downloading samples, check log for details" malware_urls -= past_urls From 9fa337d2c88c0efb6a4b7eac020bc0d6374479a2 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Wed, 1 Apr 2015 00:37:12 -0500 Subject: [PATCH 82/97] Push string interpolation to logging in config.check_proxy() --- maltrieve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 82aeb79..30d8f45 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -127,9 +127,9 @@ def __init__(self, args, filename='maltrieve.cfg'): def check_proxy(self): if self.proxy: - logging.info('Using proxy {proxy}'.format(proxy=self.proxy)) + logging.info('Using proxy %s', self.proxy) my_ip = requests.get('http://ipinfo.io/ip', proxies=self.proxy).text - logging.info('External sites see {ip}'.format(ip=my_ip)) + logging.info('External sites see %s', my_ip) print 'External sites see {ip}'.format(ip=my_ip) From 59f6b19b889529fb523522de06dc030bd4d51119 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 19:34:46 -0500 Subject: [PATCH 83/97] Allow user to specify alternate configuration file --- maltrieve-test.cfg | 5 ++++- maltrieve.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/maltrieve-test.cfg b/maltrieve-test.cfg index add59cf..f4703df 100644 --- a/maltrieve-test.cfg +++ b/maltrieve-test.cfg @@ -7,7 +7,10 @@ User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 #vxcage = http://127.0.0.1:8080 - +#crits = https://127.0.0.1 +#crits_user = maltrieve +#crits_key = +#crits_source = maltrieve # Filter Lists are based on mime type NO SPACE BETWEEN , #black_list = text/html,text/plain diff --git a/maltrieve.py b/maltrieve.py index 30d8f45..719ceab 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -408,6 +408,7 @@ def setup_args(args): help="Enable Cuckoo analysis", action="store_true", default=False) parser.add_argument("-s", "--sort_mime", help="Sort files by MIME type", action="store_true", default=False) + parser.add_argument("--config", help="Alternate config file (default maltrieve.cfg)") return parser.parse_args(args) @@ -453,7 +454,10 @@ def main(): past_urls = set() args = setup_args(sys.argv[1:]) - cfg = config(args, 'maltrieve.cfg') + if args.config: + cfg = config(args, args.config) + else: + cfg = config(args, 'maltrieve.cfg') cfg.check_proxy() hashes = load_hashes('hashes.json') From f9452ac8274872938a245d1b424056ae085b377c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 19:53:31 -0500 Subject: [PATCH 84/97] Test config file stuff --- maltrieve-test.cfg | 11 ++++++----- test.py | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/maltrieve-test.cfg b/maltrieve-test.cfg index f4703df..e3fe5cf 100644 --- a/maltrieve-test.cfg +++ b/maltrieve-test.cfg @@ -1,8 +1,9 @@ [Maltrieve] -dumpdir = archive -logfile = maltrieve.log +dumpdir = archive-test +logfile = maltrieve-test.log logheaders = true -User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) +User-Agent = Test-Agent +proxy = 127.0.0.1:3128 #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 @@ -13,5 +14,5 @@ User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) #crits_source = maltrieve # Filter Lists are based on mime type NO SPACE BETWEEN , -#black_list = text/html,text/plain -#white_list = application/pdf,application/x-dosexec +black_list = text/html,text/plain +white_list = application/pdf,application/x-dosexec diff --git a/test.py b/test.py index 042d04a..be5c7cd 100644 --- a/test.py +++ b/test.py @@ -17,6 +17,33 @@ def test_saving_args(): assert args.sort_mime +def test_read_alt_config(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + assert args.config == "maltrieve-test.cfg" + + +def test_config_args(): + args = maltrieve.setup_args(['-l', 'testlog', '-p', '127.0.0.1:8080', '-d', '/tmp/mwtest']) + cfg = maltrieve.config(args, 'maltrieve-test.cfg') + assert cfg.logfile == 'testlog' + test_proxy = {'http': '127.0.0.1:8080'} + assert cmp(cfg.proxy, test_proxy) == 0 + assert cfg.dumpdir == '/tmp/mwtest' + + +def test_alt_config(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + cfg = maltrieve.config(args, args.config) + assert cfg.dumpdir == 'archive-test' + assert cfg.logfile == 'maltrieve-test.log' + test_ua = {'User-Agent': 'Test-Agent'} + assert cmp(cfg.useragent, test_ua) == 0 + test_proxy = {'http': '127.0.0.1:3128'} + assert cmp(cfg.proxy, test_proxy) == 0 + assert cfg.black_list == ['text/html', 'text/plain'] + assert cfg.white_list == ['application/pdf', 'application/x-dosexec'] + + def test_parse_simple_list(): source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text assert maltrieve.process_simple_list(source) == \ From ab26fc046afee83d41dd6231f738c30de75f3c85 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 19:57:50 -0500 Subject: [PATCH 85/97] Test default dump directory --- maltrieve.py | 4 ++-- test.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 719ceab..80efed3 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -95,13 +95,13 @@ def __init__(self, args, filename='maltrieve.cfg'): if not os.path.exists(self.dumpdir): try: os.makedirs(self.dumpdir) - except IOError: + except OSError: logging.error('Could not create %s, using default', self.dumpdir) self.dumpdir = '/tmp/malware' try: fd, temp_path = tempfile.mkstemp(dir=self.dumpdir) - except IOError: + except OSError: logging.error('Could not open %s for writing, using default', self.dumpdir) self.dumpdir = '/tmp/malware' else: diff --git a/test.py b/test.py index be5c7cd..de5b356 100644 --- a/test.py +++ b/test.py @@ -44,6 +44,12 @@ def test_alt_config(): assert cfg.white_list == ['application/pdf', 'application/x-dosexec'] +def test_create_default_dumpdir(): + args = maltrieve.setup_args(['-d', '/']) + cfg = maltrieve.config(args, 'maltrieve-test.cfg') + assert cfg.dumpdir == '/tmp/malware' + + def test_parse_simple_list(): source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text assert maltrieve.process_simple_list(source) == \ From 6347e70eed0a26ef0259febb1b21792a4bc8b0b0 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:06:21 -0500 Subject: [PATCH 86/97] Specify correct submodule for requests exceptions --- maltrieve.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 80efed3..1c32ca7 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -167,11 +167,11 @@ def upload_crits(response, md5, cfg): domain_data['domain'], md5, domain_response_data) else: logging.info("Submission of %s failed: %d", url, domain_response.status_code) - except requests.ConnectionError: + except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting domain %s", domain_data['domain']) - except requests.ConnectTimeout: + except requests.exceptions.ConnectTimeout: logging.info("Timed out connecting to CRITs when submitting domain %s", domain_data['domain']) - except requests.HTTPError: + except requests.exceptions.HTTPError: logging.info("HTTP error when submitting domain %s to CRITs", domain_data['domain']) # Submit sample @@ -202,11 +202,11 @@ def upload_crits(response, md5, cfg): logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) else: logging.info("Submission of sample %s failed: %d}", md5, sample_response.status_code) - except requests.ConnectionError: + except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting sample %s", md5) - except requests.ConnectTimeout: + except requests.exceptions.ConnectTimeout: logging.info("Timed out connecting to CRITs when submitting sample %s", md5) - except requests.HTTPError: + except requests.exceptions.HTTPError: logging.info("HTTP error when submitting sample %s to CRITs", md5) # Create a relationship for the sample and domain @@ -231,11 +231,11 @@ def upload_crits(response, md5, cfg): if relationship_response.status_code != requests.codes.ok: logging.info("Submitted relationship info for %s to CRITs, response was %r", md5, domain_response_data) - except requests.ConnectionError: + except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting relationship for sample %s", md5) - except requests.ConnectTimeout: + except requests.exceptions.ConnectTimeout: logging.info("Timed out connecting to CRITs when submitting relationship for sample %s", md5) - except requests.HTTPError: + except requests.exceptions.HTTPError: logging.info("HTTP error when submitting relationship for sample %s to CRITs", md5) return True else: From d393d176a547681581beb726ced7a41100a46dc4 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:07:31 -0500 Subject: [PATCH 87/97] Test crits config and more dumpdir tests --- maltrieve-test.cfg | 8 ++++---- test.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/maltrieve-test.cfg b/maltrieve-test.cfg index e3fe5cf..11bea5d 100644 --- a/maltrieve-test.cfg +++ b/maltrieve-test.cfg @@ -8,10 +8,10 @@ proxy = 127.0.0.1:3128 #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 #vxcage = http://127.0.0.1:8080 -#crits = https://127.0.0.1 -#crits_user = maltrieve -#crits_key = -#crits_source = maltrieve +crits = http://127.0.0.1:8080 +crits_user = maltrieve +crits_key = YOUR_API_KEY_HERE +crits_source = maltrieve # Filter Lists are based on mime type NO SPACE BETWEEN , black_list = text/html,text/plain diff --git a/test.py b/test.py index de5b356..ad5fa5e 100644 --- a/test.py +++ b/test.py @@ -42,6 +42,10 @@ def test_alt_config(): assert cmp(cfg.proxy, test_proxy) == 0 assert cfg.black_list == ['text/html', 'text/plain'] assert cfg.white_list == ['application/pdf', 'application/x-dosexec'] + assert cfg.crits == 'http://127.0.0.1:8080' + assert cfg.crits_user == 'maltrieve' + assert cfg.crits_key == 'YOUR_API_KEY_HERE' + assert cfg.crits_source == 'maltrieve' def test_create_default_dumpdir(): @@ -50,6 +54,12 @@ def test_create_default_dumpdir(): assert cfg.dumpdir == '/tmp/malware' +def test_create_default_dumpdir_when_specified_doesnt_exist(): + args = maltrieve.setup_args(['-d', '/_nope_']) + cfg = maltrieve.config(args, 'maltrieve-test.cfg') + assert cfg.dumpdir == '/tmp/malware' + + def test_parse_simple_list(): source = requests.get('http://xwell.org/assets/maltrieve-test.txt').text assert maltrieve.process_simple_list(source) == \ From 16c2a652f34ee9b9ab8c56e177f0b3201777c435 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:27:09 -0500 Subject: [PATCH 88/97] Test local file saving --- maltrieve.py | 10 ++++------ test.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 1c32ca7..0786f39 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -303,13 +303,13 @@ def save_malware(response, cfg): mime_type = magic.from_buffer(data, mime=True) if mime_type in cfg.black_list: logging.info('%s in ignore list for %s', mime_type, url) - return + return False if cfg.white_list: if mime_type in cfg.white_list: pass else: logging.info('%s not in whitelist for %s', mime_type, url) - return + return False # Hash and log md5 = hashlib.md5(data).hexdigest() @@ -495,10 +495,8 @@ def main(): for each in malware_downloads: if not each or each.status_code != 200: continue - md5 = save_malware(each, cfg) - if not md5: - continue - past_urls.add(each.url) + if save_malware(each, cfg): + past_urls.add(each.url) print "Completed downloads" diff --git a/test.py b/test.py index ad5fa5e..6926ef5 100644 --- a/test.py +++ b/test.py @@ -1,3 +1,5 @@ +import os + import maltrieve import requests @@ -98,3 +100,25 @@ def test_save_urls(): urls = set(['http://example.com/badurl']) maltrieve.save_urls(urls, 'test-save-urls.json') test_load_urls('test-save-urls.json') + + +def test_save_blacklist(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + cfg = maltrieve.config(args, args.config) + r = requests.get('http://xwell.org') + assert maltrieve.save_malware(r, cfg) is False + + +def test_save_whitelist_fail(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + cfg = maltrieve.config(args, args.config) + r = requests.get('http://xwell.org/assets/images/dodecahedron.png') + assert maltrieve.save_malware(r, cfg) is False + + +def test_save_whitelist_passk(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + cfg = maltrieve.config(args, args.config) + r = requests.get('http://xwell.org/assets/docs/test.pdf') + assert maltrieve.save_malware(r, cfg) + assert os.access('archive-test/b9ff662486d448da7b60ba6234867c65', os.F_OK) From a0f5a53bc6bfa63cb0f298a27802e68358e0c6c4 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:31:53 -0500 Subject: [PATCH 89/97] Test sorting by MIME type --- test.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index 6926ef5..c62b6c8 100644 --- a/test.py +++ b/test.py @@ -116,9 +116,18 @@ def test_save_whitelist_fail(): assert maltrieve.save_malware(r, cfg) is False -def test_save_whitelist_passk(): +def test_save_whitelist_pass(): args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) cfg = maltrieve.config(args, args.config) r = requests.get('http://xwell.org/assets/docs/test.pdf') assert maltrieve.save_malware(r, cfg) assert os.access('archive-test/b9ff662486d448da7b60ba6234867c65', os.F_OK) + + +def test_sort_mime(): + args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) + cfg = maltrieve.config(args, args.config) + cfg.sort_mime = True + r = requests.get('http://xwell.org/assets/docs/test.pdf') + assert maltrieve.save_malware(r, cfg) + assert os.access('archive-test/application_pdf/b9ff662486d448da7b60ba6234867c65', os.F_OK) From bd1110b3a1266b25257be840cfdfde22de01645f Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:37:46 -0500 Subject: [PATCH 90/97] Test empty URLs file --- test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test.py b/test.py index c62b6c8..fcd852f 100644 --- a/test.py +++ b/test.py @@ -91,6 +91,11 @@ def test_save_hashes(): test_load_hashes('test-save-hashes.json') +def test_empty_urls(): + fname = 'maltrieve.py' + assert maltrieve.load_urls(fname) == set() + + def test_load_urls(urlfile='test-load-urls.json'): assert maltrieve.load_urls(urlfile) == \ set(['http://example.com/badurl']) From d3effcf396687ea25d03de5c1e65d47807ccc1c6 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:38:47 -0500 Subject: [PATCH 91/97] Remove error we can't reach since it's a child of the previous error --- maltrieve.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index 0786f39..f61ae54 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -169,8 +169,6 @@ def upload_crits(response, md5, cfg): logging.info("Submission of %s failed: %d", url, domain_response.status_code) except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting domain %s", domain_data['domain']) - except requests.exceptions.ConnectTimeout: - logging.info("Timed out connecting to CRITs when submitting domain %s", domain_data['domain']) except requests.exceptions.HTTPError: logging.info("HTTP error when submitting domain %s to CRITs", domain_data['domain']) @@ -204,8 +202,6 @@ def upload_crits(response, md5, cfg): logging.info("Submission of sample %s failed: %d}", md5, sample_response.status_code) except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting sample %s", md5) - except requests.exceptions.ConnectTimeout: - logging.info("Timed out connecting to CRITs when submitting sample %s", md5) except requests.exceptions.HTTPError: logging.info("HTTP error when submitting sample %s to CRITs", md5) @@ -233,8 +229,6 @@ def upload_crits(response, md5, cfg): md5, domain_response_data) except requests.exceptions.ConnectionError: logging.info("Could not connect to CRITs when submitting relationship for sample %s", md5) - except requests.exceptions.ConnectTimeout: - logging.info("Timed out connecting to CRITs when submitting relationship for sample %s", md5) except requests.exceptions.HTTPError: logging.info("HTTP error when submitting relationship for sample %s to CRITs", md5) return True From 231d8f3f959de968f68459c6ebebfbeedbe80e51 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:43:50 -0500 Subject: [PATCH 92/97] Clean up or ignore some linter warnings --- maltrieve.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/maltrieve.py b/maltrieve.py index f61ae54..9afcdff 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -157,7 +157,7 @@ def upload_crits(response, md5, cfg): # Note that this request does NOT go through proxies logging.debug("Domain submission: %s|%r", url, domain_data) domain_response = requests.post(url, headers=headers, data=domain_data) - # pylint says "Instance of LookupDict has no 'ok' member" + # pylint says "Instance of LookupDict has no 'ok' member" but it's wrong, I checked if domain_response.status_code == requests.codes.ok: domain_response_data = domain_response.json() if domain_response_data['return_code'] == 0: @@ -191,7 +191,7 @@ def upload_crits(response, md5, cfg): try: # Note that this request does NOT go through proxies sample_response = requests.post(url, headers=headers, files=files, data=sample_data, verify=False) - # pylint says "Instance of LookupDict has no 'ok' member" + # pylint says "Instance of LookupDict has no 'ok' member" but it's wrong, I checked if sample_response.status_code == requests.codes.ok: sample_response_data = sample_response.json() if sample_response_data['return_code'] == 0: @@ -411,14 +411,14 @@ def load_hashes(filename="hashes.json"): if os.path.exists(filename): with open(filename, 'rb') as hashfile: hashes = set(json.load(hashfile)) - logging.info('Loaded hashes from {f}'.format(f=filename)) + logging.info('Loaded hashes from %s', filename) else: hashes = set() return hashes def save_hashes(hashes, filename='hashes.json'): - logging.info('Dumping hashes to {f}'.format(f=filename)) + logging.info('Dumping hashes to %s', filename) with open(filename, 'w') as hashfile: json.dump(list(hashes), hashfile, indent=2) @@ -428,7 +428,7 @@ def load_urls(filename='urls.json'): try: with open(filename, 'rb') as urlfile: urls = set(json.load(urlfile)) - logging.info('Loaded urls from {f}'.format(f=filename)) + logging.info('Loaded urls from %s', filename) except ValueError: # this usually happens when the file is empty urls = set() else: @@ -437,7 +437,7 @@ def load_urls(filename='urls.json'): def save_urls(urls, filename='urls.json'): - logging.info('Dumping past URLs to {f}'.format(f=filename)) + logging.info('Dumping past URLs to %s', filename) with open(filename, 'w') as urlfile: json.dump(list(urls), urlfile, indent=2) From 71e4c7a1bed7b91d2aa915aed8e8a16edb079d23 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 20:54:26 -0500 Subject: [PATCH 93/97] Test inputfile argument --- test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test.py b/test.py index fcd852f..ebec631 100644 --- a/test.py +++ b/test.py @@ -33,6 +33,12 @@ def test_config_args(): assert cfg.dumpdir == '/tmp/mwtest' +def test_inputfile(): + args = maltrieve.setup_args(['-i', 'test-input']) + cfg = maltrieve.config(args, 'maltrieve-test.cfg') + assert cfg.inputfile == 'test-input' + + def test_alt_config(): args = maltrieve.setup_args(['--config', 'maltrieve-test.cfg']) cfg = maltrieve.config(args, args.config) From fb0c4d892dc14c2935d7c981eead5670c2a6b9e0 Mon Sep 17 00:00:00 2001 From: webstergd Date: Tue, 31 Mar 2015 18:50:50 +0200 Subject: [PATCH 94/97] Readme now discusses how to create a cronjob and adjusted header size and removed redundant word. --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index b3b3e1b..c13ea47 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,24 @@ optional arguments: ### Configuration File Many of Maltrieve's command line options can be specified in ```maltrieve.cfg```. +## Automated Execution (Optional) +Cron can be used to automate the execution of Maltrieve. The following example is provided to help get you started. It will create a cron job that will run Maltrieve every day at 2:01 as a standard user. That said, we recommend enhancing this by creating a custom script for production environments. + +### Ubuntu +As a user execute +``` +crontab -e +``` +If installed normally, add the following to the end of the file. +``` +01 02 * * * maltrieve +``` +If downloaded to a folder and executed, add the following to the end of the file. +``` +01 02 * * * cd && /usr/bin/python maltrieve.py +``` +### Redhat +Redhat systems will need to ensure that the user is added to the /etc/cron.allow file. ## License From 3ac22d3ab2630531e9700967a7a447d1016705a6 Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 21:07:21 -0500 Subject: [PATCH 95/97] Recommend VirusTotal and Malwr --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c13ea47..dfb90d0 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ optional arguments: Many of Maltrieve's command line options can be specified in ```maltrieve.cfg```. ## Automated Execution (Optional) -Cron can be used to automate the execution of Maltrieve. The following example is provided to help get you started. It will create a cron job that will run Maltrieve every day at 2:01 as a standard user. That said, we recommend enhancing this by creating a custom script for production environments. +Cron can be used to automate the execution of Maltrieve. The following example is provided to help get you started. It will create a cron job that will run Maltrieve every day at 2:01 as a standard user. That said, we recommend enhancing this by creating a custom script for production environments. ### Ubuntu As a user execute @@ -96,13 +96,17 @@ If downloaded to a folder and executed, add the following to the end of the file 01 02 * * * cd && /usr/bin/python maltrieve.py ``` ### Redhat -Redhat systems will need to ensure that the user is added to the /etc/cron.allow file. +Redhat systems will need to ensure that the user is added to the /etc/cron.allow file. + + +## Other Tools + +Maltrieve doesn't do analysis. In addition to the integrations listed above, we can recommend using [VirusTotalApi](https://github.com/doomedraven/VirusTotalApi) for working with [VirusTotal](https://www.virustotal.com). [Malwr](https://malwr.com) is a similar site based on Cuckoo Sandbox. ## License Released under GPL version 3. See the [LICENSE](./LICENSE) file for full details. - ## Known bugs We list all the bugs we know about (plus some things we know we need to add) at the [GitHub issues](https://github.com/krmaxwell/maltrieve/issues) page. From 97a2e996298cecb815ab3864f06a972c7913cfdc Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 21:40:56 -0500 Subject: [PATCH 96/97] Check links in README --- .gitignore | 1 + README.md | 4 ++-- setup.py | 4 +++- test.py | 7 +++++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index fe6fae2..cb2d5c3 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ grequests *.bak *.json screenlog* +README.html diff --git a/README.md b/README.md index dfb90d0..baa585e 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mw These lists will be implemented if/when they return to activity. -* [Malware Blacklist](http://www.malwareblacklist.com/showMDL.php) -* [NovCon Minotaur](http://minotauranalysis.com/malwarelist-urls.aspx) +* [Malware Blacklist](http://www.malwareblacklist.com) +* [NovCon Minotaur](http://minotauranalysis.com) Other improvements include: diff --git a/setup.py b/setup.py index aaafd38..7128ad3 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,9 @@ 'pre-commit', 'pytest', 'pytest-cov', - 'coveralls' + 'coveralls', + 'LinkChecker', + 'markdown' ], package_dir={'maltrieve': 'src'}, packages=['maltrieve'], diff --git a/test.py b/test.py index ebec631..8420ae1 100644 --- a/test.py +++ b/test.py @@ -1,6 +1,8 @@ import os +import subprocess import maltrieve +import markdown import requests @@ -142,3 +144,8 @@ def test_sort_mime(): r = requests.get('http://xwell.org/assets/docs/test.pdf') assert maltrieve.save_malware(r, cfg) assert os.access('archive-test/application_pdf/b9ff662486d448da7b60ba6234867c65', os.F_OK) + + +def test_README_links(): + markdown.markdownFromFile(input='README.md', output='README.html') + assert subprocess.call(['linkchecker', '--check-extern', 'README.html']) == 0 From 66c8d449b05096de9f7c4292e7f3963d1adda01c Mon Sep 17 00:00:00 2001 From: Kyle Maxwell Date: Thu, 2 Apr 2015 21:50:00 -0500 Subject: [PATCH 97/97] Remove mention of inactive lists --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index baa585e..c9b1ee0 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,6 @@ Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mw * [CleanMX](http://support.clean-mx.de/clean-mx/xmlviruses.php?) * [ZeusTracker](https://zeustracker.abuse.ch/monitor.php?urlfeed=binaries) -These lists will be implemented if/when they return to activity. - -* [Malware Blacklist](http://www.malwareblacklist.com) -* [NovCon Minotaur](http://minotauranalysis.com) - Other improvements include: * Proxy support