diff --git a/README.md b/README.md
index 7b28516..db1c8a4 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,9 @@ After catching malicious phishing domain names using [certstream](https://certst
 Search for specific filetypes submitted to [urlscan.io](https://urlscan.io/) and recursively download the webpage if predefined file extensions are found.
 
 #### Prerequisites
+- Ubuntu 18.04+ (should work on other Linux distros)
 - Python 2.7.14
+- Torsocks (optional: used with the `--tor` flag)
 
 #### Setup
 1. Open a terminal and run the following command:
@@ -27,8 +29,12 @@ The following command will:
 - Score and add suspicious domains to a queue while other domains continue to be scored
 - Simultaneously make requests to the domains in the queue to search for predefined file extensions
 - Recursively download the site when an open directory is found hosting a file with a particular extension
+
+Optional arguments:
+- **--timeout** : Set time to wait for a connection
+- **--tor** : Download files via the Tor network
 ```bash
-python opendir_certstream.py
+python opendir_certstream.py [--timeout] [--tor]
 ```
 
 **Note**: Any URLs in the queue will be lost once the program stops.
@@ -43,10 +49,12 @@ The following command will:
 - **File Extension** : 7z, apk, bat, bz, bz2, crypt, dll, doc, docx, exe, gz, hta, iso, jar, json, lnk, ppt, ps1, py, rar, sfx, sh, tar, vb, vbs, xld, xls, xlsx, zip
 
 Optional arguments:
-- **Dry Run** : Perform a test run to see what would be downloaded
-- **Exclude** : A comma-separated list of domains to not download content from (ex. 'google.com,bing.com')
+- **--dryrun** : Perform a test run to see what would be downloaded
+- **--exclude** : A comma-separated list of domains to not download content from (ex. 'google.com,bing.com')
+- **--timeout** : Set time to wait for a connection
+- **--tor** : Download files via the Tor network
 ```bash
-python opendir_urlscan.py [--dry-run] [--exclude=CSV]
+python opendir_urlscan.py [--dryrun] [--exclude=CSV] [--timeout] [--tor]
 ```
 
 **Note**: If the path is a file, it will be downloaded regardless of whether it's an open directory.
@@ -60,6 +68,8 @@ python opendir_urlscan.py [--dry-run] [--e
 ![opendir_urlscan - Download](https://github.com/leunammejii/analyst_arsenal/blob/master/static/assets/opendir_urlscan_download.png)
 
 #### Things to know
-- Be responsible.
+- Be responsible!
+- Downloads via Tor happen over **127.0.0.1:9050**
+- These scripts **will not** check Torsocks settings
 
 Please fork, create merge requests, and help make this better.
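The README's Tor notes above leave verification to the user: both scripts assume a SOCKS listener on 127.0.0.1:9050 and never check for one. A minimal pre-flight sketch using the same `requests`/PySocks stack the scripts already import (the file name `check_tor.py` is illustrative only, not part of this patch):

```python
# check_tor.py -- illustrative helper; confirms the Tor SOCKS proxy
# both scripts assume before running them with --tor.
import requests

PROXIES = {
    "http": "socks5h://127.0.0.1:9050",   # socks5h: DNS also resolves via Tor
    "https": "socks5h://127.0.0.1:9050"
}

try:
    ip = requests.get("https://api.ipify.org", proxies=PROXIES, timeout=30).text
    print("Tor is up; exit IP: {}".format(ip))
except Exception as err:
    print("Tor proxy unreachable: {}".format(err))
```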
diff --git a/external.yaml b/external.yaml
index d108735..81e0b45 100644
--- a/external.yaml
+++ b/external.yaml
@@ -46,3 +46,12 @@ tlds:
 # Add your own TLDs here, e.g.:
 # '.nu':
 # '.se':
+
+queries:
+  'automatic' : 'task.method%3Aautomatic'
+  'manual' : 'task.method%3Amanual'
+  'certstream' : '(task.source%3Acertstream-idn OR task.source%3Acertstream-suspicious)'
+  'openphish' : 'task.source%3Aopenphish'
+  'phishtank' : 'task.source%3Aphishtank'
+  'twitter' : '(task.source%3Atwitter OR task.source%3Atwitter_illegalFawn OR task.source%3Atwitter_phishingalert)'
+  'urlhaus' : 'task.source%3Aurlhaus'
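Each entry in the new `queries` block is a URL-encoded urlscan.io search clause that `get_urls()` in opendir_urlscan.py substitutes directly into the search API. A short sketch of the expansion (the query key and extension are arbitrary examples):

```python
# Show how a 'queries' entry from external.yaml becomes an urlscan.io call.
import yaml

with open("external.yaml", "r") as f:
    external = yaml.safe_load(f)

api = "https://urlscan.io/api/v1/search/?q={}%20AND%20filename%3A.{}&size=10000"
print(api.format(external["queries"]["openphish"], "zip"))
# -> https://urlscan.io/api/v1/search/?q=task.source%3Aopenphish%20AND%20filename%3A.zip&size=10000
```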
Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) continue time.sleep(self.interval) @@ -178,8 +206,13 @@ def score_domain(domain): try: res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True) - domain = '.'.join([res.subdomain, res.domain]) - except Exception: + + if res is not None: + domain = '.'.join([res.subdomain, res.domain]) + except Exception as err: + print("[!] Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) pass score += int(round(entropy.shannon_entropy(domain)*50)) @@ -210,8 +243,23 @@ def score_domain(domain): return score -if __name__ == "__main__": +def main(): + """ """ + global uagent + uagent = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" + global timeout + timeout = args.timeout + certstream_url = "wss://certstream.calidog.io" + global url_queue + url_queue = Queue.Queue() + + # Print start messages + show_summary() + show_network(uagent, timeout) + + # Read suspicious.yaml and external.yaml with open("suspicious.yaml", "r") as f: + global suspicious suspicious = yaml.safe_load(f) with open("external.yaml", "r") as f: @@ -238,5 +286,58 @@ def score_domain(domain): print(colored("At least one extension is required for 'files'.", "red", attrs=["bold"])) exit() + # Start queue and listen for events via Certstream + print(colored("Starting queue...\n", "yellow", attrs=["bold"])) QueueManager() + + global pbar + pbar = tqdm.tqdm(desc="certificate_update", unit="cert") certstream.listen_for_events(callback, url=certstream_url) + +def show_summary(): + """Print summary of arguments selected""" + + print("Summary:") + print(" timeout : {}".format(args.timeout)) + print(" tor : {}\n".format(args.tor)) + return + +def show_network(uagent, timeout): + """Select network to use, get IP address, and print message""" + global torsocks + global proxies + if args.tor: + ip_type = "Tor" + proxies = { + "http": "socks5h://127.0.0.1:9050", + "https": "socks5h://127.0.0.1:9050" + } + torsocks = "torsocks" + else: + ip_type = "Original" + proxies = {} + torsocks = "" + + try: + global requested_ip + requested_ip = requests.get("https://api.ipify.org", + proxies=proxies, + headers={"User-Agent": uagent}, + timeout=timeout, + allow_redirects=True).content + except Exception as err: + print("[!!] Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) + exit() + + print(colored("Getting IP Address...", "yellow", attrs=["bold"])) + if args.tor: + obfuscated_ip = ".".join(["XXX.XXX.XXX", requested_ip.split(".")[:-1][0]]) + print(colored("{} IP: {}\n".format(ip_type, obfuscated_ip), "yellow", attrs=["bold"])) + else: + print(colored("{} IP: {}\n".format(ip_type, requested_ip), "yellow", attrs=["bold"])) + return + +if __name__ == "__main__": + main() diff --git a/opendir_urlscan.py b/opendir_urlscan.py index 3c95ebe..81d2e69 100644 --- a/opendir_urlscan.py +++ b/opendir_urlscan.py @@ -5,18 +5,23 @@ - Recursively download the site when an open directory hosting a file with the desired file extension 3 positional arguments needed: - - Query Type : automatic, manual, certstream, openphish, phishtank, twitter, urlhaus - Delta : Number of days back to search (GMT) - File Extension : 7z, apk, bat, bz, bz2, crypt, dll, doc, docx, exe, gz, hta, iso, jar, json, lnk, ppt, ps1, py, rar, sfx, sh, tar, vb, vbs, xld, xls, xlsx, zip Optional arguments: - -- Dry Run : Perform a test run to see what would be downloaded -- Exclude : A comma-separated list of domains to not download content from (ex. 
diff --git a/opendir_urlscan.py b/opendir_urlscan.py
index 3c95ebe..81d2e69 100644
--- a/opendir_urlscan.py
+++ b/opendir_urlscan.py
@@ -5,18 +5,23 @@
 - Recursively download the site when an open directory hosting a file with the desired file extension
 
 3 positional arguments needed:
-
 - Query Type : automatic, manual, certstream, openphish, phishtank, twitter, urlhaus
 - Delta : Number of days back to search (GMT)
 - File Extension : 7z, apk, bat, bz, bz2, crypt, dll, doc, docx, exe, gz, hta, iso, jar, json, lnk, ppt, ps1, py, rar, sfx, sh, tar, vb, vbs, xld, xls, xlsx, zip
 
 Optional arguments:
-- Dry Run : Perform a test run to see what would be downloaded
-- Exclude : A comma-separated list of domains to not download content from (ex. 'google.com,bing.com')
+- --dryrun : Perform a test run to see what would be downloaded
+- --exclude : A comma-separated list of domains to not download content from (ex. 'google.com,bing.com')
+- --timeout : Set time to wait for a connection
+- --tor : Download files via the Tor network
 
 Credit: https://github.com/ninoseki/miteru
 
+Resources:
+    http://docs.python-requests.org/en/master/user/advanced/#proxies
+    https://gist.github.com/jefftriplett/9748036
+    https://ec.haxx.se/libcurl-proxies.html
+
 Usage:
 
 ```
@@ -39,7 +44,6 @@
 sys.path.insert(0, script_path)
 
 import requests
 from termcolor import colored, cprint
-import tqdm
 import yaml
 
@@ -54,7 +58,14 @@
                     help="Number of days back to search (GMT)")
 parser.add_argument(metavar="file extension",
                     dest="file_extension",
+                    choices=["7z", "apk", "bat", "bz", "bz2", "crypt", "dll", "doc", "docx", "exe", "gz", "hta", "iso", "jar", "json", "lnk", "ppt", "ps1", "py", "rar", "sfx", "sh", "tar", "vb", "vbs", "xld", "xls", "xlsx", "zip"],
                     help="7z, apk, bat, bz, bz2, crypt, dll, doc, docx, exe, gz, hta, iso, jar, json, lnk, ppt, ps1, py, rar, sfx, sh, tar, vb, vbs, xld, xls, xlsx, zip")
+parser.add_argument("--timeout",
+                    dest="timeout",
+                    type=int,
+                    default=30,
+                    required=False,
+                    help="Set time to wait for a connection")
 parser.add_argument("--dryrun",
                     dest="dry_run",
                     action="store_true",
@@ -62,57 +73,57 @@
                     help="Perform a test run to see what would be downloaded")
 parser.add_argument("--exclude",
                     dest="exclude",
+                    type=str,
+                    default="",
                     required=False,
                     help="A comma-separated list of domains to not download content from (ex. 'google.com,bing.com')")
+parser.add_argument("--tor",
+                    dest="tor",
+                    action="store_true",
+                    required=False,
+                    help="Download files over the Tor network")
 args = parser.parse_args()
 
-# Print dry-run message if True
-if args.dry_run:
-    print(colored("Starting dry run...\n", "yellow", attrs=["bold"]))
-
 
 def main():
     """ """
-    qtype = args.query_type.lower()
-    delta = args.delta
-    ext = args.file_extension.lower()
-
-    today = datetime.now()
-    timespan = datetime.strftime(today - timedelta(delta), "%a, %d %b %Y 05:00:00")
-    timespan = datetime.strptime(timespan, "%a, %d %b %Y %H:%M:%S")
-
-    # Request data from urlscan.io
-    api = "https://urlscan.io/api/v1/search/?q={}%20AND%20filename%3A.{}&size=10000"
-    uagent = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
-    resp = requests.get(api.format(queries[qtype], ext),
-                        headers={"User-Agent": uagent},
-                        timeout=10)
-
-    if not (resp.status_code == 200 and "results" in resp.json().keys()):
-        exit(0)
-
-
-    results = resp.json()["results"]
-    urls = []
+    # Set variables for arguments
+    qtype = args.query_type.lower()
+    delta = args.delta
+    ext = args.file_extension.lower()
+    exclusions = args.exclude.split(",")
+    uagent = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
+
+    # Print start messages
+    show_summary()
+    show_network(uagent)
+
+    # Read external.yaml
+    with open("external.yaml", "r") as f:
+        external = yaml.safe_load(f)
 
-    for result in results:
-        # Break at time specified
-        analysis_time = datetime.strptime(result["task"]["time"], "%Y-%m-%dT%H:%M:%S.%fZ")
+    if external["archives"] is not None:
+        archives = external["archives"]
+    else:
+        print(colored("At least one extension is required for 'archives'.", "red", attrs=["bold"]))
+        exit()
 
-        if analysis_time < timespan:
-            break
+    if external["files"] is not None:
+        files = external["files"]
+    else:
+        print(colored("At least one extension is required for 'files'.", "red", attrs=["bold"]))
+        exit()
 
-        # Build list of URLs ending with specified extension or Mime-Type
-        url = result["page"]["url"]
+    # Set queries
+    queries = external["queries"]
 
-        if url.endswith('.{}'.format(ext)):
-            urls.append(url)
-            continue
+    # Build dict of extensions
+    extensions = {}
+    extensions.update(archives)
+    extensions.update(files)
 
-        if "files" in result.keys():
-            result_files = [x for x in result["files"] if x["mimeType"].startswith(extensions[ext])]
-
-            if len(result_files) > 0:
-                urls.append(url)
+    # Request URLs from urlscan.io
+    timeout = args.timeout
+    urls = get_urls(delta, queries, qtype, ext, uagent, timeout, extensions)
 
     for url in sorted(set(urls), key=urls.index):
         # Check if the current URL has already been redirected to
@@ -129,10 +140,7 @@ def main():
         if args.exclude and domain in exclusions:
             continue
 
-        tqdm.tqdm.write(
-            "[*] Original : "
-            "{}".format("{}".format(colored(url, "cyan")))
-        )
+        print("[*] Original : {}".format(colored(url, "cyan")))
         url = "//".join([protocol, domain])
 
         # Build list of URL resources
@@ -146,19 +154,24 @@ def main():
 
             # Follow URL path and continue if a download was detected for a dry-run
             if "dry_domain" in vars():
-                tqdm.tqdm.write(
-                    "[*] Download : "
-                    "{}".format("{} (Recursively downloaded)".format(colored(url, "green", attrs=["underline", "bold"]))))
+                print("[*] Download : {} (Recursively downloaded)".format(
+                    colored(url, "green", attrs=["underline", "bold"])
+                ))
                 continue
 
             # Send first request to the URL
-            tqdm.tqdm.write(
-                "[*] Session : "
-                "{}".format("{}".format(colored(url, "blue"))))
+            print("[*] Session : {}".format(colored(url, "blue")))
 
            try:
-                resp = requests.get(url, headers={"User-Agent": uagent}, timeout=5, allow_redirects=True)
-            except:
+                resp = requests.get(url,
+                                    proxies=proxies,
+                                    headers={"User-Agent": uagent},
+                                    timeout=timeout,
+                                    allow_redirects=True)
+            except Exception as err:
+                print("[!] Error : {}".format(
Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) continue if resp.status_code != 200: @@ -167,22 +180,23 @@ def main(): # An open directory is found if "Index of " in resp.content: if glob.glob("./*/{}".format(domain.split(":")[0])): - tqdm.tqdm.write( - "[-] Skipping : " - "{}".format("{} (Directory '{}' already exists)".format(colored(url, "red", attrs=["underline", "bold"]), domain.split(":")[0]))) + print("[-] Skipping : {} (Directory '{}' already exists)".format( + colored(url, "red", attrs=["underline", "bold"]), + domain.split(":")[0] + )) break for extension in extensions.keys(): - if "{}<".format(extension) in resp.content.lower() and extension in archives: + if ".{}<".format(extension) in resp.content.lower() and extension in archives: directory = "KitJackinSeason" - elif "{}<".format(ext) in resp.content.lower() and extension in files: + elif ".{}<".format(ext) in resp.content.lower() and extension in files: directory = "InterestingFile" else: continue - tqdm.tqdm.write( - "[*] Download : " - "{}".format("{} ('Index of ' found)".format(colored(url, "green", attrs=["underline", "bold"])))) + print("[*] Download : {} ('Index of ' found)".format( + colored(url, "green", attrs=["underline", "bold"]) + )) if args.dry_run: dry_domain = True @@ -191,12 +205,14 @@ def main(): try: if directory == "InterestingFile": os.mkdir("./InterestingFile/{}".format(domain.split(":")[0])) + subprocess.call([ + "{}".format(torsocks), "wget", "--execute=robots=off", "--tries=2", "--no-clobber", - "--timeout=5", + "--timeout={}".format(timeout), "--waitretry=0", "--directory-prefix=./{}/{}".format(directory, domain.split(":")[0]), "--content-disposition", @@ -206,39 +222,45 @@ def main(): url ]) break - except: + except Exception as err: + print("[!] Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) continue # A URL is found ending in the specified extension but the server responded with no Content-Type if "Content-Type" not in resp.headers.keys(): if os.path.exists("./InterestingFile/{}".format(domain.split(":")[0])): - tqdm.tqdm.write( - "[-] Skipping : " - "{}".format("{} (Directory '{}' already exists)".format(colored(url, "red", attrs=["underline", "bold"]), domain.split(":")[0]))) + print("[-] Skipping : {} (Directory '{}' already exists)".format( + colored(url, "red", attrs=["underline", "bold"]), + domain.split(":")[0] + )) break if url.endswith('.{}'.format(ext)): if resp.url != url: redirect = resp.url - tqdm.tqdm.write( - "[*] Redirect : " - "{}".format("{} (Responded with no Content-Type)".format(colored(redirect, "green", attrs=["underline", "bold"])))) + print("[*] Redirect : {} (Responded with no Content-Type)".format( + colored(redirect, "green", attrs=["underline", "bold"]) + )) else: - tqdm.tqdm.write( - "[*] Download : " - "{}".format("{} (Responded with no Content-Type)".format(colored(url, "green", attrs=["underline", "bold"])))) + print("[*] Download : {} (Responded with no Content-Type)".format( + colored(url, "green", attrs=["underline", "bold"]) + )) if args.dry_run: break try: os.mkdir("./InterestingFile/{}".format(domain.split(":")[0])) + subprocess.call([ + "{}".format(torsocks), "wget", "--execute=robots=off", "--tries=2", "--no-clobber", - "--timeout=5", + "--timeout={}".format(timeout), "--waitretry=0", "--directory-prefix=./InterestingFile/{}".format(domain.split(":")[0]), "--content-disposition", @@ -246,38 +268,46 @@ def main(): url ]) break - except: + except Exception as err: + print("[!] 
Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) continue # A file is found with the Mime-Type of the specified extension if resp.headers["Content-Type"].startswith(extensions[ext]) or url.endswith(".{}".format(ext)): if os.path.exists("./InterestingFile/{}".format(domain.split(":")[0])): - tqdm.tqdm.write( - "[-] Skipping : " - "{}".format("{} (Directory '{}' already exists)".format(colored(url, "red", attrs=["underline", "bold"]), domain.split(":")[0]))) + print("[-] Skipping : {} (Directory '{}' already exists)".format( + colored(url, "red", attrs=["underline", "bold"]), + domain.split(":")[0] + )) break if resp.url != url: redirect = resp.url - tqdm.tqdm.write( - "[*] Redirect : " - "{}".format("{} ({} found)".format(colored(redirect, "green", attrs=["underline", "bold"]), ext))) + print("[*] Redirect : {} ({} found)".format( + colored(redirect, "green", attrs=["underline", "bold"]), + ext + )) else: - tqdm.tqdm.write( - "[*] Download : " - "{}".format("{} ({} found)".format(colored(url, "green", attrs=["underline", "bold"]), ext))) + print("[*] Download : {} ({} found)".format( + colored(url, "green", attrs=["underline", "bold"]), + ext + )) if args.dry_run: break try: os.mkdir("./InterestingFile/{}".format(domain.split(":")[0])) + subprocess.call([ + "{}".format(torsocks), "wget", "--execute=robots=off", "--tries=2", "--no-clobber", - "--timeout=5", + "--timeout={}".format(timeout), "--waitretry=0", "--directory-prefix=./InterestingFile/{}".format(domain.split(":")[0]), "--content-disposition", @@ -285,56 +315,106 @@ def main(): url ]) break - except: + except Exception as err: + print("[!] Error : {}".format( + colored(err, "red", attrs=["bold"]) + )) continue if "dry_domain" in vars(): del dry_domain return -if __name__ == "__main__": - # Print summary of what's about to be checked +def show_summary(): + """Print summary of arguments selected""" + if args.dry_run: + print(colored("Starting dry run...\n", "yellow", attrs=["bold"])) + print("Summary:") - print(" query_type : {}".format(args.query_type)) + print(" query_type : {}".format(args.query_type.lower())) print(" delta : {}".format(args.delta)) - print(" file_extension : {}".format(args.file_extension)) - - if args.exclude: - exclusions = args.exclude.split(',') - print(" exclusions : {}".format(exclusions)) - - print("") - - queries = { - "automatic" : "task.method%3Aautomatic", - "manual" : "task.method%3Amanual", - "certstream" : "(task.source%3Acertstream-idn OR \ - task.source%3Acertstream-suspicious)", - "openphish" : "task.source%3Aopenphish", - "phishtank" : "task.source%3Aphishtank", - "twitter" : "(task.source%3Atwitter OR \ - task.source%3Atwitter_illegalFawn OR \ - task.source%3Atwitter_phishingalert)", - "urlhaus" : "task.source%3Aurlhaus" - } - - with open("external.yaml", "r") as f: - external = yaml.safe_load(f) + print(" file_extension : {}".format(args.file_extension.lower())) + print(" exclusions : {}".format(args.exclude.split(","))) + print(" timeout : {}".format(args.timeout)) + print(" tor : {}\n".format(args.tor)) + return - if external["archives"] is not None: - archives = external["archives"] +def show_network(uagent): + """Select network to use, get IP address, and print message""" + global torsocks + global proxies + if args.tor: + ip_type = "Tor" + proxies = { + "http": "socks5h://127.0.0.1:9050", + "https": "socks5h://127.0.0.1:9050" + } + torsocks = "torsocks" else: - print(colored("At least one extension is required for 'archives'.", "red", attrs=["bold"])) + ip_type = "Original" + 
diff --git a/requirements.txt b/requirements.txt
index 69c37e3..7e7fa05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,15 @@
+certifi>=2017.4.17
 certstream==1.10
+chardet<3.1.0,>=3.0.2
 entropy==0.10
+idna<2.9,>=2.5
 python_Levenshtein==0.12.0
+PySocks!=1.5.7,>=1.5.6
 PyYAML==3.13
 requests
 termcolor==1.1.0
 tld==0.7.9
 tqdm==4.19.4
-websocket-client==0.48.0
\ No newline at end of file
+urllib3<1.25,>=1.21.1
+websocket-client==0.48.0
+
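The pins added above are the SOCKS-capable dependency chain behind `--tor`; PySocks in particular is what lets `requests` speak `socks5h://`. A quick sanity check, assuming the pinned requirements are installed:

```python
# Verify the PySocks dependency added above; without it, requests raises
# InvalidSchema ("Missing dependencies for SOCKS support") on socks5h:// proxies.
try:
    import socks  # the module PySocks provides
    print("PySocks is installed; --tor downloads can use socks5h:// proxies")
except ImportError:
    print("PySocks missing -- run: pip install -r requirements.txt")
```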