diff --git a/paramspider/main.py b/paramspider/main.py
index 38ff9f5..d69df0b 100644
--- a/paramspider/main.py
+++ b/paramspider/main.py
@@ -1,11 +1,14 @@
+#!/usr/bin/env python3
 import argparse
 import os
 import logging
 import colorama
 from colorama import Fore, Style
+from urllib.parse import urlparse, parse_qs, urlencode, unquote
+import time
 from . import client # Importing client from a module named "client"
-from urllib.parse import urlparse, parse_qs, urlencode
-import os
+
+start_time = time.time()
 
 yellow_color_code = "\033[93m"
 reset_color_code = "\033[0m"
@@ -78,22 +81,32 @@ def clean_urls(urls, extensions, placeholder):
         cleaned_urls.add(cleaned_url)
     return list(cleaned_urls)
 
-def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
+def fetch_and_clean_urls(domain, extensions, stream_output, proxy, placeholder, output_filename, subs):
     """
     Fetch and clean URLs related to a specific domain from the Wayback Machine.
 
     Args:
-        domain (str): The domain name to fetch URLs for.
+        domain (str): The domain name to fetch related URLs for.
         extensions (list): List of file extensions to check against.
         stream_output (bool): True to stream URLs to the terminal.
+        output_filename (str): Name of the output file.
+        subs (bool): True to include subdomains.
 
     Returns:
         None
     """
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Fetching URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
-    wayback_uri = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&collapse=urlkey&fl=original&page=/"
-    response = client.fetch_url_content(wayback_uri,proxy)
-    urls = response.text.split()
+    if subs:
+        url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+    else:
+        url = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+
+    response = client.fetch_url_content(url, proxy)
+    if response is False:
+        return
+
+    response = unquote(response.text)
+    urls = response.split()
 
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(urls)) + Style.RESET_ALL} URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
 
@@ -102,11 +115,14 @@ def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(cleaned_urls)) + Style.RESET_ALL} URLs after cleaning")
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Extracting URLs with parameters")
 
-    results_dir = "results"
+    results_dir = "output"
     if not os.path.exists(results_dir):
         os.makedirs(results_dir)
-
-    result_file = os.path.join(results_dir, f"{domain}.txt")
+
+    if output_filename:
+        result_file = os.path.join(results_dir, f"{output_filename}")
+    else:
+        result_file = os.path.join(results_dir, f"{domain}.txt")
 
     with open(result_file, "w") as f:
         for url in cleaned_urls:
@@ -114,22 +130,85 @@ def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
             f.write(url + "\n")
             if stream_output:
                 print(url)
-
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved cleaned URLs to {Fore.CYAN + result_file + Style.RESET_ALL}")
+    print("\u001b[31m[!] Total Execution Time : %ss\u001b[0m" % str((time.time() - start_time))[:-12] +"\n")
+
+def fetch_urls_from_list(list_file, subs):
+    """
+    Fetch and clean URLs from a list of domains.
+
+    Args:
+        list_file (str): Path to the file containing a list of domain names.
+        subs (bool): True to include subdomains.
+
+    Returns:
+        None
+    """
+    combined_urls = []
+    with open(list_file, "r") as f:
+        domains = [line.strip().lower().replace('https://', '').replace('http://', '') for line in f.readlines()]
+        domains = [domain for domain in domains if domain]  # Remove empty lines
+        domains = list(set(domains))  # Remove duplicates
+
+    for domain in domains:
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Fetching URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+        if subs:
+            url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+        else:
+            url = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+
+        response = client.fetch_url_content(url, None)
+        if response is False:
+            continue
+
+        response = unquote(response.text)
+        urls = response.split()
+
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(urls)) + Style.RESET_ALL} URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+
+        cleaned_urls = clean_urls(urls, HARDCODED_EXTENSIONS, "FUZZ")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Cleaning URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(cleaned_urls)) + Style.RESET_ALL} URLs after cleaning")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Extracting URLs with parameters")
+
+        combined_urls.extend(cleaned_urls)
+
+        results_dir = "output"
+        if not os.path.exists(results_dir):
+            os.makedirs(results_dir)
+
+        result_file = os.path.join(results_dir, f"{domain}.txt")
+
+        with open(result_file, "w") as f:
+            for url in cleaned_urls:
+                if "?" in url:
+                    f.write(url + "\n")
+
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved cleaned URLs to {Fore.CYAN + result_file + Style.RESET_ALL}"+"\n")
+
+    # Save combined URLs to a separate file
+    combined_output_file = os.path.join(results_dir, "combined.txt")
+    with open(combined_output_file, "w") as f:
+        for url in combined_urls:
+            f.write(url + "\n")
+    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved combined URLs to {Fore.CYAN + combined_output_file + Style.RESET_ALL}")
+    print("\u001b[31m[!] Total Execution Time : %ss\u001b[0m" % str((time.time() - start_time))[:-12] +"\n")
 
 def main():
     """
     Main function to handle command-line arguments and start URL mining process.
     """
     log_text = """
-
-                                      _    __
-   ___  ___ ________ ___ _  ___ ___  (_)__/ /__ ____
-  / _ \/ _ `/ __/ _ `/  ' \(_-
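
The subs flag changes only the Wayback CDX query: with subdomains enabled the url parameter becomes a wildcard host (*.domain), otherwise the bare domain is queried. Since the same pair of f-strings now appears in both fetch_and_clean_urls and fetch_urls_from_list, a small helper could centralize the construction. A minimal sketch, with build_cdx_url as an illustrative name that is not part of this patch:

    def build_cdx_url(domain, subs=False):
        # Wildcard-prefix the host when subdomains are requested,
        # mirroring the two f-strings duplicated in the patch.
        host = f"*.{domain}" if subs else domain
        return (
            f"https://web.archive.org/cdx/search/cdx?url={host}/*"
            "&output=txt&fl=original&collapse=urlkey&page=/"
        )

For example, build_cdx_url("example.com", subs=True) yields the subdomain-wide query that the subs branch currently builds inline.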
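
Both call sites now bail out when client.fetch_url_content returns the literal False (return in the single-domain path, continue in the list path). client.py is not shown in this diff, so the following is only a sketch of the contract those checks assume, written against the requests library; the timeout value and error handling are assumptions:

    import requests

    def fetch_url_content(url, proxy=None):
        # Assumed contract: a Response object on success,
        # the literal False on any network failure.
        proxies = {"http": proxy, "https": proxy} if proxy else None
        try:
            response = requests.get(url, proxies=proxies, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException:
            return False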
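
clean_urls itself is untouched by this patch, but the new unquote import means it now receives percent-decoded URLs before the "FUZZ" placeholder is applied. Its core transformation, replacing every query value while keeping parameter names intact, can be reproduced with the same urllib.parse functions the module imports; substitute_placeholder below is an illustrative name, not the module's API:

    from urllib.parse import urlparse, parse_qs, urlencode

    def substitute_placeholder(url, placeholder="FUZZ"):
        # Keep scheme, host, path and parameter names; replace values.
        parsed = urlparse(url)
        params = {key: placeholder for key in parse_qs(parsed.query)}
        return parsed._replace(query=urlencode(params)).geturl()

    # "https://example.com/page.php?id=1&cat=2" becomes
    # "https://example.com/page.php?id=FUZZ&cat=FUZZ"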