From 849145c225a51967296559273107ec4e2b95810a Mon Sep 17 00:00:00 2001
From: PushkraJ <104222808+PushkraJ99@users.noreply.github.com>
Date: Wed, 20 Mar 2024 01:34:48 +0530
Subject: [PATCH] Update main.py

1) Added output for list scans: a separate domain_name.txt is written for each
   domain in the list, plus a combined.txt holding the results for all domains
   in one file.
2) Added subdomain scanning for both single-domain and list scans. Subdomain
   scanning is disabled by default; pass the --subs flag to enable it.
---
 paramspider/main.py | 140 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 105 insertions(+), 35 deletions(-)

diff --git a/paramspider/main.py b/paramspider/main.py
index 38ff9f5..d69df0b 100644
--- a/paramspider/main.py
+++ b/paramspider/main.py
@@ -1,11 +1,14 @@
+#!/usr/bin/env python3
 import argparse
 import os
 import logging
 import colorama
 from colorama import Fore, Style
+from urllib.parse import urlparse, parse_qs, urlencode, unquote
+import time
 from . import client  # Importing client from a module named "client"
-from urllib.parse import urlparse, parse_qs, urlencode
-import os
+
+start_time = time.time()
 
 yellow_color_code = "\033[93m"
 reset_color_code = "\033[0m"
@@ -78,22 +81,32 @@ def clean_urls(urls, extensions, placeholder):
             cleaned_urls.add(cleaned_url)
     return list(cleaned_urls)
 
-def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
+def fetch_and_clean_urls(domain, extensions, stream_output, proxy, placeholder, output_filename, subs):
     """
     Fetch and clean URLs related to a specific domain from the Wayback Machine.
 
     Args:
-        domain (str): The domain name to fetch URLs for.
+        domain (str): The domain name to fetch related URLs for.
         extensions (list): List of file extensions to check against.
         stream_output (bool): True to stream URLs to the terminal.
+        output_filename (str): Name of the output file.
+        subs (bool): True to include subdomains.
 
     Returns:
         None
     """
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Fetching URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
-    wayback_uri = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&collapse=urlkey&fl=original&page=/"
-    response = client.fetch_url_content(wayback_uri,proxy)
-    urls = response.text.split()
+    if subs:
+        url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+    else:
+        url = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+
+    response = client.fetch_url_content(url, proxy)
+    if response is False:
+        return
+
+    response = unquote(response.text)
+    urls = response.split()
 
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(urls)) + Style.RESET_ALL} URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
 
@@ -102,11 +115,14 @@ def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(cleaned_urls)) + Style.RESET_ALL} URLs after cleaning")
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Extracting URLs with parameters")
 
-    results_dir = "results"
+    results_dir = "output"
     if not os.path.exists(results_dir):
         os.makedirs(results_dir)
-
-    result_file = os.path.join(results_dir, f"{domain}.txt")
+
+    if output_filename:
+        result_file = os.path.join(results_dir, f"{output_filename}")
+    else:
+        result_file = os.path.join(results_dir, f"{domain}.txt")
 
     with open(result_file, "w") as f:
         for url in cleaned_urls:
@@ -114,22 +130,85 @@ def fetch_and_clean_urls(domain, extensions, stream_output,proxy, placeholder):
                 f.write(url + "\n")
                 if stream_output:
                     print(url)
-
     logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved cleaned URLs to {Fore.CYAN + result_file + Style.RESET_ALL}")
+    print("\u001b[31m[!] Total Execution Time : %ss\u001b[0m" % str((time.time() - start_time))[:-12] +"\n")
+
+def fetch_urls_from_list(list_file, subs):
+    """
+    Fetch and clean URLs from a list of domains.
+
+    Args:
+        list_file (str): Path to the file containing a list of domain names.
+        subs (bool): True to include subdomains.
+
+    Returns:
+        None
+    """
+    combined_urls = []
+    with open(list_file, "r") as f:
+        domains = [line.strip().lower().replace('https://', '').replace('http://', '') for line in f.readlines()]
+        domains = [domain for domain in domains if domain]  # Remove empty lines
+        domains = list(set(domains))  # Remove duplicates
+
+    for domain in domains:
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Fetching URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+        if subs:
+            url = f"https://web.archive.org/cdx/search/cdx?url=*.{domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+        else:
+            url = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&fl=original&collapse=urlkey&page=/"
+
+        response = client.fetch_url_content(url, None)
+        if response is False:
+            continue
+
+        response = unquote(response.text)
+        urls = response.split()
+
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(urls)) + Style.RESET_ALL} URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+
+        cleaned_urls = clean_urls(urls, HARDCODED_EXTENSIONS, "FUZZ")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Cleaning URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(cleaned_urls)) + Style.RESET_ALL} URLs after cleaning")
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Extracting URLs with parameters")
+
+        combined_urls.extend(cleaned_urls)
+
+        results_dir = "output"
+        if not os.path.exists(results_dir):
+            os.makedirs(results_dir)
+
+        result_file = os.path.join(results_dir, f"{domain}.txt")
+
+        with open(result_file, "w") as f:
+            for url in cleaned_urls:
+                if "?" in url:
+                    f.write(url + "\n")
+
+        logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved cleaned URLs to {Fore.CYAN + result_file + Style.RESET_ALL}"+"\n")
+
+    # Save combined URLs to a separate file
+    combined_output_file = os.path.join(results_dir, "combined.txt")
+    with open(combined_output_file, "w") as f:
+        for url in combined_urls:
+            f.write(url + "\n")
+    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved combined URLs to {Fore.CYAN + combined_output_file + Style.RESET_ALL}")
+    print("\u001b[31m[!] Total Execution Time : %ss\u001b[0m" % str((time.time() - start_time))[:-12] +"\n")
 
 def main():
     """
     Main function to handle command-line arguments and start URL mining process.
     """
     log_text = """
-
-                                      _    __       
-   ___  ___ ________ ___ _  ___ ___  (_)__/ /__ ____
-  / _ \/ _ `/ __/ _ `/  ' \(_-
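A minimal usage sketch (not part of the patch) of the two functions the hunks above change and add, assuming the patch is applied and the package is importable. The function signatures and the "output" directory behaviour are taken from the visible hunks; the domain, list file, and placeholder values here are illustrative only:

    from paramspider.main import fetch_and_clean_urls, fetch_urls_from_list, HARDCODED_EXTENSIONS

    # Single-domain scan including subdomains; writes output/example.com.txt
    fetch_and_clean_urls("example.com", HARDCODED_EXTENSIONS, stream_output=True,
                         proxy=None, placeholder="FUZZ", output_filename=None, subs=True)

    # List scan; writes output/<domain>.txt per domain plus output/combined.txt
    fetch_urls_from_list("domains.txt", subs=True)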