diff --git a/.gitignore b/.gitignore index 71d46820..0323e752 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -config/ result/ files/ files/upload_file diff --git a/config/cf.local.iplist b/config/cf.local.iplist new file mode 100644 index 00000000..f01d2ae8 --- /dev/null +++ b/config/cf.local.iplist @@ -0,0 +1,206 @@ +192.133.11.0/24 +104.24.0.0/14 +185.221.160.0/24 +193.227.99.0/24 +188.114.102.0/23 +45.131.208.0/22 +103.31.4.0/22 +168.100.6.0/24 +141.101.106.0/23 +45.133.247.0/24 +185.244.106.0/24 +103.156.22.0/23 +194.36.216.0/22 +108.162.216.0/23 +188.114.100.0/24 +203.55.107.0/24 +154.85.9.0/24 +185.38.135.0/24 +185.148.104.0/22 +103.22.202.0/23 +141.101.82.0/23 +108.162.235.0/24 +45.94.169.0/24 +194.152.44.0/24 +108.162.192.0/18 +185.72.49.0/24 +104.30.16.0/20 +108.162.218.0/24 +205.233.181.0/24 +212.110.134.0/23 +194.53.53.0/24 +31.43.179.0/24 +188.114.96.0/22 +45.131.4.0/22 +104.30.64.0/18 +104.30.1.0/24 +103.121.59.0/24 +195.137.167.0/24 +185.201.139.0/24 +191.101.251.0/24 +45.85.118.0/23 +199.27.132.0/24 +104.254.140.0/24 +154.85.99.0/24 +154.84.20.0/23 +154.84.24.0/24 +185.162.228.0/22 +45.95.241.0/24 +185.109.21.0/24 +203.17.126.0/24 +147.185.161.0/24 +23.178.112.0/24 +104.30.4.0/22 +66.81.255.0/24 +203.193.21.0/24 +80.94.83.0/24 +170.114.46.0/24 +203.32.120.0/23 +45.158.56.0/24 +203.23.106.0/24 +199.60.103.0/24 +185.59.218.0/24 +203.24.102.0/23 +103.168.172.0/24 +103.11.214.0/24 +185.7.190.0/23 +159.246.55.0/24 +141.101.72.0/22 +203.107.173.0/24 +103.22.200.0/22 +185.207.92.0/24 +89.207.18.0/24 +185.238.228.0/24 +195.245.221.0/24 +159.112.235.0/24 +104.28.0.0/15 +103.169.142.0/24 +212.24.127.0/24 +103.112.176.0/24 +188.114.111.0/24 +203.29.52.0/22 +170.114.52.0/24 +108.162.250.0/24 +89.116.250.0/24 +141.101.66.0/23 +194.1.194.0/24 +108.162.236.0/22 +103.172.111.0/24 +45.145.28.0/23 +103.244.116.0/22 +104.234.158.0/24 +23.247.163.0/24 +45.14.174.0/24 +170.114.45.0/24 +154.84.26.0/23 +173.245.54.0/24 +154.219.3.0/24 +103.21.244.0/24 
+185.213.240.0/24 +103.79.228.0/23 +185.234.22.0/24 +141.101.76.0/23 +199.27.128.0/22 +141.101.110.0/24 +185.67.124.0/24 +185.170.166.0/24 +193.17.206.0/24 +104.31.0.0/16 +147.78.140.0/24 +185.213.243.0/24 +154.84.14.0/23 +185.193.28.0/22 +103.160.204.0/24 +104.30.128.0/17 +185.176.26.0/24 +141.101.84.0/22 +104.30.8.0/21 +195.85.23.0/24 +89.47.56.0/23 +203.28.8.0/23 +141.101.96.0/21 +45.159.216.0/22 +203.34.80.0/24 +66.235.200.0/24 +173.245.58.0/23 +108.162.192.0/20 +141.101.108.0/23 +203.23.103.0/24 +141.101.65.0/24 +103.204.13.0/24 +212.239.86.0/24 +45.87.175.0/24 +185.18.250.0/24 +188.244.122.0/24 +23.227.60.0/24 +203.89.5.0/24 +141.193.213.0/24 +193.9.49.0/24 +108.162.212.0/23 +194.40.240.0/23 +141.101.68.0/22 +45.137.99.0/24 +203.24.108.0/23 +66.81.247.0/24 +173.245.48.0/20 +203.23.104.0/24 +185.135.9.0/24 +193.16.63.0/24 +194.169.194.0/24 +141.101.64.0/18 +141.11.194.0/23 +141.101.112.0/20 +108.162.210.0/23 +188.114.106.0/23 +23.141.168.0/24 +185.174.138.0/24 +104.16.0.0/13 +154.83.22.0/24 +188.114.108.0/24 +108.162.240.0/21 +45.142.120.0/24 +103.11.212.0/24 +104.30.2.0/23 +203.22.223.0/24 +141.101.92.0/22 +45.84.59.0/24 +45.80.111.0/24 +203.30.188.0/22 +203.34.28.0/24 +203.13.32.0/24 +108.162.255.0/24 +23.227.37.0/24 +45.8.211.0/24 +104.30.32.0/19 +194.36.55.0/24 +154.51.129.0/24 +185.176.24.0/24 +193.67.144.0/24 +154.51.160.0/24 +103.21.244.0/22 +194.36.49.0/24 +103.81.228.0/24 +154.83.2.0/24 +199.181.197.0/24 +199.212.90.0/24 +173.245.63.0/24 +195.85.59.0/24 +195.242.122.0/23 +45.12.30.0/23 +154.84.175.0/24 +203.19.222.0/24 +154.83.30.0/24 +147.78.121.0/24 +188.42.88.0/23 +185.146.172.0/23 +108.165.216.0/24 +193.188.14.0/24 +103.22.201.0/24 +192.65.217.0/24 +173.245.49.0/24 +188.114.96.0/20 +23.227.38.0/23 +141.101.90.0/24 +154.84.16.0/24 +108.162.248.0/23 +45.8.104.0/22 \ No newline at end of file diff --git a/config/cfchallenger.py b/config/cfchallenger.py new file mode 100644 index 00000000..0cbc5091 --- /dev/null +++ b/config/cfchallenger.py @@ -0,0 
class ChallengePlatform(Enum):
    """
    Cloudflare challenge platform URI paths.

    Each value is a regular expression; ``CloudflareSolver.detect_challenge``
    searches the page HTML for it to decide which challenge is being served.
    """

    JAVASCRIPT = "/cdn-cgi/challenge-platform/h/[bg]/orchestrate/jsch/v1"
    MANAGED = "/cdn-cgi/challenge-platform/h/[bg]/orchestrate/managed/v1"
    CAPTCHA = "/cdn-cgi/challenge-platform/h/[bg]/orchestrate/captcha/v1"


class CloudflareSolver:
    """
    A class for solving Cloudflare challenges with Playwright.

    Parameters
    ----------
    user_agent : str
        The user agent string to use for the browser requests.
    timeout : int
        The browser default timeout in seconds.
    http2 : bool
        Enable or disable the usage of HTTP/2 for the browser requests.
    http3 : bool
        Enable or disable the usage of HTTP/3 for the browser requests.
    headless : bool
        Enable or disable headless mode for the browser.
    proxy : Optional[str]
        The proxy server URL to use for the browser requests.

    Attributes
    ----------
    page : playwright.sync_api.Page
        The Playwright page.
    cookies : Cookies
        The cookies from the current page.

    Methods
    -------
    extract_clearance_cookie(cookies: Cookies) -> Optional[Dict[str, Any]]
        Extract the Cloudflare clearance cookie from a list of cookies.
    detect_challenge() -> Optional[ChallengePlatform]
        Detect the Cloudflare challenge platform on the current page.
    solve_challenge()
        Solve the Cloudflare challenge on the current page.
    """

    def __init__(
        self,
        *,
        user_agent: str,
        timeout: int,
        http2: bool,
        http3: bool,
        headless: bool,
        proxy: Optional[str],
    ) -> None:
        self._playwright = sync_playwright().start()

        try:
            proxy_params = None if proxy is None else self._parse_proxy(proxy)

            browser = self._playwright.firefox.launch(
                firefox_user_prefs={
                    "network.http.http2.enabled": http2,
                    "network.http.http3.enable": http3,
                },
                headless=headless,
                proxy=proxy_params,
            )

            # The user agent must actually be applied to the context: callers
            # record it next to the cookie they obtain from this browser.
            context = browser.new_context(user_agent=user_agent)
            # Playwright timeouts are expressed in milliseconds.
            context.set_default_timeout(timeout * 1000)
            self.page = context.new_page()
        except BaseException:
            # Do not leave the Playwright driver running when setup fails.
            self._playwright.stop()
            raise

    def __enter__(self) -> "CloudflareSolver":
        return self

    def __exit__(self, *args: Any) -> None:
        self._playwright.stop()

    @staticmethod
    def _parse_proxy(proxy: str) -> Dict[str, str]:
        """
        Parse a proxy URL string into a dictionary of proxy parameters for the Playwright browser.

        Parameters
        ----------
        proxy : str
            Proxy URL string, either ``scheme://host[:port]`` or
            ``scheme://username:password@host[:port]``.

        Returns
        -------
        Dict[str, str]
            Dictionary of proxy parameters (``server`` and, when credentials
            are present, ``username`` and ``password``).

        Raises
        ------
        ValueError
            If the URL contains ``@`` but is not of the form
            ``scheme://username:password@host[:port]``.
        """
        if "@" not in proxy:
            return {"server": proxy}

        # The username stops at the first ":" so that passwords containing
        # ":" are kept intact; the host is everything after the last "@".
        match = re.fullmatch(
            r"(?P<scheme>[^:/]+)://(?P<username>[^:@]+):(?P<password>.+)@(?P<host>[^@]+)",
            proxy,
        )

        if match is None:
            # The URL is deliberately left out of the message: it may hold credentials.
            raise ValueError(
                "Proxy URL with credentials must look like "
                "scheme://username:password@host[:port]"
            )

        return {
            "server": f"{match['scheme']}://{match['host']}",
            "username": match["username"],
            "password": match["password"],
        }

    def _get_turnstile_frame(self) -> "Optional[Frame]":
        """
        Get the Cloudflare turnstile frame.

        Returns
        -------
        Optional[Frame]
            Cloudflare turnstile frame, or None if the page has no such frame.
        """
        turnstile_url = re.compile(
            "https://challenges.cloudflare.com/cdn-cgi/challenge-platform/h/[bg]/turnstile"
        )

        for frame in self.page.frames:
            if turnstile_url.match(frame.url) is not None:
                return frame

        return None

    @property
    def cookies(self) -> "Cookies":
        """
        The cookies from the current page.

        Returns
        -------
        Cookies
            List of cookies.
        """
        return self.page.context.cookies()

    @staticmethod
    def extract_clearance_cookie(cookies: "Cookies") -> Optional[Dict[str, Any]]:
        """
        Extract the Cloudflare clearance cookie from a list of cookies.

        Parameters
        ----------
        cookies : Cookies
            List of cookies.

        Returns
        -------
        Optional[Dict[str, Any]]
            cf_clearance cookie dictionary, or None if it is not present.
        """
        for cookie in cookies:
            if cookie["name"] == "cf_clearance":
                return cookie

        return None

    def detect_challenge(self) -> Optional[ChallengePlatform]:
        """
        Detect the Cloudflare challenge platform on the current page.

        Returns
        -------
        Optional[ChallengePlatform]
            Cloudflare challenge platform, or None if no platform path is
            found in the page HTML.
        """
        html = self.page.content()

        for platform in ChallengePlatform:
            if re.search(platform.value, html) is not None:
                return platform

        return None

    def solve_challenge(self) -> None:
        """
        Solve the Cloudflare challenge on the current page.

        Loops until a cf_clearance cookie exists or no challenge is detected
        any more, clicking the verify button or the turnstile checkbox and
        reloading the page when an hCaptcha frame is shown.
        """
        verify_button_pattern = re.compile(
            "Verify (I am|you are) (not a bot|(a )?human)"
        )

        verify_button = self.page.get_by_role("button", name=verify_button_pattern)
        challenge_spinner = self.page.locator("#challenge-spinner")
        challenge_stage = self.page.locator("div#challenge-stage")

        # NOTE: when none of the branches below applies, the loop re-checks
        # the page immediately without waiting.
        while (
            self.extract_clearance_cookie(self.cookies) is None
            and self.detect_challenge() is not None
        ):
            if challenge_spinner.is_visible():
                challenge_spinner.wait_for(state="hidden")

            if verify_button.is_visible():
                verify_button.click()
                challenge_stage.wait_for(state="hidden")
                continue

            # Look the frame up once: a second lookup could return None if
            # the frame is detached in between.
            turnstile_frame = self._get_turnstile_frame()

            if turnstile_frame is not None:
                turnstile_frame.get_by_role("checkbox").click()
                challenge_stage.wait_for(state="hidden")
            elif any(
                frame.url.startswith("https://cf-assets.hcaptcha.com/captcha/v1")
                for frame in self.page.frames
            ):
                self.page.reload()
def _build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser used by ``main``."""
    parser = argparse.ArgumentParser(
        description="A simple program for scraping Cloudflare clearance (cf_clearance) cookies from websites issuing Cloudflare challenges to visitors"
    )

    parser.add_argument(
        "url",
        metavar="URL",
        help="The URL to scrape the Cloudflare clearance cookie from",
        type=str,
    )

    parser.add_argument(
        "-f",
        "--file",
        default=None,
        help="The file to write the Cloudflare clearance cookie information to, in JSON format",
        type=str,
    )

    parser.add_argument(
        "-t",
        "--timeout",
        default=15,
        help="The browser default timeout in seconds",
        type=int,
    )

    parser.add_argument(
        "-p",
        "--proxy",
        default=None,
        help="The proxy server URL to use for the browser requests (SOCKS5 proxy authentication is not supported)",
        type=str,
    )

    parser.add_argument(
        "-ua",
        "--user-agent",
        default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
        help="The user agent to use for the browser requests",
        type=str,
    )

    parser.add_argument(
        "--disable-http2",
        action="store_true",
        help="Disable the usage of HTTP/2 for the browser requests",
    )

    parser.add_argument(
        "--disable-http3",
        action="store_true",
        help="Disable the usage of HTTP/3 for the browser requests",
    )

    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Run the browser in headed mode",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase the output verbosity",
    )

    return parser


def _save_clearance_cookie(
    file_path: str,
    clearance_cookie: Dict[str, Any],
    user_agent: str,
    proxy: Optional[str],
) -> None:
    """Append the clearance cookie information to the JSON file at ``file_path``."""
    # Local import: only this helper needs it and the module-level import
    # list is left untouched.
    from datetime import timezone

    try:
        with open(file_path, encoding="utf-8") as file:
            json_data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        json_data = {"clearance_cookies": []}

    if not isinstance(json_data, dict):
        # Valid JSON of an unexpected shape: refuse to clobber it.
        logging.error(
            "%s does not contain a JSON object; not overwriting it.", file_path
        )
        return

    # Get the unix timestamp using the cookie's expiration date minus one year
    unix_timestamp = clearance_cookie["expires"] - 31557600
    # Same naive-UTC ISO string that datetime.utcfromtimestamp() produced,
    # without relying on that deprecated function.
    timestamp = (
        datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )

    json_data.setdefault("clearance_cookies", []).append(
        {
            "unix_timestamp": unix_timestamp,
            "timestamp": timestamp,
            "domain": clearance_cookie["domain"],
            "cf_clearance": clearance_cookie["value"],
            "user_agent": user_agent,
            "proxy": proxy,
        }
    )

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)


def main() -> None:
    """
    Command-line entry point.

    Opens the URL in a browser, solves the Cloudflare challenge if one is
    detected, prints the resulting page HTML to stdout, prints the
    cf_clearance cookie value (when not verbose) and optionally appends the
    cookie information to a JSON file.
    """
    args = _build_argument_parser().parse_args()
    logging_level = logging.INFO if args.verbose else logging.ERROR

    logging.basicConfig(
        format="[%(asctime)s] [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        level=logging_level,
    )

    logging.info("Launching %s browser...", "headed" if args.debug else "headless")

    challenge_messages = {
        ChallengePlatform.JAVASCRIPT: "Solving Cloudflare challenge [JavaScript]...",
        ChallengePlatform.MANAGED: "Solving Cloudflare challenge [Managed]...",
        ChallengePlatform.CAPTCHA: "Solving Cloudflare challenge [CAPTCHA]...",
    }

    with CloudflareSolver(
        user_agent=args.user_agent,
        timeout=args.timeout,
        http2=not args.disable_http2,
        http3=not args.disable_http3,
        headless=not args.debug,
        proxy=args.proxy,
    ) as solver:
        logging.info("Going to %s...", args.url)

        try:
            solver.page.goto(args.url)
        except PlaywrightError as err:
            logging.error(err)
            return

        challenge_platform = solver.detect_challenge()

        if challenge_platform is None:
            logging.error("No Cloudflare challenge detected.")
            return

        logging.info(challenge_messages[challenge_platform])

        try:
            solver.solve_challenge()
        except PlaywrightError as err:
            logging.error(err)

        clearance_cookie = solver.extract_clearance_cookie(solver.cookies)

        # wait_for_load_state() returns at once when the page has already
        # loaded, whereas wait_for_event("load") would wait for a *new* load
        # event and time out. A failure here must not discard the cookie.
        try:
            solver.page.wait_for_load_state("load")
            content = solver.page.content()
        except PlaywrightError as err:
            logging.error(err)
        else:
            print(content)

    if clearance_cookie is None:
        logging.error("Failed to retrieve the Cloudflare clearance cookie.")
        return

    if not args.verbose:
        print(clearance_cookie["value"])

    logging.info("Cookie: cf_clearance=%s", clearance_cookie["value"])
    logging.info("User agent: %s", args.user_agent)

    if args.file is None:
        return

    logging.info("Writing Cloudflare clearance cookie information to %s...", args.file)
    _save_clearance_cookie(args.file, clearance_cookie, args.user_agent, args.proxy)
def substring_after(s, delim):
    """Return the part of ``s`` after the first ``delim`` ('' if ``delim`` is absent)."""
    return s.partition(delim)[2]


def substring_before(s, delim):
    """Return the part of ``s`` before the first ``delim`` (all of ``s`` if absent)."""
    return s.partition(delim)[0]


def substring_between(s, before, after):
    """Return the text between the first ``before`` and the next ``after`` in ``s``."""
    return substring_before(substring_after(s, before), after)


def getPageContent(url):
    """
    Run ``cfchallenger.py`` (located in ``path``) for ``url`` and return its output.

    The script is started with the current interpreter and an argument list
    rather than a shell command string, so characters in ``url`` or in the
    script path (spaces, ``;``, ``&``, ``{``, ``}``) are passed through
    literally instead of being interpreted by a shell or by ``str.format``.

    Returns the combined stdout/stderr text of the script with one trailing
    newline removed, regardless of the script's exit status.
    """
    # Local import so the module-level import list does not have to change.
    import sys

    script = os.path.join(str(path), "cfchallenger.py")
    completed = subprocess.run(
        [sys.executable, script, url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # keep stderr merged into the output
        text=True,
        check=False,
    )

    output = completed.stdout
    # Mirror subprocess.getoutput(), which drops a single trailing newline.
    return output[:-1] if output.endswith("\n") else output