-
Notifications
You must be signed in to change notification settings - Fork 0
/
check.py
63 lines (51 loc) · 2.07 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import requests
from urllib.parse import quote
import json
import os
from dotenv import load_dotenv
# Batch-check domains from lists/clean.csv against the extraction service.
#
# For every domain an `http://` URL is built and sent to `{API_URL}/extract`.
# The outcome is appended to files under `local/`:
#   - bad_host.json : JSON responses whose first `urls` entry has status == 0
#   - success.json  : all other parseable JSON responses
#   - complete.csv  : `domain_name,url` for every parseable 200 response
#   - failure.csv   : URLs whose request failed or returned a non-200 status

# Load environment variables from .env file
load_dotenv()

# Base URL of the extraction service; overridable through the environment.
API_URL = os.environ.get('API_URL', 'http://localhost:3000')

# Seconds to wait for the service per request. Without a timeout a single
# unresponsive request would block the whole batch forever.
REQUEST_TIMEOUT = float(os.environ.get('REQUEST_TIMEOUT', '60'))

# Number of data rows to skip at the top of the CSV (the header row is kept),
# so processing starts at data row SKIP_DATA_ROWS + 1.
# NOTE(review): presumably these rows were handled by an earlier run - confirm.
SKIP_DATA_ROWS = 11376


def _append_line(path, text):
    """Append *text* followed by a newline to the file at *path*."""
    with open(path, 'a') as f:
        f.write(text + '\n')


def _append_json(path, payload):
    """Append *payload* to the file at *path* as one JSON document per line."""
    with open(path, 'a') as f:
        json.dump(payload, f)
        f.write('\n')


# Load the CSV file, keeping the header (line 0) and skipping the first
# SKIP_DATA_ROWS data rows.
df = pd.read_csv('lists/clean.csv', skiprows=range(1, SKIP_DATA_ROWS + 1))

# Create the results directory if it doesn't exist (no check-then-create race).
os.makedirs('local', exist_ok=True)

# Create a new column 'url' using the 'domain_name' column
df['url'] = 'http://' + df['domain_name'].str.lower()

# Rows without a domain name would produce a NaN URL that cannot be encoded.
pending = df.dropna(subset=['domain_name'])

# Loop through all URLs in the dataframe
for domain_name, url in zip(pending['domain_name'], pending['url']):
    # Percent-encode everything so the URL is safe as a query-string value.
    encoded_url = quote(url, safe='')

    print(f"Processing URL: {url}")

    # Send the GET request; a network error must not abort the whole batch.
    try:
        response = requests.get(
            f'{API_URL}/extract?url={encoded_url}',
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        _append_line('local/failure.csv', url)
        print(f"Failure: URL {url} saved to failure.csv. Request error: {exc}")
        continue

    # Check the HTTP status code
    if response.status_code != 200:
        _append_line('local/failure.csv', url)
        print(f"Failure: URL {url} saved to failure.csv. HTTP Status code: {response.status_code}")
        continue

    # ValueError is the common base of the JSON decode errors that
    # `response.json()` can raise across requests versions.
    try:
        data = response.json()
    except ValueError:
        print(f"Failure: Could not decode JSON response for URL {url}")
        continue

    # Only the first entry of the `urls` mapping is inspected.
    try:
        url_data = next(iter(data['urls'].values()))
        url_status = url_data.get('status')
    except (KeyError, StopIteration, AttributeError, TypeError):
        print(f"Failure: Unexpected JSON structure in response for URL {url}")
        continue

    if url_status == 0:
        _append_json('local/bad_host.json', data)
        print(f"Bad Host: Data for URL {url} saved to bad_host.json.")
    else:
        _append_json('local/success.json', data)
        print(f"Success: Data for URL {url} saved to success.json.")

    # NOTE(review): the original indentation was lost; this assumes the
    # complete.csv record is written for both outcomes above - confirm.
    _append_line('local/complete.csv', f"{domain_name},{url}")
    print(f"Success: Domain {domain_name} and URL {url} saved to complete.csv.")