-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_active_sites.py
86 lines (75 loc) · 2.82 KB
/
check_active_sites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import re
import requests
import csv
from datetime import datetime, timedelta
# Constants
MARKDOWN_FILE = 'readme.md'  # input markdown file scanned for URLs
OUTPUT_TEXT_FILE = 'urls.txt'  # plain-text dump of every extracted URL
OUTPUT_CSV_FILE = 'url_status.csv'  # per-URL status report written at the end
# A URL counts as "active" when its Last-Modified date is within 30 days.
# NOTE(review): naive local datetime is later compared against the parsed
# Last-Modified header (also naive) — confirm timezone handling is acceptable.
CURRENT_THRESHOLD = datetime.now() - timedelta(days=30)
TIMEOUT = 10  # seconds, per HTTP request
def extract_urls_from_markdown(file_path):
    """Extract all http/https URLs from a markdown file.

    Args:
        file_path: Path to the markdown file to scan.

    Returns:
        A list of URL strings in document order (duplicates preserved),
        or an empty list when the file does not exist.
    """
    try:
        # Explicit UTF-8: relying on the platform default encoding can
        # raise UnicodeDecodeError (or misdecode) non-ASCII markdown.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Stop at whitespace or ')' so markdown link syntax `[x](url)`
        # does not leak a trailing parenthesis into the captured URL.
        return re.findall(r'https?://[^\s\)]+', content)
    except FileNotFoundError:
        print(f"Error: {file_path} not found.")
        return []
def get_last_modified_date(url):
    """Return the Last-Modified datetime of a URL, or None if unavailable.

    Sends a HEAD request and parses the ``Last-Modified`` response header
    when present.

    Args:
        url: The URL to probe.

    Returns:
        A naive datetime parsed from the header, or None when the header
        is absent or malformed, or the request fails.
    """
    try:
        # requests.head() does NOT follow redirects by default; without
        # allow_redirects=True a redirecting URL would yield the 3xx
        # response's headers, which usually lack Last-Modified.
        response = requests.head(url, timeout=TIMEOUT, allow_redirects=True)
        if 'Last-Modified' in response.headers:
            return datetime.strptime(response.headers['Last-Modified'], "%a, %d %b %Y %H:%M:%S %Z")
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
    except ValueError:
        # Malformed Last-Modified header — treat it the same as absent
        # rather than crashing the whole run.
        pass
    return None
def process_urls(urls):
    """Build a status row for every URL.

    Each row is a dict with keys 'URL', 'Last Updated' (YYYY-MM-DD or
    'Unknown') and 'Active' ('Yes'/'No'/'Unknown'), where activity means
    the last modification falls on or after CURRENT_THRESHOLD.
    """
    rows = []
    for link in urls:
        modified = get_last_modified_date(link)
        # No usable Last-Modified header: record the URL as unknown.
        if not modified:
            rows.append({'URL': link, 'Last Updated': 'Unknown', 'Active': 'Unknown'})
            continue
        rows.append({
            'URL': link,
            'Last Updated': modified.strftime("%Y-%m-%d"),
            'Active': 'Yes' if modified >= CURRENT_THRESHOLD else 'No',
        })
    return rows
def save_urls_to_text_file(urls, file_path):
    """Save extracted URLs to a text file, one per line.

    Args:
        urls: Iterable of URL strings to write.
        file_path: Destination path; overwritten if it exists.

    Errors are reported to stdout rather than raised, matching the
    script's best-effort style.
    """
    try:
        # Explicit UTF-8 so output is stable across platforms (URLs may
        # contain non-ASCII characters).
        with open(file_path, 'w', encoding='utf-8') as file:
            for url in urls:
                file.write(url + '\n')
    except IOError as e:
        print(f"Error writing to {file_path}: {e}")
def save_data_to_csv(data, file_path):
    """Save processed URL rows to a CSV file.

    Args:
        data: Iterable of dicts with keys 'URL', 'Last Updated', 'Active'.
        file_path: Destination CSV path; overwritten if it exists.

    Errors are reported to stdout rather than raised, matching the
    script's best-effort style.
    """
    try:
        # newline='' is required by the csv module; explicit UTF-8 keeps
        # the output independent of the platform default encoding.
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['URL', 'Last Updated', 'Active'])
            writer.writeheader()
            writer.writerows(data)
    except IOError as e:
        print(f"Error writing to {file_path}: {e}")
def main():
    """Run the pipeline: extract URLs, dump them, check status, write CSV."""
    # Step 1: Extract URLs from markdown file
    urls = extract_urls_from_markdown(MARKDOWN_FILE)
    if not urls:
        print("No URLs found or file is missing.")
        return
    # Step 2: Save URLs to text file
    save_urls_to_text_file(urls, OUTPUT_TEXT_FILE)
    # Step 3: Process URLs for last modified date and active status
    url_data = process_urls(urls)
    # Step 4: Save the processed data to a CSV file
    save_data_to_csv(url_data, OUTPUT_CSV_FILE)
    # Reference the constant (was hard-coded 'url_status.csv') so the
    # message stays correct if OUTPUT_CSV_FILE ever changes.
    print(f"Processing complete. Check '{OUTPUT_CSV_FILE}' for results.")
if __name__ == "__main__":
main()