pdf_maker.py
import os
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
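
# Third-party dependencies assumed to be installed:
#   pip install requests beautifulsoup4 weasyprint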


def scrape_to_pdf(page_url, output_dir, number):
    """
    page_url: URL of the page to scrape
    output_dir: Directory to save the PDF files
    number: Index of the PDF file

    Fetches the page content, extracts the main content, and converts it to a PDF.
    """
    # Get page content
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to retrieve {page_url}")
        return

    # Parse the content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the main content within the div#main
    main_content = soup.find("div", {"id": "main", "class": "w3-col l10 m12"})
    if not main_content:
        print(f"Main content not found on {page_url}")
        return

    # Extract title for the PDF file name
    title = soup.find("h1").get_text().strip().replace(" ", "_")
    title = title.replace("/", "_")
    # Zero-pad the page index so the PDF files sort in page order
    filename = f"{output_dir}/{number:02d}.{title}.pdf"
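    # Example (illustrative only): page 3 with the <h1> "CSS Introduction" would be saved as
    # "<output_dir>/03.CSS_Introduction.pdf".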
    # Convert the extracted HTML to a PDF
    HTML(string=str(main_content)).write_pdf(filename)
    print(f"Saved {filename}")


def scrape_course_from_array(urls):
    """
    urls: list of URLs to scrape

    Makes the output directory and scrapes each URL using scrape_to_pdf.
    """
    # Build the output directory name from the first URL
    parts = urls[0].split('/')
    if len(parts) >= 6:
        output_dir = "output/"
        for count, part in enumerate(parts):
            if count == 2:
                output_dir += part.split('.')[1]  # domain name without the "www." prefix
            elif count in (3, 4):
                output_dir += '_' + part  # first two path segments
        output_dir += '_pdf'
    else:
        output_dir = f"output/{parts[2].split('.')[1]}_{parts[3]}_pdf"
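    # For 'https://www.w3schools.com/css/default.asp' the URL splits into five parts, so the
    # else branch yields "output/w3schools_css_pdf"; a deeper (hypothetical) URL such as
    # 'https://www.w3schools.com/python/ref/func_len.asp' would yield "output/w3schools_python_ref_pdf".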
    # Create the output directory (and its parent 'output' folder) if it doesn't exist
    try:
        if os.path.isdir(output_dir):
            print('Directory already exists')
        else:
            os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating directory: {e}")
        return
    # Loop through each URL and scrape it
    i = 0
    for url in urls:
        page_url = url.strip()  # Remove any leading/trailing whitespace
        if page_url:  # Ensure the URL is not empty
            i += 1
            scrape_to_pdf(page_url, output_dir, i)
    print(f"\nScraped {i} pages to {output_dir}")


if __name__ == "__main__":
    # Quick test with a hard-coded list of URLs; any list of page URLs
    # (for example, one read from a urls.txt file) works with scrape_course_from_array
    scrape_course_from_array([
        'https://www.w3schools.com/css/default.asp',
        'https://www.w3schools.com/css/default.asp',
    ])
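
    # Untested sketch: to read the URLs from a text file instead (one URL per line),
    # something along these lines should work:
    #
    #     with open('urls.txt') as f:
    #         scrape_course_from_array(f.read().splitlines())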