-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper.py
81 lines (60 loc) · 2.74 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import requests
from bs4 import BeautifulSoup
import json
def scraper_function(url):
#add your cookie header string in a .txt file and add it here
with open(r"cookies/switching_linear.txt", "r") as file:
# Read the contents of the file and strip any leading or trailing whitespace
cookies = file.read().strip()
# Replace these with the actual URL and cookie header string
cookie_header = {"Cookie": cookies}
# Use a session for connection re-use and efficiency
with requests.Session() as session:
# Send a GET request to the specified URL with the provided headers
response = session.get(url, headers=cookie_header)
# Check if the request was successful
if response.status_code == 200:
# Parse the HTML content of the page
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")
for script in soup.find_all('script'):
if script.get('type') == 'math/tex':
script.replace_with(script.text.strip())
# Find all question divs
question_divs = soup.find_all(
"div", class_=["qt-mc-question", "qt-nm-question"]
)
# Initialize list to store questions
questions = []
for question_number, question_div in enumerate(question_divs, start=1):
question_text = question_div.find("div", class_="qt-question").get_text()
img_tags = question_div.find("div", class_="qt-question").find_all(
"img"
)
image_urls = [img["src"] for img in img_tags if "src" in img.attrs]
choices_div = question_div.find("div", class_="qt-choices")
choices = (
[
label.text.strip().replace("\t", "")
for label in choices_div.find_all("label")
]
if choices_div
else []
)
questions.append(
{
"question_number": question_number,
"question_text": question_text,
"choices": choices,
"image_urls": image_urls,
}
)
# with open("scraped_data.json", "w") as file:
# json.dump(questions, file, indent=4)
return questions
else:
print(f"Failed to scrape {url}. Status code: {response.status_code}")
return []
# scraper_function(
# "https://onlinecourses.nptel.ac.in/noc24_cs03/unit?unit=106&assessment=107"
# )