-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_az900.py
45 lines (36 loc) · 1.68 KB
/
scrape_az900.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import pandas as pd
from bs4 import BeautifulSoup
# Directory where the HTML files are stored
html_dir = "."
# List to store the extracted data
data = []
# Loop through each HTML file
for filename in os.listdir(html_dir):
if filename.endswith(".html"):
filepath = os.path.join(html_dir, filename)
# Open and parse the HTML file
with open(filepath, "r", encoding="utf-8") as file:
soup = BeautifulSoup(file, "lxml")
# Extract questions, answers, and explanations
# Adjust these based on the actual structure of the HTML
questions = soup.find_all("div", class_="question")
answers = soup.find_all("div", class_="answer")
explanations = soup.find_all("div", class_="explanation")
# Loop through each question, answer, and explanation
for i in range(min(len(questions), len(answers), len(explanations))):
question_text = questions[i].get_text(strip=True)
answer_text = answers[i].get_text(strip=True)
explanation_text = explanations[i].get_text(strip=True)
# Add the data to the list
data.append({
"Episode": filename.split("-")[1].replace(".html", ""),
"Question": question_text,
"Answer": answer_text,
"Explanation": explanation_text
})
# Convert the data to a DataFrame and save it as a CSV file
df = pd.DataFrame(data)
csv_path = "AZ-900_Practice_Test_Questions.csv"
df.to_csv(csv_path, index=False)
print(f"Scraping complete. Data saved to {csv_path}")