-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScriptEx.py
42 lines (34 loc) · 1.66 KB
/
ScriptEx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import csv
import re
from urllib.parse import urljoin

import requests
import tqdm
from bs4 import BeautifulSoup
# Root of the wiki; the category index URL is built from it.
base_url = "https://southpark.fandom.com"
index_url = base_url + "/wiki/Category:Scripts"

# Fetch the category index page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops us from
# parsing an HTTP error page as if it were the real index.
response = requests.get(index_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all the episode scripts in the index page.
# The same href can appear several times on one page; dict.fromkeys() drops
# the repeats while keeping first-seen order, so no page is fetched twice.
script_links = list(
    dict.fromkeys(
        a["href"]
        for a in soup.find_all("a", href=True)
        if "/wiki/File:" in a["href"]
    )
)
# Iterate through each script link and extract the content.
# `rows` accumulates [speaker, sentences] pairs across all script pages.
rows = []

# Matches one "[...]" group plus the whitespace around it. Compiled once here
# instead of being re-parsed for every table row.
bracketed_text = re.compile(r'\s*\[.*?\]\s*')

for script_link in tqdm.tqdm(script_links):
    # urljoin copes with both relative ("/wiki/...") and absolute hrefs;
    # plain string concatenation would corrupt an absolute one.
    script_url = urljoin(base_url, script_link)

    # One unreachable page must not throw away everything scraped so far:
    # report it (tqdm.write keeps the progress bar intact) and move on.
    try:
        response = requests.get(script_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        tqdm.tqdm.write(f"Skipping {script_url}: {exc}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the headerscontent table in the HTML. The second such table is the
    # one that is read; pages with fewer than two are skipped rather than
    # crashing the whole run with an IndexError.
    tables = soup.find_all('table', {'class': 'headerscontent'})
    if len(tables) < 2:
        tqdm.tqdm.write(f"Skipping {script_url}: no script table found")
        continue
    table = tables[1]

    # Extract the content from the table and store it in a list of rows
    for tr in table.find_all('tr'):
        cells = [cell.text for cell in tr.find_all('td')]
        # Only two-column rows with both columns occupied are kept.
        if len(cells) != 2:
            continue
        speaker_cell, line_cell = cells
        if not speaker_cell.strip() or not line_cell.strip():
            continue

        # Two labels only: Cartman, and "[OTH]" for every other speaker.
        if "Eric" in speaker_cell or "Cartman" in speaker_cell:
            speaker = "Cartman"
        else:
            speaker = "[OTH]"

        # Remove everything within square braces, then re-join the remaining
        # fragments with single spaces.
        fragments = bracketed_text.split(line_cell.strip())
        sentences = ' '.join(fragment for fragment in fragments if fragment)

        # A line made up solely of bracketed text has nothing left to record.
        if not sentences:
            continue
        rows.append([speaker, sentences])
# Write the content of the list to a CSV file.
filename = "all_scripts_filtered.csv"
# encoding is pinned to UTF-8 so the output does not depend on the platform's
# default locale (which can raise UnicodeEncodeError on characters it cannot
# represent). newline='' is what the csv module requires so that it controls
# line endings itself.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Speaker", "Sentences"])  # header row
    writer.writerows(rows)
print(f"All the scripts have been extracted, filtered, and saved to the file {filename}.")