# data_collection.py
# Project Imports
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
def scrape_script():
    '''
    Scrapes 'The Social Network' script from imsdb.com and extracts Mark's dialogue.

    Character names are read from <b> tags; the bare text nodes that follow a
    name, up to the next <b> sibling, are joined into that speaker's dialogue.

    Returns: pandas.DataFrame containing Mark's dialogue with 'Speaker' and 'Dialogue' columns
             (empty, but still with both columns, when nothing is found)
    Raises: requests.HTTPError if the site answers with an error status,
            requests.Timeout if it does not answer within 30 seconds
    '''
    # Extract online script and begin parsing
    response = requests.get(
        'https://imsdb.com/scripts/Social-Network,-The.html',
        timeout=30,  # without a timeout a stalled connection would hang forever
    )
    # Fail loudly instead of silently parsing an error page into an empty result
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Rows are collected in a plain list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    rows = []

    # Iterate through <b> html tags - where character names are listed
    for tag in soup.find_all('b'):
        # Extract speaker name; only Mark's lines are kept
        speaker = tag.get_text(strip=True)
        if speaker != 'MARK':
            continue

        pieces = []
        next_element = tag.next_sibling
        # Walk siblings until the next speaker <b> tag. The explicit None check
        # keeps a falsy node (e.g. an empty text node) from ending the walk early.
        while next_element is not None and next_element.name != 'b':
            # Text nodes have no tag name; other tags are skipped
            if next_element.name is None:
                pieces.append(next_element.strip())
            next_element = next_element.next_sibling

        dialogue = ' '.join(pieces).strip()
        if dialogue:
            rows.append({'Speaker': speaker, 'Dialogue': dialogue})

    return pd.DataFrame(rows, columns=['Speaker', 'Dialogue'])
# Download the NLTK data used by the preprocessing below
nltk.download('punkt')
# NLTK 3.9+ makes word_tokenize load the 'punkt_tab' resource instead of 'punkt';
# without it tokenization fails with a LookupError. 'punkt' is still fetched
# for older NLTK releases.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
# Create the lemmatizer and the English stop-word set once, at module level,
# so preprocess_text can reuse them on every call
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    '''
    Normalizes a piece of text into a list of lemmatized, non-stop-word tokens.

    The text is lower-cased, every character other than a-z and whitespace is
    dropped, and the remainder is tokenized with word_tokenize. Tokens found in
    the module-level stop_words set are discarded; the rest are passed through
    the module-level lemmatizer.

    Returns: list of lemmatized tokens, in their original order
    '''
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for token in word_tokenize(letters_only):
        # Skip stop words before lemmatizing
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept
# NOTE(review): this looks like a module-level statement (indentation is lost in
# this copy); if so, it prints on every import, not only when run as a script.
print("Start script execution")
def process_script():
    '''
    Scrapes Mark's dialogue and adds a tokenized version of every line.

    Returns: the DataFrame from scrape_script() with an extra 'Tokens' column
             holding the preprocess_text() result for each 'Dialogue' entry
    '''
    # NOTE(review): looks like leftover debug output - confirm before removing
    print("wee")

    # Fetch the dialogue table
    frame = scrape_script()

    # Tokenize each dialogue entry and store the result alongside it
    token_lists = frame['Dialogue'].apply(preprocess_text)
    frame['Tokens'] = token_lists
    return frame
# Script entry point: build the processed dialogue table and print it
if __name__ == "__main__":
    print(process_script())