-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.py
154 lines (114 loc) · 4.42 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
Helper functions to Preprocessing Data
"""
import pickle
import re
import string
from collections import defaultdict
from string import digits

import gensim
import pandas as pd
import spacy
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
def comments_to_words(sentences, deacc=True):
    '''
    Lazily tokenize each comment with gensim's ``simple_preprocess``.

    sentences: iterable of comments; every item is converted with ``str()``
        before it is tokenized.
    deacc: forwarded to ``simple_preprocess`` (per the gensim docs, True
        strips accent marks from the tokens). It used to be accepted but
        ignored; it is now honoured.

    Yields one list of tokens per input item, in input order.
    '''
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)
def remove_stopwords(data, custom_words):
    '''
    Tokenize every document and drop the tokens that are stop words.

    data: iterable of documents; each one is converted with ``str()`` and
        tokenized with gensim's ``simple_preprocess``.
    custom_words: iterable of extra words to drop, in addition to NLTK's
        English stop words and the characters of ``string.punctuation``.

    Returns a list with one list of remaining tokens per document.
    '''
    # Membership is tested once per token, so build the lookup as a set
    # a single time instead of scanning a list for every word.
    stop_words = set(stopwords.words('english'))
    stop_words.update(custom_words, string.punctuation)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in data
    ]
def lemmatization(data, allowed_word_types=('NOUN', 'ADJ', 'VERB', 'ADV')):
    '''
    Lemmatize tokenized documents, keeping only selected parts of speech.

    data: iterable of documents, each an iterable of word strings; the
        words are joined with single spaces before being handed to spaCy.
    allowed_word_types: container of spaCy coarse POS tags (``token.pos_``)
        to keep. The default is an immutable tuple so it cannot be
        accidentally shared and mutated between calls.

    Returns a list with one list of lemmas (``token.lemma_``) per document.

    The ``en_core_web_sm`` model is loaded on every call, with the parser
    and NER components disabled.
    '''
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return [
        [
            token.lemma_
            for token in nlp(" ".join(words))
            if token.pos_ in allowed_word_types
        ]
        for words in data
    ]
def department_star(x):
    '''
    Group the ``student_star`` values by ``department_name``.

    Returns a plain dict mapping each department to the list of its star
    values, in row order.
    '''
    pairs = zip(
        x['department_name'].values.tolist(),
        x['student_star'].values.tolist(),
    )
    grouped = {}
    for dept, rating in pairs:
        grouped.setdefault(dept, []).append(rating)
    return grouped
def dataframe(x):
    '''
    Build a de-duplicated frame of school, professor, department, state
    and star rating, sorted by school.

    The values are read from the ``school_name``, ``professor_name``,
    ``department_name``, ``state_name`` and ``star_rating`` columns of x.
    '''
    source_columns = (
        'school_name',
        'professor_name',
        'department_name',
        'state_name',
        'star_rating',
    )
    # Transpose the per-column lists into one tuple per row.
    rows = list(zip(*(x[name].values.tolist() for name in source_columns)))
    frame = pd.DataFrame(
        rows,
        columns=['School', 'Professor', 'Department', 'State', 'Star'],
    )
    return frame.drop_duplicates().sort_values('School')
def professor_tags(x):
    '''
    Build a de-duplicated frame pairing each professor with a cleaned
    tag string.

    Every ``tag_professor`` value is converted with ``str()`` and has its
    parentheses, digits and backslashes removed.
    '''
    # One table deletes all unwanted characters in a single pass.
    strip_table = str.maketrans('', '', '()' + digits + '\\')
    names = x['professor_name'].values.tolist()
    cleaned = [
        str(raw).translate(strip_table)
        for raw in x['tag_professor'].values.tolist()
    ]
    frame = pd.DataFrame(list(zip(names, cleaned)), columns=['Professor', 'Tags'])
    return frame.drop_duplicates()
def professor_tags_dict(x):
    '''
    Build, pickle and return a mapping of the form
    {prof: [(tag1, freq1), (tag2, freq2), ...]}.

    Each ``tag_professor`` cell is parsed as a sequence of tags, each one
    followed by its count in parentheses, e.g.
    ``"Tough grader (3)  Funny (12)"``. Missing cells are treated as an
    empty string and yield an empty list. Counts are kept as the digit
    strings found in the text. If a professor appears on several rows,
    the last row wins.

    x: DataFrame with ``professor_name`` and ``tag_professor`` columns;
        it is not modified.

    Side effect: writes the mapping (as a ``defaultdict(list)``) to
    ``prof_tags_dict.pickle`` in the current working directory.

    Returns the mapping as a plain dict.
    '''
    # A tag is any run of text followed by "(<digits>)". ``\d+`` captures
    # multi-digit counts such as "(12)" whole, and pairing tag and count
    # in one match keeps them aligned.
    tag_with_count = re.compile(r'(.+?)\s*\((\d+)\)')

    professors = x['professor_name'].values.tolist()
    # fillna returns a new Series, so the caller's frame stays untouched.
    tags = x['tag_professor'].fillna('').values.tolist()

    prof_tags_dict = defaultdict(list)
    for professor, raw_tags in zip(professors, tags):
        prof_tags_dict[professor] = [
            (tag.strip(), freq)
            for tag, freq in tag_with_count.findall(str(raw_tags))
        ]

    with open('prof_tags_dict.pickle', 'wb') as handle:
        pickle.dump(prof_tags_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return dict(prof_tags_dict)
def professor_diff(x):
    '''
    Build a two-column frame of professor and student difficulty.

    Values come from the ``professor_name`` and ``student_difficult``
    columns of x; the result gets a fresh default index.
    '''
    names = x['professor_name'].values.tolist()
    levels = x['student_difficult'].values.tolist()
    rows = [(name, level) for name, level in zip(names, levels)]
    return pd.DataFrame(rows, columns=['Professor', 'Difficulty'])
def date_star(x):
    '''
    Build a frame of professor, star rating and post date.

    Values come from the ``professor_name``, ``star_rating`` and
    ``post_date`` columns of x, one output row per input row.
    '''
    wanted = ('professor_name', 'star_rating', 'post_date')
    column_values = [x[name].values.tolist() for name in wanted]
    # Transpose the three column lists into (professor, star, date) rows.
    rows = list(zip(*column_values))
    return pd.DataFrame(rows, columns=['Professor', 'Star', 'Post Date'])
def diff_star(x):
    '''
    Build a frame of professor, difficulty and star rating.

    Values come from the ``professor_name``, ``student_difficult`` and
    ``star_rating`` columns of x, one output row per input row.
    '''
    names = x['professor_name'].values.tolist()
    stars = x['star_rating'].values.tolist()
    levels = x['student_difficult'].values.tolist()
    rows = []
    for row in zip(names, levels, stars):
        rows.append(row)
    result = pd.DataFrame(rows, columns=['Professor', 'Difficulty', 'Star'])
    return result