prepare.py
#!/usr/bin/env python
# coding: utf-8

import unicodedata
import re
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pandas as pd
import wrangle  # local helper module; basic_prep_ac expects it to provide find_readme_length()
###########################################################################################################################################################################
def basic_prep_ac():
    '''This function reads in the animal crossing csv, adds a readme_length column,
    and drops rows with missing values.'''
    df = pd.read_csv('animal_crossing.csv', index_col=0)
    # finding the length of the readme
    df['readme_length'] = wrangle.find_readme_length()
    # removing the nulls
    df.dropna(inplace=True)
    return df
###########################################################################################################################################################################
def clean(text):
    '''This function cleans up the text data from the readme_contents.'''
    wnl = nltk.stem.WordNetLemmatizer()
    stopword_list = nltk.corpus.stopwords.words('english')
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    # Return a joined string of lemmatized words with stopwords removed
    return " ".join([wnl.lemmatize(word) for word in words if word not in stopword_list])
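# Example usage (a minimal sketch; assumes the NLTK 'wordnet' and 'stopwords'
# corpora have been downloaded via nltk.download):
# >>> clean("The cats are RUNNING, & jumping!")
# 'cat running jumping'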
###########################################################################################################################################################################
def split(df):
    """
    3 way split for train, validate, and test datasets.
    Splits off 20% for test, then 30% of the remainder for validate
    (roughly 56% train / 24% validate / 20% test).
    """
    train, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train, test_size=.3, random_state=123)
    return train, validate, test
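# Example usage (a quick sketch, assuming df is a 1,000-row dataframe):
# >>> train, validate, test = split(df)
# >>> len(train), len(validate), len(test)
# (560, 240, 200)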
###########################################################################################################################################################################
def basic_clean(text):
    '''
    This function takes in a string and normalizes it by lowercasing
    everything and removing anything that is not a letter, number,
    whitespace, or a single quote.
    '''
    # lowercase all letters in the text
    text = text.lower()
    # normalize unicode by encoding into ASCII (ignoring non-ASCII characters)
    # then decoding back into unicode
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore'))
    # remove anything that is not a letter, number, single quote, or whitespace
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text
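# Example usage (a minimal sketch):
# >>> basic_clean("Café déjà-vu: 100%!")
# 'cafe dejavu 100'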
###########################################################################################################################################################################
def tokenize(text):
    '''
    This function takes in a string and returns the string with the
    words tokenized.
    '''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    text = tokenizer.tokenize(text, return_str=True)
    return text
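# Example usage (a minimal sketch): Toktok pads punctuation with spaces,
# so the output is roughly:
# >>> tokenize("Hello, world!")
# 'Hello , world !'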
###########################################################################################################################################################################
def stem(text):
    '''
    This function takes in a string and returns the string after applying
    stemming to all the words.
    '''
    # Create the porter stemmer
    ps = nltk.stem.PorterStemmer()
    # Apply the stemmer to each word in our string.
    stems = [ps.stem(word) for word in text.split()]
    # Join our list of words into a string again
    text_stemmed = ' '.join(stems)
    return text_stemmed
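# Example usage (a minimal sketch):
# >>> stem("calling called calls")
# 'call call call'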
###########################################################################################################################################################################
def lemmatize(text):
    '''
    This function takes in a string and returns the string after applying
    lemmatization to all the words.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    # Join our list of words into a string again; assign to a variable to save changes.
    text_lemmatized = ' '.join(lemmas)
    return text_lemmatized
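# Example usage (a minimal sketch; assumes the NLTK 'wordnet' corpus is downloaded):
# >>> lemmatize("mice geese dogs")
# 'mouse goose dog'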
###########################################################################################################################################################################
def remove_stopwords(text, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string and optional lists of extra_words to add to
    the stopword list and exclude_words to drop from it, and returns the string
    after removing the stopwords.
    '''
    # Define the stop word list
    stopword_list = stopwords.words('english')
    # add extra_words (if any) to the stopwords list
    if len(extra_words) > 0:
        stopword_list = stopword_list + extra_words
    # remove exclude_words (if any) from the stopwords list
    if len(exclude_words) > 0:
        stopword_list = [word for word in stopword_list if word not in exclude_words]
    # Split words in text.
    words = text.split()
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    # Join words in the list back into a string; assign to a variable to keep changes.
    text_without_stopwords = ' '.join(filtered_words)
    return text_without_stopwords
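# Example usage (a minimal sketch; assumes the NLTK 'stopwords' corpus is downloaded).
# Here 'day' is added to the stopword list and 'not' is excluded from it:
# >>> remove_stopwords("this is not a very good day", extra_words=['day'], exclude_words=['not'])
# 'not good'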
###########################################################################################################################################################################
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name of a text column, with the
    option to pass lists for extra_words and exclude_words, and returns a df
    with the article title, original text, stemmed text, lemmatized text, and
    cleaned (tokenized, stopwords removed, lemmatized) text.
    '''
    df['clean'] = (df[column].apply(basic_clean)
                             .apply(tokenize)
                             .apply(remove_stopwords,
                                    extra_words=extra_words,
                                    exclude_words=exclude_words)
                             .apply(lemmatize))
    df['stemmed'] = df[column].apply(basic_clean).apply(stem)
    df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)
    return df[['title', column, 'stemmed', 'lemmatized', 'clean']]
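# Example usage (a hypothetical sketch; assumes the dataframe has 'title' and
# 'readme_contents' columns, as suggested by the clean() docstring above):
# >>> df = basic_prep_ac()
# >>> prepped = prep_article_data(df, 'readme_contents', extra_words=['ha'], exclude_words=['no'])
# >>> prepped.columns.tolist()
# ['title', 'readme_contents', 'stemmed', 'lemmatized', 'clean']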