# -*- coding: utf-8 -*-
"""part1 2_21.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Me67PabGSdyCNiTc3xXtXBE37rNuj98u
"""
import pandas as pd
import numpy as np
from scipy import spatial
import string
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

# stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# lemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
# download pretrained GloVe word embeddings (Colab shell commands)
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip

# keep only the first 1000 vectors; GloVe files list the most frequent words first
!head -n 1000 glove.42B.300d.txt > top_1000.txt
# parse the GloVe text format: each line is a word followed by 300 floats
embeddings = {}
with open('top_1000.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
words_with_embeddings = set(embeddings)
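
# Quick sanity check of the loaded vectors (a minimal sketch: it assumes
# 'good' and 'great' both fall within the first 1000 GloVe entries, which
# hold the highest-frequency words).
if 'good' in embeddings and 'great' in embeddings:
    sim = 1 - spatial.distance.cosine(embeddings['good'], embeddings['great'])
    print(f"cosine similarity(good, great) = {sim:.3f}")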
# imports for sentiment analysis; make sure the package is installed
# by running the following
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# needed for plotting
import matplotlib.pyplot as plt
def clean(text):
    '''
    Cleans text by removing emojis, punctuation, and other non-alphanumeric
    characters, lower-casing, splitting into a list of words, dropping words
    that contain digits or special characters, and removing English stop words.
    '''
    # remove emojis (common emoji Unicode ranges)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]', '', text)
    # remove any remaining non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # strip punctuation (redundant after the regex above, but harmless)
    text = text.translate(str.maketrans('', '', string.punctuation))
    description = text.lower().strip()
    words = description.split(" ")
    stops = set(stopwords.words('english'))
    # filter out words containing digits or leftover special characters
    new_list = [x for x in words if not any(c.isdigit() or c in ['+', '-', '/', '"', "'"] for c in x)]
    # remove any empty strings
    new_list = list(filter(None, new_list))
    # filter out stop words
    new_list = [w for w in new_list if w not in stops]
    return new_list
# clean(mission_statement)  # NOTE: `mission_statement` is not defined in this file
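
# Illustrative check of clean() on a made-up sentence (not from the
# original notebook):
print(clean("UCLA's mission is education, research and 123 service!"))
# expected: ['uclas', 'mission', 'education', 'research', 'service']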
def stem(words):
    '''
    Takes in a clean list of words and uses the Porter stemmer to return a
    list of stems without duplicates.
    '''
    new_list = []
    for w in words:
        x = ps.stem(w)
        if x not in new_list:
            new_list.append(x)
    return new_list
def lemma(words):
    '''
    Takes in a clean list of words and uses the WordNet lemmatizer to return
    a list of lemmas without duplicates.
    '''
    new_list = []
    for w in words:
        x = lemmatizer.lemmatize(w)
        if x not in new_list:
            new_list.append(x)
    return new_list
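
# Stemming vs. lemmatization on a small example (a sketch; exact outputs
# can vary with the NLTK version):
print(stem(['studies', 'studying']))   # Porter collapses both to 'studi'
print(lemma(['studies', 'studying']))  # WordNet (noun POS) gives 'study', 'studying'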
# not strictly needed for sentiment analysis
def calculate_description_embedding(words):
    '''
    Takes in a clean list of words, looks up the word embedding for each
    word, and returns the average word embedding for the list.
    '''
    # drop any words without a known embedding
    words = [w for w in words if w in words_with_embeddings]
    if len(words) == 0:
        return None
    # average the embeddings and return
    return sum(embeddings[w] for w in words) / len(words)
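
# Possible use: compare two descriptions by the cosine similarity of their
# average embeddings (a hedged sketch; words missing from top_1000.txt are
# dropped inside the function, and None is returned if none remain).
emb_a = calculate_description_embedding(clean("a good school"))
emb_b = calculate_description_embedding(clean("a great university"))
if emb_a is not None and emb_b is not None:
    print(1 - spatial.distance.cosine(emb_a, emb_b))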
def get_sentiment(polarity):
    '''
    Maps a VADER compound polarity score to a sentiment label.
    '''
    if polarity < -0.5:
        return 'Very Negative'
    elif -0.5 <= polarity < -0.1:
        return 'Negative'
    elif 0.1 < polarity < 0.5:
        return 'Positive'
    elif polarity >= 0.5:
        return 'Very Positive'
    else:
        return 'Neutral'
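
# Quick check of the thresholds against VADER itself (the exact compound
# value depends on the lexicon version, so the label is illustrative):
score = analyzer.polarity_scores("I love this school")['compound']
print(score, get_sentiment(score))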
def graph_sentiment(text):
    '''
    Assumes text is a list of words that has already been cleaned and
    lemmatized; scores each word with VADER and plots a bar chart of the
    resulting sentiment labels.
    '''
    df = pd.DataFrame()
    df['polarity'] = [analyzer.polarity_scores(word)['compound'] for word in text]
    df['sentiment'] = df.polarity.apply(get_sentiment)
    plt.figure(figsize=(3, 3))
    df.sentiment.value_counts().plot.bar()
# trying it out with UCLA!
# download the UCLA Reddit data
!wget https://raw.githubusercontent.com/p-ai-org/p-colleges/main/Brian/Brian%20Reddit/ucla.txt

# read the file as a single string of words
with open("ucla.txt") as file:
    ucla = file.read().replace('\n', ' ')

graph_sentiment(lemma(clean(ucla)))
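
# Optional: print the raw label counts alongside the bar chart (an added
# convenience, not part of the original notebook):
word_polarities = [analyzer.polarity_scores(w)['compound'] for w in lemma(clean(ucla))]
print(pd.Series([get_sentiment(p) for p in word_polarities]).value_counts())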