# -*- coding: utf-8 -*-
"""part1 2_21.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Me67PabGSdyCNiTc3xXtXBE37rNuj98u
"""
import pandas as pd
import numpy as np
from scipy import spatial
import string
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords

# stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# lemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
# download pretrained GloVe word embeddings (Colab shell commands)
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip

# keep only the first 1000 vectors; GloVe files list the most frequent words first
!head -n 1000 glove.42B.300d.txt > top_1000.txt
# parse the GloVe text format: each line is a word followed by 300 floats
embeddings = {}
with open('top_1000.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
words_with_embeddings = set(embeddings)
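
# Quick sanity check of the loaded vectors (a minimal sketch: it assumes
# 'good' and 'great' both fall within the first 1000 GloVe entries, which
# hold the highest-frequency words).
if 'good' in embeddings and 'great' in embeddings:
    sim = 1 - spatial.distance.cosine(embeddings['good'], embeddings['great'])
    print(f"cosine similarity(good, great) = {sim:.3f}")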
# imports for sentiment analysis; make sure the package is installed
# by running the following
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# needed for plotting
import matplotlib.pyplot as plt
def clean(text):
    '''
    Cleans text by removing emojis, punctuation, and other non-alphanumeric
    characters, lower-casing, splitting into a list of words, dropping words
    that contain digits or special characters, and removing English stop words.
    '''
    # remove emojis (common emoji Unicode ranges)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]', '', text)
    # remove any remaining non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # strip punctuation (redundant after the regex above, but harmless)
    text = text.translate(str.maketrans('', '', string.punctuation))
    description = text.lower().strip()
    words = description.split(" ")
    stops = set(stopwords.words('english'))
    # filter out words containing digits or leftover special characters
    new_list = [x for x in words if not any(c.isdigit() or c in ['+', '-', '/', '"', "'"] for c in x)]
    # remove any empty strings
    new_list = list(filter(None, new_list))
    # filter out stop words
    new_list = [w for w in new_list if w not in stops]
    return new_list
# clean(mission_statement)  # NOTE: `mission_statement` is not defined in this file
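
# Illustrative check of clean() on a made-up sentence (not from the
# original notebook):
print(clean("UCLA's mission is education, research and 123 service!"))
# expected: ['uclas', 'mission', 'education', 'research', 'service']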
def stem(words):
    '''
    Takes in a clean list of words and uses the Porter stemmer to return a
    list of stems without duplicates.
    '''
    new_list = []
    for w in words:
        x = ps.stem(w)
        if x not in new_list:
            new_list.append(x)
    return new_list
def lemma(words):
    '''
    Takes in a clean list of words and uses the WordNet lemmatizer to return
    a list of lemmas without duplicates.
    '''
    new_list = []
    for w in words:
        x = lemmatizer.lemmatize(w)
        if x not in new_list:
            new_list.append(x)
    return new_list
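
# Stemming vs. lemmatization on a small example (a sketch; exact outputs
# can vary with the NLTK version):
print(stem(['studies', 'studying']))   # Porter collapses both to 'studi'
print(lemma(['studies', 'studying']))  # WordNet (noun POS) gives 'study', 'studying'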
# not strictly needed for sentiment analysis
def calculate_description_embedding(words):
    '''
    Takes in a clean list of words, looks up the word embedding for each
    word, and returns the average word embedding for the list.
    '''
    # drop any words without a known embedding
    words = [w for w in words if w in words_with_embeddings]
    if len(words) == 0:
        return None
    # average the embeddings and return
    return sum(embeddings[w] for w in words) / len(words)
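
# Possible use: compare two descriptions by the cosine similarity of their
# average embeddings (a hedged sketch; words missing from top_1000.txt are
# dropped inside the function, and None is returned if none remain).
emb_a = calculate_description_embedding(clean("a good school"))
emb_b = calculate_description_embedding(clean("a great university"))
if emb_a is not None and emb_b is not None:
    print(1 - spatial.distance.cosine(emb_a, emb_b))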
def get_sentiment(polarity):
    '''
    Maps a VADER compound polarity score to a sentiment label.
    '''
    if polarity < -0.5:
        return 'Very Negative'
    elif -0.5 <= polarity < -0.1:
        return 'Negative'
    elif 0.1 < polarity < 0.5:
        return 'Positive'
    elif polarity >= 0.5:
        return 'Very Positive'
    else:
        return 'Neutral'
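
# Quick check of the thresholds against VADER itself (the exact compound
# value depends on the lexicon version, so the label is illustrative):
score = analyzer.polarity_scores("I love this school")['compound']
print(score, get_sentiment(score))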
def graph_sentiment(text):
    '''
    Assumes text is a list of words that has already been cleaned and
    lemmatized; scores each word with VADER and plots a bar chart of the
    resulting sentiment labels.
    '''
    df = pd.DataFrame()
    df['polarity'] = [analyzer.polarity_scores(word)['compound'] for word in text]
    df['sentiment'] = df.polarity.apply(get_sentiment)
    plt.figure(figsize=(3, 3))
    df.sentiment.value_counts().plot.bar()
# trying it out with UCLA!
# download the UCLA Reddit data
!wget https://raw.githubusercontent.com/p-ai-org/p-colleges/main/Brian/Brian%20Reddit/ucla.txt

# read the file as a single string of words
with open("ucla.txt") as file:
    ucla = file.read().replace('\n', ' ')

graph_sentiment(lemma(clean(ucla)))
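
# Optional: print the raw label counts alongside the bar chart (an added
# convenience, not part of the original notebook):
word_polarities = [analyzer.polarity_scores(w)['compound'] for w in lemma(clean(ucla))]
print(pd.Series([get_sentiment(p) for p in word_polarities]).value_counts())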