TheRensselaerIDEA · shwehtom89 · Dec 7, 2020 · Dec 7, 2020 · Dec 16, 2020
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,6 @@ twitter_monitor/tmlog.txt
 embedder/embedderlog.txt
 twitter_monitor/jdllog.txt
 sentiment/sentimentlog.txt
+sentiment/semeval*
+sentiment/sentiment.pt
 analysis/snapshots/*.Rdata
diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py
@@ -0,0 +1,65 @@
+import torch
+import requests
+from transformers import AutoTokenizer, AutoConfig, BertForSequenceClassification
+from torch.nn.functional import softmax
+from typing import List
+
+# global constants
+MODEL_NAME = 'digitalepidemiologylab/covid-twitter-bert-v2'
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+mapping = { 0: 'negative', 1: 'neutral', 2: 'positive' }
+
+class BertSentiment():
+	"""
+	Initializes a bert model used for evaluation
+	@param path: local relative path of the bert model
+	@param remote: defaults to empty, if specified will download model from url
+	""" 
+	def __init__(self, path: str, config: str, remote: str=""):
+		if len(remote) != 0:
+			self.download(remote)
+		self.tokenizer = tokenizer
+		self.load(path, config)
+
+	"""
+	Downloads bert model from remote
+	@param remote: url location of bert model
+	@param dest: destination path where model will be downloaded to
+	"""
+	def download(self, remote: str, dest: str) -> str:
+		try:
+			res = requests.get(remote, allow_redirects=True)
+			with open(dest, "wb") as f:
+				f.write(res.content)
+			return dest
+		except:
+			print("Could not download model")
+			return None
+
+	"""
+	Loads pytorch model in for inference
+	@param path: local path to the bert model
+        @param config: local path to bert config
+	"""
+	def load(self, path:str, config:str):
+		self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+		self.config = AutoConfig.from_pretrained(config)
+		self.model = BertForSequenceClassification(self.config)
+		self.model.load_state_dict(torch.load(path, self.device))
+		self.model.to(self.device)
+		self.model.eval()
+
+	"""
+	Takes in a tweet and calculates a sentiment prediction confidences
+	"""
+	def score(self, text):
+		encodings = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35)
+		inputs = encodings["input_ids"].to(self.device)
+		with torch.no_grad():
+			logits = self.model(inputs, labels=None)[0]
+		preds = softmax(logits.cpu(), dim=1)
+		infer = torch.argmax(preds, dim=1)
+		sentiment = [mapping[p.item()] for p in infer]
+		infer = infer - 1
+		return preds.tolist(), sentiment, infer.tolist()
+
diff --git a/sentiment/bert_sentiment.py b/sentiment/bert_sentiment.py
@@ -10,7 +10,7 @@
 import numpy as np
 import pandas as pd
 import torch
-import torch.nn as nna
+import torch.nn as nn
 import pickle
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report
@@ -188,7 +188,7 @@ def format_time(elapsed):
 """
 
 # declare a model, export to GPU if available and set to training
-device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
 
 model = AutoModelForSequenceClassification.from_pretrained(
     "digitalepidemiologylab/covid-twitter-bert-v2",
@@ -235,7 +235,8 @@ def format_time(elapsed):
     labels = batch['labels'].to(device)
 
     # feed the batch of data into the model
-    loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)
+    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+    loss, logits = outputs[0], outputs[1]
     total_train_loss += loss.item()
 
     # clip the norm of the gradients to 1
@@ -269,7 +270,8 @@ def format_time(elapsed):
 
     # feed tata into model without training
     with torch.no_grad():
-      loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+      outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+      loss, logits = outputs[0], outputs[1]
     total_eval_loss += loss.item()
 
     logits = logits.detach().cpu().numpy()
@@ -295,10 +297,7 @@ def format_time(elapsed):
   print("  Validation Loss: {0:.2f}".format(avg_val_loss))
   print("  Validation took: {:}".format(validation_time))
 
-torch.save(model, './sentiment.pt')
-with open('training_data.pickle', 'wb') as f:
-  pickle.dump({'accuracy': accuracies, 'training_loss': training_losses, 'val_loss': val_losses}, f)
-
+torch.save(model.state_dict(), './bert_sentiment.pt')
 
 """
 upload model to dropbox in case runtime disconects
@@ -351,7 +350,8 @@ def largeUpload(file_path, dest_path):
   b_labels = batch['labels'].to(device)
 
   with torch.no_grad():
-    loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+    loss, logits = outputs[0], outputs[1]
   loss_data = np.append(loss_data, loss.item())
 
   logits = logits.detach().cpu().numpy()

diff --git a/sentiment/config.json b/sentiment/config.json
@@ -2,13 +2,15 @@
     "py/object": "config.Config",
     "elasticsearch_host": "localhost",
     "elasticsearch_verify_certs": false,
-    "elasticsearch_index_name": "coronavirus-data2",
-    "elasticsearch_batch_size": 500,
+    "elasticsearch_index_name": "coronavirus-data-masks",
+    "elasticsearch_batch_size": 1000,
     "elasticsearch_timeout_secs": 30,
     "sleep_idle_secs": 5,
     "sleep_not_idle_secs": 0.01,
-    "log_level": "WARNING",
+    "log_level": "INFO",
     "semeval_dataset" : "semeval",
     "semeval_url" : "https://www.dropbox.com/s/byzr8yoda6bua1b/2017_English_final.zip?dl=1",
-    "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+    "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45],
+    "model_path" : "bert_sentiment.pt",
+    "model_config" : "bert_config.config"
 }
diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py
@@ -2,6 +2,7 @@
 import sentiment_helpers
 import time
 import logging
+from bert_eval import BertSentiment
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
@@ -31,8 +32,10 @@
 #Load vader sentiment intensity analyzer
 vader = SentimentIntensityAnalyzer()
 
+bert = BertSentiment(config.model_path, config.model_config)
+
 #Initialize elasticsearch settings
-es = Elasticsearch(hosts=[config.elasticsearch_host], 
+es = Elasticsearch(hosts=[config.elasticsearch_host],
                    verify_certs=config.elasticsearch_verify_certs,
                    timeout=config.elasticsearch_timeout_secs)
 
@@ -56,30 +59,49 @@
             continue
 
         #Run sentiment analysis on the batch
-        logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader...".format(len(hits)))
-        updates = []
-        for hit in hits:
-            text, quoted_text = sentiment_helpers.get_tweet_text(hit)
-            text = sentiment_helpers.clean_text_for_vader(text)
+        logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits)))
+        texts, quoted_texts = zip(*map(sentiment_helpers.get_tweet_text, hits))
+        texts = list(map(sentiment_helpers.clean_text_for_vader, texts))
+        quoted = {index: quote for index, quote in map(lambda x: (x[0], sentiment_helpers.clean_text_for_vader(x[1])), filter(lambda y: y[1] is not None, enumerate(quoted_texts)))}
+        quoted_concat = {index: "{0} {1}".format(quote, texts[index]) for index, quote in quoted.items()}
+
+        # use bert to batch score text inputs
+        bert_texts = bert.score(texts)
+        bert_quoted = bert.score(list(quoted.values()))
+        bert_concat = bert.score(list(quoted_concat.values()))
+        bert_quoted = {key: val for key, val in zip(quoted.keys(), zip(*bert_quoted))}
+        bert_concat = {key: val for key, val in zip(quoted_concat.keys(), zip(*bert_concat))}
+
+        def create_update(hit):
             action = {
                 "_op_type": "update",
-                "_id": hit.meta["id"],
+                "_id": hit[1].meta["id"],
                 "doc": {
                     "sentiment": {
                         "vader": {
-                            "primary": vader.polarity_scores(text)["compound"]
-                        }
+                            "primary": vader.polarity_scores(texts[hit[0]])["compound"]
+                        },
+			"bert" : {
+                            "scores": bert_texts[0][hit[0]],
+                            "class": bert_texts[1][hit[0]],
+                            "primary": bert_texts[2][hit[0]]
+			}
                     }
                 }
             }
-            if quoted_text is not None:
-                quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text)
-                quoted_concat_text = "{0} {1}".format(quoted_text, text)
-                action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"]
-                action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"]
-
-            updates.append(action)
+            if hit[0] in quoted:
+                action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_texts[hit[0]])["compound"]
+                action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat[hit[0]])["compound"]
+                action["doc"]["sentiment"]["bert"]["quoted"] = bert_quoted[hit[0]][2]
+                action["doc"]["sentiment"]["bert"]["quoted_scores"] = bert_quoted[hit[0]][0]
+                action["doc"]["sentiment"]["bert"]["quoted_class"] = bert_quoted[hit[0]][1]
+                action["doc"]["sentiment"]["bert"]["quoted_concat"] = bert_concat[hit[0]][2]
+                action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = bert_concat[hit[0]][0]
+                action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = bert_concat[hit[0]][1]
+            return action
 
+        updates = list(map(create_update, enumerate(hits)))
+
         #Issue the bulk update request
         logging.info("Making bulk request to Elasticsearch with {0} update actions...".format(len(updates)))
         bulk(es, updates, index=config.elasticsearch_index_name, chunk_size=len(updates))
@@ -89,4 +111,4 @@
         time.sleep(config.sleep_not_idle_secs)
 
     except Exception as ex:
-        logging.exception("Exception occurred while polling or processing a batch.")
+        logging.exception("Exception occurred while polling or processing a batch.")
diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py
@@ -15,11 +15,26 @@ def get_query():
         "filter": [
             {
             "bool": {
-                "must_not": {
-                "exists": {
-                    "field": "sentiment.vader.primary"
+                "should": [{
+                   "bool": {
+                   "must_not": {
+                       "exists": {
+                           "field": "sentiment.vader.primary"
+                       }
+                    }
                   }
-                }
+                 },
+                 {
+                   "bool": {
+                   "must_not": {
+                      "exists": {
+                          "field": "sentiment.bert.primary"
+                      }
+                    }
+                  }
+                 }
+                ],
+                "minimum_should_match" : 1 
               }
             },
             {
@@ -55,4 +70,4 @@ def clean_text_for_vader(text):
   text = re.sub(r"http\S+", "", text)
   text = re.sub(r" +", " ", text)
   text = text.strip()
-  return text
+  return text