From 546b32bf3b14f98c7fb77a318ea0f62ca3eca6b4 Mon Sep 17 00:00:00 2001
From: shweht <shweht@rpi.edu>
Date: Sun, 6 Dec 2020 20:43:28 -0500
Subject: [PATCH 1/3] use bert to score tweets in elasticsearch

---
 .gitignore                     |  2 ++
 sentiment/bert_eval.py         | 60 ++++++++++++++++++++++++++++++++++
 sentiment/sentiment.py         | 21 ++++++++++--
 sentiment/sentiment_helpers.py | 43 +++++++++++++++++++++++-
 4 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 sentiment/bert_eval.py

diff --git a/.gitignore b/.gitignore
index ee7b9c0..790b3f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,6 @@ twitter_monitor/tmlog.txt
 embedder/embedderlog.txt
 twitter_monitor/jdllog.txt
 sentiment/sentimentlog.txt
+sentiment/semeval*
+sentiment/sentiment.pt
 analysis/snapshots/*.Rdata
diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py
new file mode 100644
index 0000000..d13951e
--- /dev/null
+++ b/sentiment/bert_eval.py
@@ -0,0 +1,60 @@
+import torch
+import requests
+from transformers import AutoTokenizer
+from torch.nn.functional import softmax
+from typing import List
+
+# global constants
+MODEL_NAME = 'digitalepidemiologylab/covid-twitter-bert-v2'
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+mapping = { 0: 'negative', 1: 'neutral', 2: 'positive' }
+
+class BertSentiment():
+	"""
+	Initializes a bert model used for evaluation
+	@param path: local relative path of the bert model
+	@param remote: defaults to empty, if specified will download model from url
+	""" 
+	def __init__(self, path: str, remote: str=""):
+		if len(remote) != 0:
+			self.download(remote)
+		self.tokenizer = tokenizer
+		self.load(path)
+	
+	"""
+	Downloads bert model from remote
+	@param remote: url location of bert model
+	@param dest: destination path where model will be downloaded to
+	"""
+	def download(self, remote: str, dest: str) -> str:
+		try:
+			res = requests.get(remote, allow_redirects=True)
+			with open(dest, "wb") as f:
+				f.write(res.content)
+			return dest
+		except:
+			print("Could not download model")
+			return None
+
+	"""
+	Loads pytorch model in for inference
+	@patam path: local path to the bert model
+	"""
+	def load(self, path:str):
+		self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+		self.model = torch.load(path)
+		self.model.to(self.device)
+		self.model.eval()
+	
+	"""
+	Takes in a tweet and calculates a sentiment prediction confidences
+	"""
+	def score(self, text):
+		encoding = self.tokenizer(text, return_tensors="pt", padding=True)
+		inputs = encoding["input_ids"].to(self.device)
+		logits = self.model(inputs, labels=None)[0]
+		temp = torch.flatten(logits.cpu())
+		preds = softmax(temp, dim=0)
+		sentiment = mapping[torch.argmax(preds).item()]
+		return preds.tolist(), sentiment
+
diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py
index 6cb9f9f..e8a4d89 100644
--- a/sentiment/sentiment.py
+++ b/sentiment/sentiment.py
@@ -2,6 +2,7 @@
 import sentiment_helpers
 import time
 import logging
+from bert_eval import BertSentiment
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
@@ -31,7 +32,10 @@
 #Load vader sentiment intensity analyzer
 vader = SentimentIntensityAnalyzer()
 
+bert = BertSentiment(config.model_path)
+
 #Initialize elasticsearch settings
+print(config.elasticsearch_verify_certs)
 es = Elasticsearch(hosts=[config.elasticsearch_host], 
                    verify_certs=config.elasticsearch_verify_certs,
                    timeout=config.elasticsearch_timeout_secs)
@@ -56,11 +60,12 @@
             continue
 
         #Run sentiment analysis on the batch
-        logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader...".format(len(hits)))
+        logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits)))
         updates = []
         for hit in hits:
             text, quoted_text = sentiment_helpers.get_tweet_text(hit)
             text = sentiment_helpers.clean_text_for_vader(text)
+            scores, result = bert.score(text)
             action = {
                 "_op_type": "update",
                 "_id": hit.meta["id"],
@@ -68,15 +73,25 @@
                     "sentiment": {
                         "vader": {
                             "primary": vader.polarity_scores(text)["compound"]
-                        }
+                        },
+			"bert" : {
+                            "scores": scores,
+                            "class": result
+			}
                     }
                 }
             }
             if quoted_text is not None:
                 quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text)
                 quoted_concat_text = "{0} {1}".format(quoted_text, text)
+                quoted_scores, quoted_class = bert.score(quoted_text)
+                quoted_concat_scores, quoted_concat_class = bert.score(quoted_concat_text)
                 action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"]
                 action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"]
+                action["doc"]["sentiment"]["bert"]["quoted_scores"] = quoted_scores
+                action["doc"]["sentiment"]["bert"]["quoted_class"] = quoted_class
+                action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = quoted_concat_scores
+                action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = quoted_concat_class
 
             updates.append(action)
 
@@ -89,4 +104,4 @@
         time.sleep(config.sleep_not_idle_secs)
 
     except Exception as ex:
-        logging.exception("Exception occurred while polling or processing a batch.")
\ No newline at end of file
+        logging.exception("Exception occurred while polling or processing a batch.")
diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py
index 6870d77..bdebd2b 100644
--- a/sentiment/sentiment_helpers.py
+++ b/sentiment/sentiment_helpers.py
@@ -1,6 +1,47 @@
 import re
 
 def get_query():
+#    query = {
+#    "_source": [
+#        "text",
+#        "full_text",
+#        "extended_tweet.full_text",
+#        "quoted_status.text",
+#        "quoted_status.full_text",
+#        "quoted_status.extended_tweet.full_text"
+#    ],
+#    "query": {
+#        "bool": {
+#        "filter": [
+#            {
+#            "bool": {
+#                "must_not": [
+#                   {
+#                   "exists": {
+#                       "field": "sentiment.vader.primary"
+#                     }
+#                   },
+#                   {
+#                   "exists": {
+#                       "field": "sentiment.bert.scores"
+#                     }
+#                   }
+#                ]
+#              }
+#            },
+#            {
+#            "bool": {
+#                "must_not": {
+#                "exists": {
+#                    "field": "retweeted_status.id"
+#                  }
+#                }
+#              }
+#            }
+#          ]
+#        }
+#      }
+#    }
     query = {
     "_source": [
         "text",
@@ -55,4 +96,4 @@ def clean_text_for_vader(text):
   text = re.sub(r"http\S+", "", text)
   text = re.sub(r" +", " ", text)
   text = text.strip()
-  return text
\ No newline at end of file
+  return text

From a4b49ee3ccf1aeec129cd5dd2d3b4f40b251b203 Mon Sep 17 00:00:00 2001
From: shweht <shweht@rpi.edu>
Date: Sun, 6 Dec 2020 23:59:32 -0500
Subject: [PATCH 2/3] make truncation fix

---
 sentiment/bert_eval.py         |  2 +-
 sentiment/sentiment.py         |  3 +-
 sentiment/sentiment_helpers.py | 64 ++++++++++------------------------
 3 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py
index d13951e..3c3960a 100644
--- a/sentiment/bert_eval.py
+++ b/sentiment/bert_eval.py
@@ -50,7 +50,7 @@ def load(self, path:str):
 	Takes in a tweet and calculates a sentiment prediction confidences
 	"""
 	def score(self, text):
-		encoding = self.tokenizer(text, return_tensors="pt", padding=True)
+		encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35)
 		inputs = encoding["input_ids"].to(self.device)
 		logits = self.model(inputs, labels=None)[0]
 		temp = torch.flatten(logits.cpu())
diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py
index e8a4d89..ba65127 100644
--- a/sentiment/sentiment.py
+++ b/sentiment/sentiment.py
@@ -35,8 +35,7 @@
 bert = BertSentiment(config.model_path)
 
 #Initialize elasticsearch settings
-print(config.elasticsearch_verify_certs)
-es = Elasticsearch(hosts=[config.elasticsearch_host], 
+es = Elasticsearch(hosts=[config.elasticsearch_host],
                    verify_certs=config.elasticsearch_verify_certs,
                    timeout=config.elasticsearch_timeout_secs)
 
diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py
index bdebd2b..219aeef 100644
--- a/sentiment/sentiment_helpers.py
+++ b/sentiment/sentiment_helpers.py
@@ -1,47 +1,6 @@
 import re
 
 def get_query():
-#    query = {
-#    "_source": [
-#        "text",
-#        "full_text",
-#        "extended_tweet.full_text",
-#        "quoted_status.text",
-#        "quoted_status.full_text",
-#        "quoted_status.extended_tweet.full_text"
-#    ],
-#    "query": {
-#        "bool": {
-#        "filter": [
-#            {
-#            "bool": {
-#                "must_not": [
-#                   {
-#                   "exists": {
-#                       "field": "sentiment.vader.primary"
-#                     }
-#                   },
-#                   {
-#                   "exists": {
-#                       "field": "sentiment.bert.scores"
-#                     }
-#                   }
-#                ]
-#              }
-#            },
-#            {
-#            "bool": {
-#                "must_not": {
-#                "exists": {
-#                    "field": "retweeted_status.id"
-#                  }
-#                }
-#              }
-#            }
-#          ]
-#        }
-#      }
-#    }
     query = {
     "_source": [
         "text",
@@ -56,11 +15,26 @@ def get_query():
         "filter": [
             {
             "bool": {
-                "must_not": {
-                "exists": {
-                    "field": "sentiment.vader.primary"
+                "should": [{
+                   "bool": {
+                   "must_not": {
+                       "exists": {
+                           "field": "sentiment.vader.primary"
+                       }
+                    }
                   }
-                }
+                 },
+                 {
+                   "bool": {
+                   "must_not": {
+                      "exists": {
+                          "field": "sentiment.bert.class"
+                      }
+                    }
+                  }
+                 }
+                ],
+                "minimum_should_match" : 1 
               }
             },
             {

From d95de62b2143453d9d11a7eb2432187c03a3fade Mon Sep 17 00:00:00 2001
From: shweht <shweht@rpi.edu>
Date: Tue, 15 Dec 2020 21:23:52 -0500
Subject: [PATCH 3/3] fix up bert model and bert update script

---
 sentiment/bert_eval.py         | 33 ++++++++++++---------
 sentiment/bert_sentiment.py    | 18 ++++++------
 sentiment/config.json          | 10 ++++---
 sentiment/sentiment.py         | 54 +++++++++++++++++++---------------
 sentiment/sentiment_helpers.py |  2 +-
 5 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py
index 3c3960a..225fdb2 100644
--- a/sentiment/bert_eval.py
+++ b/sentiment/bert_eval.py
@@ -1,6 +1,6 @@
 import torch
 import requests
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoConfig, BertForSequenceClassification
 from torch.nn.functional import softmax
 from typing import List
 
@@ -15,11 +15,11 @@ class BertSentiment():
 	@param path: local relative path of the bert model
 	@param remote: defaults to empty, if specified will download model from url
 	""" 
-	def __init__(self, path: str, remote: str=""):
+	def __init__(self, path: str, config: str, remote: str=""):
 		if len(remote) != 0:
 			self.download(remote)
 		self.tokenizer = tokenizer
-		self.load(path)
+		self.load(path, config)
 	
 	"""
 	Downloads bert model from remote
@@ -38,11 +38,14 @@ def download(self, remote: str, dest: str) -> str:
 
 	"""
 	Loads pytorch model in for inference
-	@patam path: local path to the bert model
+	@param path: local path to the bert model
+        @param config: local path to bert config
 	"""
-	def load(self, path:str):
-		self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-		self.model = torch.load(path)
+	def load(self, path:str, config:str):
+		self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+		self.config = AutoConfig.from_pretrained(config)
+		self.model = BertForSequenceClassification(self.config)
+		self.model.load_state_dict(torch.load(path, self.device))
 		self.model.to(self.device)
 		self.model.eval()
 	
@@ -50,11 +53,13 @@ def load(self, path:str):
 	Takes in a tweet and calculates a sentiment prediction confidences
 	"""
 	def score(self, text):
-		encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35)
-		inputs = encoding["input_ids"].to(self.device)
-		logits = self.model(inputs, labels=None)[0]
-		temp = torch.flatten(logits.cpu())
-		preds = softmax(temp, dim=0)
-		sentiment = mapping[torch.argmax(preds).item()]
-		return preds.tolist(), sentiment
+		encodings = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35)
+		inputs = encodings["input_ids"].to(self.device)
+		with torch.no_grad():
+			logits = self.model(inputs, labels=None)[0]
+		preds = softmax(logits.cpu(), dim=1)
+		infer = torch.argmax(preds, dim=1)
+		sentiment = [mapping[p.item()] for p in infer]
+		infer = infer - 1
+		return preds.tolist(), sentiment, infer.tolist()
 
diff --git a/sentiment/bert_sentiment.py b/sentiment/bert_sentiment.py
index bcc6619..c934d04 100644
--- a/sentiment/bert_sentiment.py
+++ b/sentiment/bert_sentiment.py
@@ -10,7 +10,7 @@
 import numpy as np
 import pandas as pd
 import torch
-import torch.nn as nna
+import torch.nn as nn
 import pickle
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report
@@ -188,7 +188,7 @@ def format_time(elapsed):
 """
 
 # declare a model, export to GPU if available and set to training
-device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')
 
 model = AutoModelForSequenceClassification.from_pretrained(
     "digitalepidemiologylab/covid-twitter-bert-v2",
@@ -235,7 +235,8 @@ def format_time(elapsed):
     labels = batch['labels'].to(device)
 
     # feed the batch of data into the model
-    loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)
+    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+    loss, logits = outputs[0], outputs[1]
     total_train_loss += loss.item()
 
     # clip the norm of the gradients to 1
@@ -269,7 +270,8 @@ def format_time(elapsed):
 
     # feed tata into model without training
     with torch.no_grad():
-      loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+      outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+      loss, logits = outputs[0], outputs[1]
     total_eval_loss += loss.item()
     
     logits = logits.detach().cpu().numpy()
@@ -295,10 +297,7 @@ def format_time(elapsed):
   print("  Validation Loss: {0:.2f}".format(avg_val_loss))
   print("  Validation took: {:}".format(validation_time))
 
-torch.save(model, './sentiment.pt')
-with open('training_data.pickle', 'wb') as f:
-  pickle.dump({'accuracy': accuracies, 'training_loss': training_losses, 'val_loss': val_losses}, f)
-
+torch.save(model.state_dict(), './bert_sentiment.pt')
 
 """
 upload model to dropbox in case runtime disconects
@@ -351,7 +350,8 @@ def largeUpload(file_path, dest_path):
   b_labels = batch['labels'].to(device)
 
   with torch.no_grad():
-    loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+    loss, logits = outputs[0], outputs[1]
   loss_data = np.append(loss_data, loss.item())
     
   logits = logits.detach().cpu().numpy()
diff --git a/sentiment/config.json b/sentiment/config.json
index d502dd5..178bd11 100644
--- a/sentiment/config.json
+++ b/sentiment/config.json
@@ -2,13 +2,15 @@
     "py/object": "config.Config",
     "elasticsearch_host": "localhost",
     "elasticsearch_verify_certs": false,
-    "elasticsearch_index_name": "coronavirus-data2",
-    "elasticsearch_batch_size": 500,
+    "elasticsearch_index_name": "coronavirus-data-masks",
+    "elasticsearch_batch_size": 1000,
     "elasticsearch_timeout_secs": 30,
     "sleep_idle_secs": 5,
     "sleep_not_idle_secs": 0.01,
-    "log_level": "WARNING",
+    "log_level": "INFO",
     "semeval_dataset" : "semeval",
     "semeval_url" : "https://www.dropbox.com/s/byzr8yoda6bua1b/2017_English_final.zip?dl=1",
-    "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+    "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45],
+    "model_path" : "bert_sentiment.pt",
+    "model_config" : "bert_config.config"
 }
diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py
index ba65127..31c6c42 100644
--- a/sentiment/sentiment.py
+++ b/sentiment/sentiment.py
@@ -32,7 +32,7 @@
 #Load vader sentiment intensity analyzer
 vader = SentimentIntensityAnalyzer()
 
-bert = BertSentiment(config.model_path)
+bert = BertSentiment(config.model_path, config.model_config)
 
 #Initialize elasticsearch settings
 es = Elasticsearch(hosts=[config.elasticsearch_host],
@@ -60,40 +60,48 @@
 
         #Run sentiment analysis on the batch
         logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits)))
-        updates = []
-        for hit in hits:
-            text, quoted_text = sentiment_helpers.get_tweet_text(hit)
-            text = sentiment_helpers.clean_text_for_vader(text)
-            scores, result = bert.score(text)
+        texts, quoted_texts = zip(*map(sentiment_helpers.get_tweet_text, hits))
+        texts = list(map(sentiment_helpers.clean_text_for_vader, texts))
+        quoted = {index: quote for index, quote in map(lambda x: (x[0], sentiment_helpers.clean_text_for_vader(x[1])), filter(lambda y: y[1] is not None, enumerate(quoted_texts)))}
+        quoted_concat = {index: "{0} {1}".format(quote, texts[index]) for index, quote in quoted.items()}
+        
+        # use bert to batch score text inputs
+        bert_texts = bert.score(texts)
+        bert_quoted = bert.score(list(quoted.values()))
+        bert_concat = bert.score(list(quoted_concat.values()))
+        bert_quoted = {key: val for key, val in zip(quoted.keys(), zip(*bert_quoted))}
+        bert_concat = {key: val for key, val in zip(quoted_concat.keys(), zip(*bert_concat))}
+        
+        def create_update(hit):
             action = {
                 "_op_type": "update",
-                "_id": hit.meta["id"],
+                "_id": hit[1].meta["id"],
                 "doc": {
                     "sentiment": {
                         "vader": {
-                            "primary": vader.polarity_scores(text)["compound"]
+                            "primary": vader.polarity_scores(texts[hit[0]])["compound"]
                         },
 			"bert" : {
-                            "scores": scores,
-                            "class": result
+                            "scores": bert_texts[0][hit[0]],
+                            "class": bert_texts[1][hit[0]],
+                            "primary": bert_texts[2][hit[0]]
 			}
                     }
                 }
             }
-            if quoted_text is not None:
-                quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text)
-                quoted_concat_text = "{0} {1}".format(quoted_text, text)
-                quoted_scores, quoted_class = bert.score(quoted_text)
-                quoted_concat_scores, quoted_concat_class = bert.score(quoted_concat_text)
-                action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"]
-                action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"]
-                action["doc"]["sentiment"]["bert"]["quoted_scores"] = quoted_scores
-                action["doc"]["sentiment"]["bert"]["quoted_class"] = quoted_class
-                action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = quoted_concat_scores
-                action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = quoted_concat_class
-
-            updates.append(action)
+            if hit[0] in quoted:
+                action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_texts[hit[0]])["compound"]
+                action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat[hit[0]])["compound"]
+                action["doc"]["sentiment"]["bert"]["quoted"] = bert_quoted[hit[0]][2]
+                action["doc"]["sentiment"]["bert"]["quoted_scores"] = bert_quoted[hit[0]][0]
+                action["doc"]["sentiment"]["bert"]["quoted_class"] = bert_quoted[hit[0]][1]
+                action["doc"]["sentiment"]["bert"]["quoted_concat"] = bert_concat[hit[0]][2]
+                action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = bert_concat[hit[0]][0]
+                action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = bert_concat[hit[0]][1]
+            return action
 
+        updates = list(map(create_update, enumerate(hits)))
+        
         #Issue the bulk update request
         logging.info("Making bulk request to Elasticsearch with {0} update actions...".format(len(updates)))
         bulk(es, updates, index=config.elasticsearch_index_name, chunk_size=len(updates))
diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py
index 219aeef..c635267 100644
--- a/sentiment/sentiment_helpers.py
+++ b/sentiment/sentiment_helpers.py
@@ -28,7 +28,7 @@ def get_query():
                    "bool": {
                    "must_not": {
                       "exists": {
-                          "field": "sentiment.bert.class"
+                          "field": "sentiment.bert.primary"
                       }
                     }
                   }