From 546b32bf3b14f98c7fb77a318ea0f62ca3eca6b4 Mon Sep 17 00:00:00 2001 From: shweht Date: Sun, 6 Dec 2020 20:43:28 -0500 Subject: [PATCH 1/3] use bert to score tweets in elasticsearch --- .gitignore | 2 ++ sentiment/bert_eval.py | 60 ++++++++++++++++++++++++++++++++++ sentiment/sentiment.py | 21 ++++++++++-- sentiment/sentiment_helpers.py | 43 +++++++++++++++++++++++- 4 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 sentiment/bert_eval.py diff --git a/.gitignore b/.gitignore index ee7b9c0..790b3f7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ twitter_monitor/tmlog.txt embedder/embedderlog.txt twitter_monitor/jdllog.txt sentiment/sentimentlog.txt +sentiment/semeval* +sentiment/sentiment.pt analysis/snapshots/*.Rdata diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py new file mode 100644 index 0000000..d13951e --- /dev/null +++ b/sentiment/bert_eval.py @@ -0,0 +1,60 @@ +import torch +import requests +from transformers import AutoTokenizer +from torch.nn.functional import softmax +from typing import List + +# global constants +MODEL_NAME = 'digitalepidemiologylab/covid-twitter-bert-v2' +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +mapping = { 0: 'negative', 1: 'neutral', 2: 'positive' } + +class BertSentiment(): + """ + Initializes a bert model used for evaluation + @param path: local relative path of the bert model + @param remote: defaults to empty, if specified will download model from url + """ + def __init__(self, path: str, remote: str=""): + if len(remote) != 0: + self.download(remote) + self.tokenizer = tokenizer + self.load(path) + + """ + Downloads bert model from remote + @param remote: url location of bert model + @param dest: destination path where model will be downloaded to + """ + def download(self, remote: str, dest: str) -> str: + try: + res = requests.get(remote, allow_redirects=True) + with open(dest, "wb") as f: + f.write(res.content) + return dest + except: + print("Could not download model") + return None + + """ + Loads pytorch model in for inference + @patam path: local path to the bert model + """ + def load(self, path:str): + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + self.model = torch.load(path) + self.model.to(self.device) + self.model.eval() + + """ + Takes in a tweet and calculates a sentiment prediction confidences + """ + def score(self, text): + encoding = self.tokenizer(text, return_tensors="pt", padding=True) + inputs = encoding["input_ids"].to(self.device) + logits = self.model(inputs, labels=None)[0] + temp = torch.flatten(logits.cpu()) + preds = softmax(temp, dim=0) + sentiment = mapping[torch.argmax(preds).item()] + return preds.tolist(), sentiment + diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py index 6cb9f9f..e8a4d89 100644 --- a/sentiment/sentiment.py +++ b/sentiment/sentiment.py @@ -2,6 +2,7 @@ import sentiment_helpers import time import logging +from bert_eval import BertSentiment from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk @@ -31,7 +32,10 @@ #Load vader sentiment intensity analyzer vader = SentimentIntensityAnalyzer() +bert = BertSentiment(config.model_path) + #Initialize elasticsearch settings +print(config.elasticsearch_verify_certs) es = Elasticsearch(hosts=[config.elasticsearch_host], verify_certs=config.elasticsearch_verify_certs, timeout=config.elasticsearch_timeout_secs) @@ -56,11 +60,12 @@ continue #Run sentiment analysis on the batch - logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader...".format(len(hits))) + logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits))) updates = [] for hit in hits: text, quoted_text = sentiment_helpers.get_tweet_text(hit) text = sentiment_helpers.clean_text_for_vader(text) + scores, result = bert.score(text) action = { "_op_type": "update", "_id": hit.meta["id"], @@ -68,15 +73,25 @@ "sentiment": { "vader": { "primary": vader.polarity_scores(text)["compound"] - } + }, + "bert" : { + "scores": scores, + "class": result + } } } } if quoted_text is not None: quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text) quoted_concat_text = "{0} {1}".format(quoted_text, text) + quoted_scores, quoted_class = bert.score(quoted_text) + quoted_concat_scores, quoted_concat_class = bert.score(quoted_concat_text) action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"] action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"] + action["doc"]["sentiment"]["bert"]["quoted_scores"] = quoted_scores + action["doc"]["sentiment"]["bert"]["quoted_class"] = quoted_class + action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = quoted_concat_scores + action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = quoted_concat_class updates.append(action) @@ -89,4 +104,4 @@ time.sleep(config.sleep_not_idle_secs) except Exception as ex: - logging.exception("Exception occurred while polling or processing a batch.") \ No newline at end of file + logging.exception("Exception occurred while polling or processing a batch.") diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py index 6870d77..bdebd2b 100644 --- a/sentiment/sentiment_helpers.py +++ b/sentiment/sentiment_helpers.py @@ -1,6 +1,47 @@ import re def get_query(): +# query = { +# "_source": [ +# "text", +# "full_text", +# "extended_tweet.full_text", +# "quoted_status.text", +# "quoted_status.full_text", +# "quoted_status.extended_tweet.full_text" +# ], +# "query": { +# "bool": { +# "filter": [ +# { +# "bool": { +# "must_not": [ +# { +# "exists": { +# "field": "sentiment.vader.primary" +# } +# }, +# { +# "exists": { +# "field": "sentiment.bert.scores" +# } +# } +# ] +# } +# }, +# { +# "bool": { +# "must_not": { +# "exists": { +# "field": "retweeted_status.id" +# } +# } +# } +# } +# ] +# } +# } +# } query = { "_source": [ "text", @@ -55,4 +96,4 @@ def clean_text_for_vader(text): text = re.sub(r"http\S+", "", text) text = re.sub(r" +", " ", text) text = text.strip() - return text \ No newline at end of file + return text From a4b49ee3ccf1aeec129cd5dd2d3b4f40b251b203 Mon Sep 17 00:00:00 2001 From: shweht Date: Sun, 6 Dec 2020 23:59:32 -0500 Subject: [PATCH 2/3] make truncation fix --- sentiment/bert_eval.py | 2 +- sentiment/sentiment.py | 3 +- sentiment/sentiment_helpers.py | 64 ++++++++++------------------------ 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py index d13951e..3c3960a 100644 --- a/sentiment/bert_eval.py +++ b/sentiment/bert_eval.py @@ -50,7 +50,7 @@ def load(self, path:str): Takes in a tweet and calculates a sentiment prediction confidences """ def score(self, text): - encoding = self.tokenizer(text, return_tensors="pt", padding=True) + encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35) inputs = encoding["input_ids"].to(self.device) logits = self.model(inputs, labels=None)[0] temp = torch.flatten(logits.cpu()) diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py index e8a4d89..ba65127 100644 --- a/sentiment/sentiment.py +++ b/sentiment/sentiment.py @@ -35,8 +35,7 @@ bert = BertSentiment(config.model_path) #Initialize elasticsearch settings -print(config.elasticsearch_verify_certs) -es = Elasticsearch(hosts=[config.elasticsearch_host], +es = Elasticsearch(hosts=[config.elasticsearch_host], verify_certs=config.elasticsearch_verify_certs, timeout=config.elasticsearch_timeout_secs) diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py index bdebd2b..219aeef 100644 --- a/sentiment/sentiment_helpers.py +++ b/sentiment/sentiment_helpers.py @@ -1,47 +1,6 @@ import re def get_query(): -# query = { -# "_source": [ -# "text", -# "full_text", -# "extended_tweet.full_text", -# "quoted_status.text", -# "quoted_status.full_text", -# "quoted_status.extended_tweet.full_text" -# ], -# "query": { -# "bool": { -# "filter": [ -# { -# "bool": { -# "must_not": [ -# { -# "exists": { -# "field": "sentiment.vader.primary" -# } -# }, -# { -# "exists": { -# "field": "sentiment.bert.scores" -# } -# } -# ] -# } -# }, -# { -# "bool": { -# "must_not": { -# "exists": { -# "field": "retweeted_status.id" -# } -# } -# } -# } -# ] -# } -# } -# } query = { "_source": [ "text", @@ -56,11 +15,26 @@ def get_query(): "filter": [ { "bool": { - "must_not": { - "exists": { - "field": "sentiment.vader.primary" + "should": [{ + "bool": { + "must_not": { + "exists": { + "field": "sentiment.vader.primary" + } + } } - } + }, + { + "bool": { + "must_not": { + "exists": { + "field": "sentiment.bert.class" + } + } + } + } + ], + "minimum_should_match" : 1 } }, { From d95de62b2143453d9d11a7eb2432187c03a3fade Mon Sep 17 00:00:00 2001 From: shweht Date: Tue, 15 Dec 2020 21:23:52 -0500 Subject: [PATCH 3/3] fix up bert model and bert update script --- sentiment/bert_eval.py | 33 ++++++++++++--------- sentiment/bert_sentiment.py | 18 ++++++------ sentiment/config.json | 10 ++++--- sentiment/sentiment.py | 54 +++++++++++++++++++--------------- sentiment/sentiment_helpers.py | 2 +- 5 files changed, 66 insertions(+), 51 deletions(-) diff --git a/sentiment/bert_eval.py b/sentiment/bert_eval.py index 3c3960a..225fdb2 100644 --- a/sentiment/bert_eval.py +++ b/sentiment/bert_eval.py @@ -1,6 +1,6 @@ import torch import requests -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoConfig, BertForSequenceClassification from torch.nn.functional import softmax from typing import List @@ -15,11 +15,11 @@ class BertSentiment(): @param path: local relative path of the bert model @param remote: defaults to empty, if specified will download model from url """ - def __init__(self, path: str, remote: str=""): + def __init__(self, path: str, config: str, remote: str=""): if len(remote) != 0: self.download(remote) self.tokenizer = tokenizer - self.load(path) + self.load(path, config) """ Downloads bert model from remote @@ -38,11 +38,14 @@ def download(self, remote: str, dest: str) -> str: """ Loads pytorch model in for inference - @patam path: local path to the bert model + @param path: local path to the bert model + @param config: local path to bert config """ - def load(self, path:str): - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - self.model = torch.load(path) + def load(self, path:str, config:str): + self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') + self.config = AutoConfig.from_pretrained(config) + self.model = BertForSequenceClassification(self.config) + self.model.load_state_dict(torch.load(path, self.device)) self.model.to(self.device) self.model.eval() @@ -50,11 +53,13 @@ def load(self, path:str): Takes in a tweet and calculates a sentiment prediction confidences """ def score(self, text): - encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35) - inputs = encoding["input_ids"].to(self.device) - logits = self.model(inputs, labels=None)[0] - temp = torch.flatten(logits.cpu()) - preds = softmax(temp, dim=0) - sentiment = mapping[torch.argmax(preds).item()] - return preds.tolist(), sentiment + encodings = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35) + inputs = encodings["input_ids"].to(self.device) + with torch.no_grad(): + logits = self.model(inputs, labels=None)[0] + preds = softmax(logits.cpu(), dim=1) + infer = torch.argmax(preds, dim=1) + sentiment = [mapping[p.item()] for p in infer] + infer = infer - 1 + return preds.tolist(), sentiment, infer.tolist() diff --git a/sentiment/bert_sentiment.py b/sentiment/bert_sentiment.py index bcc6619..c934d04 100644 --- a/sentiment/bert_sentiment.py +++ b/sentiment/bert_sentiment.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd import torch -import torch.nn as nna +import torch.nn as nn import pickle from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report @@ -188,7 +188,7 @@ def format_time(elapsed): """ # declare a model, export to GPU if available and set to training -device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') +device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu') model = AutoModelForSequenceClassification.from_pretrained( "digitalepidemiologylab/covid-twitter-bert-v2", @@ -235,7 +235,8 @@ def format_time(elapsed): labels = batch['labels'].to(device) # feed the batch of data into the model - loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels) + outputs = model(input_ids, attention_mask=attention_mask, labels=labels) + loss, logits = outputs[0], outputs[1] total_train_loss += loss.item() # clip the norm of the gradients to 1 @@ -269,7 +270,8 @@ def format_time(elapsed): # feed tata into model without training with torch.no_grad(): - loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[0], outputs[1] total_eval_loss += loss.item() logits = logits.detach().cpu().numpy() @@ -295,10 +297,7 @@ def format_time(elapsed): print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) -torch.save(model, './sentiment.pt') -with open('training_data.pickle', 'wb') as f: - pickle.dump({'accuracy': accuracies, 'training_loss': training_losses, 'val_loss': val_losses}, f) - +torch.save(model.state_dict(), './bert_sentiment.pt') """ upload model to dropbox in case runtime disconects @@ -351,7 +350,8 @@ def largeUpload(file_path, dest_path): b_labels = batch['labels'].to(device) with torch.no_grad(): - loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[0], outputs[1] loss_data = np.append(loss_data, loss.item()) logits = logits.detach().cpu().numpy() diff --git a/sentiment/config.json b/sentiment/config.json index d502dd5..178bd11 100644 --- a/sentiment/config.json +++ b/sentiment/config.json @@ -2,13 +2,15 @@ "py/object": "config.Config", "elasticsearch_host": "localhost", "elasticsearch_verify_certs": false, - "elasticsearch_index_name": "coronavirus-data2", - "elasticsearch_batch_size": 500, + "elasticsearch_index_name": "coronavirus-data-masks", + "elasticsearch_batch_size": 1000, "elasticsearch_timeout_secs": 30, "sleep_idle_secs": 5, "sleep_not_idle_secs": 0.01, - "log_level": "WARNING", + "log_level": "INFO", "semeval_dataset" : "semeval", "semeval_url" : "https://www.dropbox.com/s/byzr8yoda6bua1b/2017_English_final.zip?dl=1", - "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] + "thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45], + "model_path" : "bert_sentiment.pt", + "model_config" : "bert_config.config" } diff --git a/sentiment/sentiment.py b/sentiment/sentiment.py index ba65127..31c6c42 100644 --- a/sentiment/sentiment.py +++ b/sentiment/sentiment.py @@ -32,7 +32,7 @@ #Load vader sentiment intensity analyzer vader = SentimentIntensityAnalyzer() -bert = BertSentiment(config.model_path) +bert = BertSentiment(config.model_path, config.model_config) #Initialize elasticsearch settings es = Elasticsearch(hosts=[config.elasticsearch_host], @@ -60,40 +60,48 @@ #Run sentiment analysis on the batch logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits))) - updates = [] - for hit in hits: - text, quoted_text = sentiment_helpers.get_tweet_text(hit) - text = sentiment_helpers.clean_text_for_vader(text) - scores, result = bert.score(text) + texts, quoted_texts = zip(*map(sentiment_helpers.get_tweet_text, hits)) + texts = list(map(sentiment_helpers.clean_text_for_vader, texts)) + quoted = {index: quote for index, quote in map(lambda x: (x[0], sentiment_helpers.clean_text_for_vader(x[1])), filter(lambda y: y[1] is not None, enumerate(quoted_texts)))} + quoted_concat = {index: "{0} {1}".format(quote, texts[index]) for index, quote in quoted.items()} + + # use bert to batch score text inputs + bert_texts = bert.score(texts) + bert_quoted = bert.score(list(quoted.values())) + bert_concat = bert.score(list(quoted_concat.values())) + bert_quoted = {key: val for key, val in zip(quoted.keys(), zip(*bert_quoted))} + bert_concat = {key: val for key, val in zip(quoted_concat.keys(), zip(*bert_concat))} + + def create_update(hit): action = { "_op_type": "update", - "_id": hit.meta["id"], + "_id": hit[1].meta["id"], "doc": { "sentiment": { "vader": { - "primary": vader.polarity_scores(text)["compound"] + "primary": vader.polarity_scores(texts[hit[0]])["compound"] }, "bert" : { - "scores": scores, - "class": result + "scores": bert_texts[0][hit[0]], + "class": bert_texts[1][hit[0]], + "primary": bert_texts[2][hit[0]] } } } } - if quoted_text is not None: - quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text) - quoted_concat_text = "{0} {1}".format(quoted_text, text) - quoted_scores, quoted_class = bert.score(quoted_text) - quoted_concat_scores, quoted_concat_class = bert.score(quoted_concat_text) - action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"] - action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"] - action["doc"]["sentiment"]["bert"]["quoted_scores"] = quoted_scores - action["doc"]["sentiment"]["bert"]["quoted_class"] = quoted_class - action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = quoted_concat_scores - action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = quoted_concat_class - - updates.append(action) + if hit[0] in quoted: + action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_texts[hit[0]])["compound"] + action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat[hit[0]])["compound"] + action["doc"]["sentiment"]["bert"]["quoted"] = bert_quoted[hit[0]][2] + action["doc"]["sentiment"]["bert"]["quoted_scores"] = bert_quoted[hit[0]][0] + action["doc"]["sentiment"]["bert"]["quoted_class"] = bert_quoted[hit[0]][1] + action["doc"]["sentiment"]["bert"]["quoted_concat"] = bert_concat[hit[0]][2] + action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = bert_concat[hit[0]][0] + action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = bert_concat[hit[0]][1] + return action + updates = list(map(create_update, enumerate(hits))) + #Issue the bulk update request logging.info("Making bulk request to Elasticsearch with {0} update actions...".format(len(updates))) bulk(es, updates, index=config.elasticsearch_index_name, chunk_size=len(updates)) diff --git a/sentiment/sentiment_helpers.py b/sentiment/sentiment_helpers.py index 219aeef..c635267 100644 --- a/sentiment/sentiment_helpers.py +++ b/sentiment/sentiment_helpers.py @@ -28,7 +28,7 @@ def get_query(): "bool": { "must_not": { "exists": { - "field": "sentiment.bert.class" + "field": "sentiment.bert.primary" } } }