Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bert Sentiment Scoring Script #53

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ twitter_monitor/tmlog.txt
embedder/embedderlog.txt
twitter_monitor/jdllog.txt
sentiment/sentimentlog.txt
sentiment/semeval*
sentiment/sentiment.pt
analysis/snapshots/*.Rdata
65 changes: 65 additions & 0 deletions sentiment/bert_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import torch
import requests
from transformers import AutoTokenizer, AutoConfig, BertForSequenceClassification
from torch.nn.functional import softmax
from typing import List

# global constants
MODEL_NAME = 'digitalepidemiologylab/covid-twitter-bert-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
mapping = { 0: 'negative', 1: 'neutral', 2: 'positive' }

class BertSentiment():
"""
Initializes a bert model used for evaluation
@param path: local relative path of the bert model
@param remote: defaults to empty, if specified will download model from url
"""
def __init__(self, path: str, config: str, remote: str=""):
if len(remote) != 0:
self.download(remote)
self.tokenizer = tokenizer
self.load(path, config)

"""
Downloads bert model from remote
@param remote: url location of bert model
@param dest: destination path where model will be downloaded to
"""
def download(self, remote: str, dest: str) -> str:
try:
res = requests.get(remote, allow_redirects=True)
with open(dest, "wb") as f:
f.write(res.content)
return dest
except:
print("Could not download model")
return None

"""
Loads pytorch model in for inference
@param path: local path to the bert model
@param config: local path to bert config
"""
def load(self, path:str, config:str):
self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
self.config = AutoConfig.from_pretrained(config)
self.model = BertForSequenceClassification(self.config)
self.model.load_state_dict(torch.load(path, self.device))
self.model.to(self.device)
self.model.eval()

"""
Takes in a tweet and calculates a sentiment prediction confidences
"""
def score(self, text):
encodings = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=35)
inputs = encodings["input_ids"].to(self.device)
with torch.no_grad():
logits = self.model(inputs, labels=None)[0]
preds = softmax(logits.cpu(), dim=1)
infer = torch.argmax(preds, dim=1)
sentiment = [mapping[p.item()] for p in infer]
infer = infer - 1
return preds.tolist(), sentiment, infer.tolist()

18 changes: 9 additions & 9 deletions sentiment/bert_sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
import torch
import torch.nn as nna
import torch.nn as nn
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
Expand Down Expand Up @@ -188,7 +188,7 @@ def format_time(elapsed):
"""

# declare a model, export to GPU if available and set to training
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')

model = AutoModelForSequenceClassification.from_pretrained(
"digitalepidemiologylab/covid-twitter-bert-v2",
Expand Down Expand Up @@ -235,7 +235,8 @@ def format_time(elapsed):
labels = batch['labels'].to(device)

# feed the batch of data into the model
loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss, logits = outputs[0], outputs[1]
total_train_loss += loss.item()

# clip the norm of the gradients to 1
Expand Down Expand Up @@ -269,7 +270,8 @@ def format_time(elapsed):

# feed tata into model without training
with torch.no_grad():
loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
loss, logits = outputs[0], outputs[1]
total_eval_loss += loss.item()

logits = logits.detach().cpu().numpy()
Expand All @@ -295,10 +297,7 @@ def format_time(elapsed):
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))

torch.save(model, './sentiment.pt')
with open('training_data.pickle', 'wb') as f:
pickle.dump({'accuracy': accuracies, 'training_loss': training_losses, 'val_loss': val_losses}, f)

torch.save(model.state_dict(), './bert_sentiment.pt')

"""
upload model to dropbox in case runtime disconects
Expand Down Expand Up @@ -351,7 +350,8 @@ def largeUpload(file_path, dest_path):
b_labels = batch['labels'].to(device)

with torch.no_grad():
loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
loss, logits = outputs[0], outputs[1]
loss_data = np.append(loss_data, loss.item())

logits = logits.detach().cpu().numpy()
Expand Down
10 changes: 6 additions & 4 deletions sentiment/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
"py/object": "config.Config",
"elasticsearch_host": "localhost",
"elasticsearch_verify_certs": false,
"elasticsearch_index_name": "coronavirus-data2",
"elasticsearch_batch_size": 500,
"elasticsearch_index_name": "coronavirus-data-masks",
"elasticsearch_batch_size": 1000,
"elasticsearch_timeout_secs": 30,
"sleep_idle_secs": 5,
"sleep_not_idle_secs": 0.01,
"log_level": "WARNING",
"log_level": "INFO",
"semeval_dataset" : "semeval",
"semeval_url" : "https://www.dropbox.com/s/byzr8yoda6bua1b/2017_English_final.zip?dl=1",
"thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
"thresholds" : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45],
"model_path" : "bert_sentiment.pt",
"model_config" : "bert_config.config"
}
56 changes: 39 additions & 17 deletions sentiment/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sentiment_helpers
import time
import logging
from bert_eval import BertSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
Expand Down Expand Up @@ -31,8 +32,10 @@
#Load vader sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()

bert = BertSentiment(config.model_path, config.model_config)

#Initialize elasticsearch settings
es = Elasticsearch(hosts=[config.elasticsearch_host],
es = Elasticsearch(hosts=[config.elasticsearch_host],
verify_certs=config.elasticsearch_verify_certs,
timeout=config.elasticsearch_timeout_secs)

Expand All @@ -56,30 +59,49 @@
continue

#Run sentiment analysis on the batch
logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader...".format(len(hits)))
updates = []
for hit in hits:
text, quoted_text = sentiment_helpers.get_tweet_text(hit)
text = sentiment_helpers.clean_text_for_vader(text)
logging.info("Found {0} unscored docs. Calculating sentiment scores with Vader and Bert...".format(len(hits)))
texts, quoted_texts = zip(*map(sentiment_helpers.get_tweet_text, hits))
texts = list(map(sentiment_helpers.clean_text_for_vader, texts))
quoted = {index: quote for index, quote in map(lambda x: (x[0], sentiment_helpers.clean_text_for_vader(x[1])), filter(lambda y: y[1] is not None, enumerate(quoted_texts)))}
quoted_concat = {index: "{0} {1}".format(quote, texts[index]) for index, quote in quoted.items()}

# use bert to batch score text inputs
bert_texts = bert.score(texts)
bert_quoted = bert.score(list(quoted.values()))
bert_concat = bert.score(list(quoted_concat.values()))
bert_quoted = {key: val for key, val in zip(quoted.keys(), zip(*bert_quoted))}
bert_concat = {key: val for key, val in zip(quoted_concat.keys(), zip(*bert_concat))}

def create_update(hit):
action = {
"_op_type": "update",
"_id": hit.meta["id"],
"_id": hit[1].meta["id"],
"doc": {
"sentiment": {
"vader": {
"primary": vader.polarity_scores(text)["compound"]
}
"primary": vader.polarity_scores(texts[hit[0]])["compound"]
},
"bert" : {
"scores": bert_texts[0][hit[0]],
"class": bert_texts[1][hit[0]],
"primary": bert_texts[2][hit[0]]
}
}
}
}
if quoted_text is not None:
quoted_text = sentiment_helpers.clean_text_for_vader(quoted_text)
quoted_concat_text = "{0} {1}".format(quoted_text, text)
action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_text)["compound"]
action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat_text)["compound"]

updates.append(action)
if hit[0] in quoted:
action["doc"]["sentiment"]["vader"]["quoted"] = vader.polarity_scores(quoted_texts[hit[0]])["compound"]
action["doc"]["sentiment"]["vader"]["quoted_concat"] = vader.polarity_scores(quoted_concat[hit[0]])["compound"]
action["doc"]["sentiment"]["bert"]["quoted"] = bert_quoted[hit[0]][2]
action["doc"]["sentiment"]["bert"]["quoted_scores"] = bert_quoted[hit[0]][0]
action["doc"]["sentiment"]["bert"]["quoted_class"] = bert_quoted[hit[0]][1]
action["doc"]["sentiment"]["bert"]["quoted_concat"] = bert_concat[hit[0]][2]
action["doc"]["sentiment"]["bert"]["quoted_concat_scores"] = bert_concat[hit[0]][0]
action["doc"]["sentiment"]["bert"]["quoted_concat_class"] = bert_concat[hit[0]][1]
return action

updates = list(map(create_update, enumerate(hits)))

#Issue the bulk update request
logging.info("Making bulk request to Elasticsearch with {0} update actions...".format(len(updates)))
bulk(es, updates, index=config.elasticsearch_index_name, chunk_size=len(updates))
Expand All @@ -89,4 +111,4 @@
time.sleep(config.sleep_not_idle_secs)

except Exception as ex:
logging.exception("Exception occurred while polling or processing a batch.")
logging.exception("Exception occurred while polling or processing a batch.")
25 changes: 20 additions & 5 deletions sentiment/sentiment_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,26 @@ def get_query():
"filter": [
{
"bool": {
"must_not": {
"exists": {
"field": "sentiment.vader.primary"
"should": [{
"bool": {
"must_not": {
"exists": {
"field": "sentiment.vader.primary"
}
}
}
}
},
{
"bool": {
"must_not": {
"exists": {
"field": "sentiment.bert.primary"
}
}
}
}
],
"minimum_should_match" : 1
}
},
{
Expand Down Expand Up @@ -55,4 +70,4 @@ def clean_text_for_vader(text):
text = re.sub(r"http\S+", "", text)
text = re.sub(r" +", " ", text)
text = text.strip()
return text
return text