Merge pull request #1 from FernanOrtega/dev
First beta version
Showing 9 changed files with 3,112 additions and 0 deletions.

.gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea

requirements.txt
@@ -0,0 +1,5 @@
spacy==2.2.3
Flask==1.1.1
gunicorn==20.0.4
numpy==1.18.1
snowballstemmer==2.0.0
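
Note that the pinned list does not include the es_core_news_md model that the analyzer loads at start-up; the model has to be installed separately. One way to fetch it is spaCy's own downloader (a one-off setup sketch, equivalent to `python -m spacy download es_core_news_md` on the command line):

# One-off setup: downloads and pip-installs the packaged Spanish model
# that SentimentAnalysis loads below.
import spacy

spacy.cli.download("es_core_news_md")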

sentimentanalysis/analizer.py
@@ -0,0 +1,151 @@
from typing import Dict

import snowballstemmer
import spacy

from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token
from itertools import groupby

from sentimentanalysis.dataloader import load_dict


class SentimentAnalysis(object):
    def __init__(self, language="es"):
        self.__nlp = spacy.load("es_core_news_md")
        stemmer = StemmerPipe(language)
        annotator = SentimentAnnotatorPipe(language)

        self.__nlp.add_pipe(stemmer)
        self.__nlp.add_pipe(annotator)

    def compute_sentiment(self, text: str, language="es") -> Dict:
        result = {}
        doc = self.__nlp(text)

        result["per_sentence_sentiment"] = self.__compute_per_sentence_sentiment(doc)
        result["global_sentiment"] = self.__compute_global_sentiment(doc)

        return result

    def __compute_per_sentence_sentiment(self, doc: Doc) -> Dict:
        result = {}

        for sent in doc.sents:
            max_score = 0.0
            min_score = 0.0

            for token in sent:
                score = token._.sentiment_weight * token._.negation_weight
                if score > 0:
                    # Apply the booster and clamp the positive score at 1.0.
                    score = min(1.0, score + token._.booster_weight)
                    if score > max_score:
                        max_score = score
                elif score < 0:
                    # Apply the booster and clamp the negative score at -1.0.
                    score = max(-1.0, score - token._.booster_weight)
                    if score < min_score:
                        min_score = score

            sentence_score = max_score + min_score
            sent._.sentiment_weight = sentence_score
            result[str(sent)] = sentence_score

        return result

    def __compute_global_sentiment(self, doc: Doc) -> float:
        max_score = 0.0
        min_score = 0.0

        for sent in doc.sents:
            if sent._.sentiment_weight > max_score:
                max_score = sent._.sentiment_weight
            elif sent._.sentiment_weight < min_score:
                min_score = sent._.sentiment_weight

        return max_score + min_score


class SentimentAnnotatorPipe(object):
    def __init__(self, language: str = "es"):
        self.__sentiment_words = load_dict(language, "sentiment_words.csv")
        self.__boosters = load_dict(language, "boosters.csv")
        self.__negations = load_dict(language, "negations.csv")
        Span.set_extension("sentiment_weight", default=0.0, force=True)
        Token.set_extension("sentiment_weight", default=0.0, force=True)
        Token.set_extension("negation_weight", default=1.0, force=True)
        Token.set_extension("booster_weight", default=0.0, force=True)

    def __call__(self, doc: Doc) -> Doc:
        self.__annotate_sentiment_words(doc)
        self.__annotate_negations_and_boosters(doc)

        return doc

    def __annotate_sentiment_words(self, doc: Doc) -> None:
        for token in doc:
            if token.pos_ == "ADJ" and not token.is_stop:
                sentiment_weight = self.__sentiment_words.get(token._.stem, 0.0)
                if sentiment_weight != 0.0:
                    token._.booster_weight = self.__get_self_boosters(token)
                    token._.sentiment_weight = sentiment_weight

    def __annotate_negations_and_boosters(self, doc: Doc) -> None:
        for sentence in doc.sents:
            for i, token in enumerate(sentence):
                # The dictionaries are keyed by stem, so tokens are looked
                # up by their stem rather than by the Token object itself.
                if token._.stem in self.__negations:
                    influenced_token = self.__get_influenced_token(sentence, i)
                    if influenced_token:
                        influenced_token._.negation_weight = (
                            self.__negations.get(token._.stem) * -1
                        )
                elif token._.stem in self.__boosters:
                    influenced_token = self.__get_influenced_token(sentence, i)
                    if influenced_token:
                        influenced_token._.booster_weight += self.__boosters.get(
                            token._.stem
                        )

    def __get_influenced_token(self, sentence: Span, influencer_index: int) -> Token:
        # Walk outwards from the influencer, alternating left and right, and
        # return the nearest token that carries a sentiment weight.
        for i in range(1, len(sentence)):
            for j in [-1, 1]:
                candidate_index = influencer_index + i * j
                if 0 <= candidate_index < len(sentence):
                    candidate = sentence[candidate_index]

                    if candidate._.sentiment_weight != 0.0:
                        return candidate

        return None

    def __get_self_boosters(self, token: Token) -> float:
        # A token boosts itself when it is written (almost) entirely in upper
        # case, e.g. "GENIAL", or when a letter is repeated three or more
        # times, e.g. "buenooo".
        return (
            1.0
            if (token.shape_.count("X") / len(token)) > 0.8
            or self.__max_rep_letters(token) >= 3
            else 0.0
        )

    def __max_rep_letters(self, token: Token) -> int:
        # Length of the longest run of the same character in the token.
        return sorted(
            [(letter, len(list(group))) for letter, group in groupby(token.lower_)],
            key=lambda i: i[1],
            reverse=True,
        )[0][1]


class StemmerPipe(object):
    def __init__(self, language="es"):
        # Only Spanish is wired up so far; the language argument is accepted
        # for symmetry with the other pipes but not used yet.
        self.__stemmer = snowballstemmer.stemmer("spanish")
        Token.set_extension("stem", default="", force=True)

    def __call__(self, doc: Doc) -> Doc:
        for token in doc:
            token._.stem = self.__stemmer.stemWord(token.lemma_)

        return doc
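
For reference, a minimal usage sketch of the pipeline above (assuming the es_core_news_md model and the resource CSVs are in place; the example sentence is illustrative):

from sentimentanalysis.analizer import SentimentAnalysis

analysis = SentimentAnalysis()

# compute_sentiment returns {"per_sentence_sentiment": {...}, "global_sentiment": float}
result = analysis.compute_sentiment("La película es muy buena.")
print(result["global_sentiment"])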

Flask application module (path not shown in this view):
@@ -0,0 +1,65 @@
import json
from typing import Union

from flask import Flask, request, Response
from werkzeug.exceptions import HTTPException, InternalServerError

from sentimentanalysis.analizer import SentimentAnalysis


def get_response(body: Union[str, dict], status: int = 200):
    json_body = {("result" if status == 200 else "error"): body}

    return Response(json.dumps(json_body), status, mimetype="application/json")


app = Flask(__name__)
sent_analysis = SentimentAnalysis()


@app.errorhandler(Exception)
def handle_exception(e):
    # Pass through HTTP errors untouched.
    if isinstance(e, HTTPException):
        return e

    # From here on, only non-HTTP exceptions are handled.
    return get_response(f"500. Unhandled exception: {e}", 500)


@app.errorhandler(InternalServerError)
def handle_500(e):
    original = getattr(e, "original_exception", None)

    if original is None:
        return get_response("500. Unhandled Internal Server Error", 500)

    # The server wrapped an unhandled error in an InternalServerError.
    return get_response(f"500. Handled Internal Server Error: {original}", 500)
@app.route("/sentimentanalysis", methods=["POST"]) | ||
def get_sentiment_analysis(): | ||
if not request.is_json: | ||
message = "Incorrect mimetype, must be 'application/json'." | ||
status_code = 415 | ||
else: | ||
request_body = request.get_json() | ||
if "text" not in request_body: | ||
message = "'text' attribute not present in request body" | ||
status_code = 422 | ||
else: | ||
text = request_body["text"] | ||
status_code = 200 | ||
message = sent_analysis.compute_sentiment(text) | ||
|
||
return get_response(message, status_code, message_as_json=True) | ||
|
||
|
||
@app.route("/") | ||
def root(): | ||
return "Everything is working fine" | ||
|
||
|
||
if __name__ == "__main__": | ||
app.run(host="0.0.0.0", port=5000) |
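
Once the server is running, the endpoint can be exercised with only the standard library (a sketch; the URL assumes the default host and port from app.run above):

import json
from urllib import request as urlrequest

req = urlrequest.Request(
    "http://localhost:5000/sentimentanalysis",
    data=json.dumps({"text": "La película es muy buena."}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

# get_response wraps the payload as {"result": ...} on success, {"error": ...} otherwise.
with urlrequest.urlopen(req) as response:
    print(json.loads(response.read()))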

sentimentanalysis/dataloader.py
@@ -0,0 +1,21 @@
import csv
import os
from typing import Dict, List

BASE_URL = f"{os.path.dirname(os.path.realpath(__file__))}/resources"


def load_dict(language: str, file_name: str) -> Dict:
    with open(os.path.join(BASE_URL, language, file_name)) as csvfile:
        csv_reader = csv.reader(csvfile)
        result = {row[0]: float(row[1]) for row in csv_reader}

    return result


def load_string_file(language: str, file_name: str) -> List:
    with open(os.path.join(BASE_URL, language, file_name)) as str_file:
        result = [line.replace("\n", "") for line in str_file]

    return result
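
Both helpers resolve paths under resources/<language>/ relative to the package. As a sketch of the expected two-column format (term, weight), using an entry from boosters.csv below:

from sentimentanalysis.dataloader import load_dict

# Each CSV row "term,weight" becomes one dict entry, e.g. "muy,1.0" -> {"muy": 1.0}
boosters = load_dict("es", "boosters.csv")
print(boosters["muy"])  # 1.0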

sentimentanalysis/resources/es/boosters.csv
@@ -0,0 +1,6 @@
muy,1.0
much,1.0
bastant,1.0
demasi,1.0
mas,1.0
gran,1.0

sentimentanalysis/resources/es/negations.csv
@@ -0,0 +1,8 @@
nunc,1.0
jamas,1.0
no,1.0
poc,1.0
sin,1.0
sin embarg,1.0
nad de,1.0
no signif que,1.0
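
The keys in both CSV files appear to be Snowball stems rather than surface forms, which is what makes the stem-based lookup in SentimentAnnotatorPipe work. A small sketch with the snowballstemmer package pinned in requirements.txt:

import snowballstemmer

stemmer = snowballstemmer.stemmer("spanish")

# Surface forms reduce to the stems stored above:
print(stemmer.stemWords(["muy", "mucho", "bastante", "nunca"]))
# -> ['muy', 'much', 'bastant', 'nunc']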