Merge pull request #1 from FernanOrtega/dev
First beta version
Showing 9 changed files with 3,112 additions and 0 deletions.

.gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea

requirements.txt
@@ -0,0 +1,5 @@
spacy==2.2.3
Flask==1.1.1
gunicorn==20.0.4
numpy==1.18.1
snowballstemmer==2.0.0
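
Note that the pinned list does not include the es_core_news_md model that the analyzer loads at start-up; the model has to be installed separately. One way to fetch it is spaCy's own downloader (a one-off setup sketch, equivalent to `python -m spacy download es_core_news_md` on the command line):

# One-off setup: downloads and pip-installs the packaged Spanish model
# that SentimentAnalysis loads below.
import spacy

spacy.cli.download("es_core_news_md")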

sentimentanalysis/analizer.py
@@ -0,0 +1,151 @@
from typing import Dict

import snowballstemmer
import spacy

from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token
from itertools import groupby

from sentimentanalysis.dataloader import load_dict


class SentimentAnalysis(object):
    def __init__(self, language="es"):
        self.__nlp = spacy.load("es_core_news_md")
        stemmer = StemmerPipe(language)
        annotator = SentimentAnnotatorPipe(language)

        self.__nlp.add_pipe(stemmer)
        self.__nlp.add_pipe(annotator)

    def compute_sentiment(self, text: str, language="es") -> Dict:
        result = {}
        doc = self.__nlp(text)

        result["per_sentence_sentiment"] = self.__compute_per_sentence_sentiment(doc)
        result["global_sentiment"] = self.__compute_global_sentiment(doc)

        return result

    def __compute_per_sentence_sentiment(self, doc: Doc) -> Dict:
        result = {}

        for sent in doc.sents:
            max_score = 0.0
            min_score = 0.0

            for token in sent:
                score = token._.sentiment_weight * token._.negation_weight
                if score > 0:
                    # Apply the booster and clamp the positive score at 1.0.
                    score = min(1.0, score + token._.booster_weight)
                    if score > max_score:
                        max_score = score
                elif score < 0:
                    # Apply the booster and clamp the negative score at -1.0.
                    score = max(-1.0, score - token._.booster_weight)
                    if score < min_score:
                        min_score = score

            sentence_score = max_score + min_score
            sent._.sentiment_weight = sentence_score
            result[str(sent)] = sentence_score

        return result

    def __compute_global_sentiment(self, doc: Doc) -> float:
        max_score = 0.0
        min_score = 0.0

        for sent in doc.sents:
            if sent._.sentiment_weight > max_score:
                max_score = sent._.sentiment_weight
            elif sent._.sentiment_weight < min_score:
                min_score = sent._.sentiment_weight

        return max_score + min_score


class SentimentAnnotatorPipe(object):
    def __init__(self, language: str = "es"):
        self.__sentiment_words = load_dict(language, "sentiment_words.csv")
        self.__boosters = load_dict(language, "boosters.csv")
        self.__negations = load_dict(language, "negations.csv")
        Span.set_extension("sentiment_weight", default=0.0, force=True)
        Token.set_extension("sentiment_weight", default=0.0, force=True)
        Token.set_extension("negation_weight", default=1.0, force=True)
        Token.set_extension("booster_weight", default=0.0, force=True)

    def __call__(self, doc: Doc) -> Doc:
        self.__annotate_sentiment_words(doc)
        self.__annotate_negations_and_boosters(doc)

        return doc

    def __annotate_sentiment_words(self, doc: Doc) -> None:
        for token in doc:
            if token.pos_ == "ADJ" and not token.is_stop:
                sentiment_weight = self.__sentiment_words.get(token._.stem, 0.0)
                if sentiment_weight != 0.0:
                    token._.booster_weight = self.__get_self_boosters(token)
                    token._.sentiment_weight = sentiment_weight

    def __annotate_negations_and_boosters(self, doc: Doc) -> None:
        for sentence in doc.sents:
            for i, token in enumerate(sentence):
                # The dictionaries are keyed by stem, so tokens are looked
                # up by their stem rather than by the Token object itself.
                if token._.stem in self.__negations:
                    influenced_token = self.__get_influenced_token(sentence, i)
                    if influenced_token:
                        influenced_token._.negation_weight = (
                            self.__negations.get(token._.stem) * -1
                        )
                elif token._.stem in self.__boosters:
                    influenced_token = self.__get_influenced_token(sentence, i)
                    if influenced_token:
                        influenced_token._.booster_weight += self.__boosters.get(
                            token._.stem
                        )

    def __get_influenced_token(self, sentence: Span, influencer_index: int) -> Token:
        # Walk outwards from the influencer, alternating left and right, and
        # return the nearest token that carries a sentiment weight.
        for i in range(1, len(sentence)):
            for j in [-1, 1]:
                candidate_index = influencer_index + i * j
                if 0 <= candidate_index < len(sentence):
                    candidate = sentence[candidate_index]

                    if candidate._.sentiment_weight != 0.0:
                        return candidate

        return None

    def __get_self_boosters(self, token: Token) -> float:
        # A token boosts itself when it is written (almost) entirely in upper
        # case, e.g. "GENIAL", or when a letter is repeated three or more
        # times, e.g. "buenooo".
        return (
            1.0
            if (token.shape_.count("X") / len(token)) > 0.8
            or self.__max_rep_letters(token) >= 3
            else 0.0
        )

    def __max_rep_letters(self, token: Token) -> int:
        # Length of the longest run of the same character in the token.
        return sorted(
            [(letter, len(list(group))) for letter, group in groupby(token.lower_)],
            key=lambda i: i[1],
            reverse=True,
        )[0][1]


class StemmerPipe(object):
    def __init__(self, language="es"):
        # Only Spanish is wired up so far; the language argument is accepted
        # for symmetry with the other pipes but not used yet.
        self.__stemmer = snowballstemmer.stemmer("spanish")
        Token.set_extension("stem", default="", force=True)

    def __call__(self, doc: Doc) -> Doc:
        for token in doc:
            token._.stem = self.__stemmer.stemWord(token.lemma_)

        return doc
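
For reference, a minimal usage sketch of the pipeline above (assuming the es_core_news_md model and the resource CSVs are in place; the example sentence is illustrative):

from sentimentanalysis.analizer import SentimentAnalysis

analysis = SentimentAnalysis()

# compute_sentiment returns {"per_sentence_sentiment": {...}, "global_sentiment": float}
result = analysis.compute_sentiment("La película es muy buena.")
print(result["global_sentiment"])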

Flask application module (path not shown in this view):
@@ -0,0 +1,65 @@
import json
from typing import Union

from flask import Flask, request, Response
from werkzeug.exceptions import HTTPException, InternalServerError

from sentimentanalysis.analizer import SentimentAnalysis


def get_response(body: Union[str, dict], status: int = 200):
    json_body = {("result" if status == 200 else "error"): body}

    return Response(json.dumps(json_body), status, mimetype="application/json")


app = Flask(__name__)
sent_analysis = SentimentAnalysis()


@app.errorhandler(Exception)
def handle_exception(e):
    # Pass through HTTP errors untouched.
    if isinstance(e, HTTPException):
        return e

    # From here on, only non-HTTP exceptions are handled.
    return get_response(f"500. Unhandled exception: {e}", 500)


@app.errorhandler(InternalServerError)
def handle_500(e):
    original = getattr(e, "original_exception", None)

    if original is None:
        return get_response("500. Unhandled Internal Server Error", 500)

    # The server wrapped an unhandled error in an InternalServerError.
    return get_response(f"500. Handled Internal Server Error: {original}", 500)
@app.route("/sentimentanalysis", methods=["POST"]) | ||
def get_sentiment_analysis(): | ||
if not request.is_json: | ||
message = "Incorrect mimetype, must be 'application/json'." | ||
status_code = 415 | ||
else: | ||
request_body = request.get_json() | ||
if "text" not in request_body: | ||
message = "'text' attribute not present in request body" | ||
status_code = 422 | ||
else: | ||
text = request_body["text"] | ||
status_code = 200 | ||
message = sent_analysis.compute_sentiment(text) | ||
|
||
return get_response(message, status_code, message_as_json=True) | ||
|
||
|
||
@app.route("/") | ||
def root(): | ||
return "Everything is working fine" | ||
|
||
|
||
if __name__ == "__main__": | ||
app.run(host="0.0.0.0", port=5000) |
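
Once the server is running, the endpoint can be exercised with only the standard library (a sketch; the URL assumes the default host and port from app.run above):

import json
from urllib import request as urlrequest

req = urlrequest.Request(
    "http://localhost:5000/sentimentanalysis",
    data=json.dumps({"text": "La película es muy buena."}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

# get_response wraps the payload as {"result": ...} on success, {"error": ...} otherwise.
with urlrequest.urlopen(req) as response:
    print(json.loads(response.read()))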

sentimentanalysis/dataloader.py
@@ -0,0 +1,21 @@
import csv
import os
from typing import Dict, List

BASE_URL = f"{os.path.dirname(os.path.realpath(__file__))}/resources"


def load_dict(language: str, file_name: str) -> Dict:
    with open(os.path.join(BASE_URL, language, file_name)) as csvfile:
        csv_reader = csv.reader(csvfile)
        result = {row[0]: float(row[1]) for row in csv_reader}

    return result


def load_string_file(language: str, file_name: str) -> List:
    with open(os.path.join(BASE_URL, language, file_name)) as str_file:
        result = [line.replace("\n", "") for line in str_file]

    return result
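
Both helpers resolve paths under resources/<language>/ relative to the package. As a sketch of the expected two-column format (term, weight), using an entry from boosters.csv below:

from sentimentanalysis.dataloader import load_dict

# Each CSV row "term,weight" becomes one dict entry, e.g. "muy,1.0" -> {"muy": 1.0}
boosters = load_dict("es", "boosters.csv")
print(boosters["muy"])  # 1.0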

sentimentanalysis/resources/es/boosters.csv
@@ -0,0 +1,6 @@
muy,1.0
much,1.0
bastant,1.0
demasi,1.0
mas,1.0
gran,1.0

sentimentanalysis/resources/es/negations.csv
@@ -0,0 +1,8 @@
nunc,1.0
jamas,1.0
no,1.0
poc,1.0
sin,1.0
sin embarg,1.0
nad de,1.0
no signif que,1.0
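
The keys in both CSV files appear to be Snowball stems rather than surface forms, which is what makes the stem-based lookup in SentimentAnnotatorPipe work. A small sketch with the snowballstemmer package pinned in requirements.txt:

import snowballstemmer

stemmer = snowballstemmer.stemmer("spanish")

# Surface forms reduce to the stems stored above:
print(stemmer.stemWords(["muy", "mucho", "bastante", "nunca"]))
# -> ['muy', 'much', 'bastant', 'nunc']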