deeppavlov · oserikov · Sep 22, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/annotators/emotion_detection/Dockerfile b/annotators/emotion_detection/Dockerfile
@@ -0,0 +1,38 @@
+# Image for the emotion_detection annotator: CUDA base + Python deps + the
+# MultimodalERC sources and pretrained weights unpacked under /data.
+FROM nvidia/cuda:12.1.1-base-ubuntu20.04
+
+# Build-time only: keep apt from stopping the build on an interactive prompt.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Refresh the package index and install every system package in ONE layer, so a
+# cached `apt update` layer is never reused together with newer install commands.
+# NOTE(review): python3-pip on this base presumably targets the distribution's
+# default python3 rather than python3.9 - confirm which interpreter runs the service.
+RUN apt-get update && \
+    apt-get install -y \
+        python3.9 \
+        git \
+        python3-pip \
+        wget \
+        ffmpeg=7:4.2.7-0ubuntu0.1 \
+        libsm6=2:1.2.3-1 \
+        libxext6=2:1.3.4-0ubuntu1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Build arguments are re-exported as environment variables because server.py
+# reads them with os.getenv() at runtime.
+ARG VIDEO_PRETRAINED
+ARG TEXT_PRETRAINED
+ARG MODEL_PATH
+ARG MULTIMODAL_MODEL
+ARG REDUNDANT_FEATURES
+
+ENV VIDEO_PRETRAINED=$VIDEO_PRETRAINED
+ENV TEXT_PRETRAINED=$TEXT_PRETRAINED
+ENV MULTIMODAL_MODEL=$MULTIMODAL_MODEL
+ENV MODEL_PATH=$MODEL_PATH
+ENV REDUNDANT_FEATURES=$REDUNDANT_FEATURES
+
+WORKDIR /src
+
+COPY . /src
+RUN mkdir /data
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN pip install --no-cache-dir gdown==4.7.1
+
+# Fetch the model sources at a pinned commit so rebuilds are reproducible.
+RUN git clone https://github.com/anna-a-m/MultimodalERC /data/repo && \
+    cd /data/repo && \
+    git reset --hard 84097d442b23b5a9238b5090a04e2625741314ae
+
+# Flatten the checkout into /data (the glob skips dotfiles, so .git is dropped
+# together with the now-empty clone directory).
+RUN mv -f /data/repo/* /data/ && rm -rf /data/repo
+
+# server.py imports `multimodal_concat.models`; the marker file makes the
+# directory a regular package.
+RUN touch /data/multimodal_concat/__init__.py
+
+# Pretrained weights: unpack into /data and delete the archive in the same layer
+# so it does not stay in the image.
+RUN wget -O models http://files.deeppavlov.ai/dream_data/emotion_detection/emotion_detection_v1.tar.gz && \
+    tar -xf models -C /data/ && \
+    rm -f models
+# NOTE(review): the file is stored as /data/redundant_feat; make sure the
+# REDUNDANT_FEATURES value passed at build time points at this path.
+RUN wget -O /data/redundant_feat http://files.deeppavlov.ai/dream_data/emotion_detection/redundant_feat.txt
diff --git a/annotators/emotion_detection/aux.py b/annotators/emotion_detection/aux.py
@@ -0,0 +1,4 @@
+import sys
+
+sys.path.append("/data")
+sys.path.append("/data/multimodal_concat")
diff --git a/annotators/emotion_detection/requirements.txt b/annotators/emotion_detection/requirements.txt
@@ -0,0 +1,16 @@
+pandas==1.5.3
+scikit-learn==1.3.0
+tqdm==4.64.1
+opencv-python==4.7.0.68
+opensmile==2.4.2
+sentry-sdk==1.15.0
+torch==1.13.1
+transformers==4.31.0
+fastapi==0.103.0
+blinker==1.5.0
+pydantic==2.3.0
+numpy==1.24.4
+starlette==0.27.0
+uvicorn==0.23.2
+Pillow==9.3.0
+wandb==0.13.9
diff --git a/annotators/emotion_detection/server.py b/annotators/emotion_detection/server.py
@@ -0,0 +1,199 @@
+import logging
+import os
+import opensmile
+import torch
+import numpy as np
+import sentry_sdk
+import cv2
+import aux  # noqa: F401
+
+from multimodal_concat.models import MultimodalClassificationModel, MainModel
+from multimodal_concat.utils import prepare_models
+
+from fastapi import FastAPI
+from fastapi.encoders import jsonable_encoder
+from pydantic import BaseModel
+from starlette.middleware.cors import CORSMiddleware
+from transformers import AutoTokenizer, AutoProcessor
+from typing import List
+from urllib.request import urlretrieve
+
+sentry_sdk.init(dsn=os.getenv("SENTRY_DSN"))
+
+label2id = {
+    "anger": 0,
+    "disgust": 1,
+    "fear": 2,
+    "joy": 3,
+    "neutral": 4,
+    "sadness": 5,
+    "surprise": 6,
+}
+num_labels = 7
+text_model, video_model, audio_model = prepare_models(num_labels, os.getenv("MODEL_PATH"))
+
+logger = logging.getLogger(__name__)
+
+
+def sample_frame_indices(seg_len, clip_len=16, frame_sample_rate=4, mode="video"):
+    converted_len = int(clip_len * frame_sample_rate)
+    converted_len = min(converted_len, seg_len - 1)
+    end_idx = np.random.randint(converted_len, seg_len)
+    start_idx = end_idx - converted_len
+    if mode == "video":
+        indices = np.linspace(start_idx, end_idx, num=clip_len)
+    else:
+        indices = np.linspace(start_idx, end_idx, num=clip_len * frame_sample_rate)
+    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+    return indices
+
+
+def get_frames(
+    file_path,
+    clip_len=16,
+):
+    cap = cv2.VideoCapture(file_path)
+    v_len = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    indices = sample_frame_indices(v_len)
+
+    frames = []
+    for fn in range(v_len):
+        success, frame = cap.read()
+        if success is False:
+            continue
+        if fn in indices:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            res = cv2.resize(frame, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
+            frames.append(res)
+    cap.release()
+
+    if len(frames) < clip_len:
+        add_num = clip_len - len(frames)
+        frames_to_add = [frames[-1]] * add_num
+        frames.extend(frames_to_add)
+
+    return frames
+
+
+def create_final_model():
+    multi_model = MultimodalClassificationModel(
+        text_model,
+        video_model,
+        audio_model,
+        num_labels,
+        input_size=4885,
+        hidden_size=512,
+    )
+    checkpoint = torch.load(os.getenv("MULTIMODAL_MODEL"))
+    multi_model.load_state_dict(checkpoint)
+
+    device = "cuda"
+    return MainModel(multi_model, device=device)
+
+
+def process_text(input_tokens: str):
+    text_model_name = os.getenv("TEXT_PRETRAINED")
+    logger.info(f"{text_model_name}")
+    tokenizer = AutoTokenizer.from_pretrained(text_model_name)
+
+    return tokenizer(
+        input_tokens,
+        padding="max_length",
+        truncation=True,
+        max_length=128,
+        return_tensors="pt",
+    )
+
+
+def process_video(video_path: str):
+    video_frames = get_frames(video_path)
+
+    video_model_name = os.getenv("VIDEO_PRETRAINED")
+    video_feature_extractor = AutoProcessor.from_pretrained(video_model_name)
+
+    return video_feature_extractor(videos=video_frames, return_tensors="pt")
+
+
+def process_audio(file_path: str):
+    smile = opensmile.Smile(
+        opensmile.FeatureSet.ComParE_2016,
+        opensmile.FeatureLevel.Functionals,
+        sampling_rate=16000,
+        resample=True,
+        num_workers=5,
+        verbose=True,
+    )
+
+    redundant_features = os.getenv("REDUNDANT_FEATURES")
+    with open(redundant_features, "r") as features_file:
+        redundant_features_list = features_file.read().split(",")
+
+    audio_features = smile.process_files([file_path])
+    audio_features = audio_features.drop(columns=redundant_features_list, inplace=False)
+    return audio_features.values.reshape(audio_features.shape[0], 1, audio_features.shape[1])
+
+
+def inference(text: str, video_path: str):
+    text_encoding = process_text(text)
+    video_encoding = process_video(video_path)
+    audio_features = process_audio(video_path)
+    batch = {
+        "text": text_encoding,
+        "video": video_encoding,
+        "audio": audio_features,
+        "label": None,
+    }
+    label = final_model(batch)
+    id2label = {v: k for k, v in label2id.items()}
+    return id2label[int(label.detach().cpu())]
+
+
+def predict_emotion(text: str, video_path: str):
+    try:
+        logger.warning(f"{inference(text, video_path)}")
+        return inference(text, video_path)
+    except Exception as e:
+        sentry_sdk.capture_exception(e)
+        raise e
+
+
+# Built once at import time; inference() calls this module-level object.
+final_model = create_final_model()
+
+
+class EmotionsPayload(BaseModel):
+    """Request body of the ``/model`` endpoint: two lists that ``infer`` pairs element-wise."""
+
+    # Message texts; each item is passed to ``subinfer`` as ``msg_text``.
+    personality: List[str]
+    # Video locations; each item is passed to ``subinfer`` as ``video_path`` and downloaded there.
+    video_path: List[str]
+
+def subinfer(msg_text: str, video_path: str):
+    emotion = "Emotion detection unsuccessfull. An error occured during inference."
+    filepath = "undefined"
+    try:
+        filename = video_path.split("=")[-1]
+        filepath = f"/data/{filename}"
+        urlretrieve(video_path, filepath)
+        if not os.path.exists(filepath):
+            raise ValueError(f"Failed to retrieve videofile from {filepath}")
+        emotion = predict_emotion(msg_text + " ", filepath)
+        logger.info(f"Detected emotion: {jsonable_encoder(emotion)}")
+    except Exception as e:
+        raise ValueError(f"The message format is correct, but: {e}")
+
+    return emotion
+
+
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.post("/model")
+def infer(payload: EmotionsPayload):
+    logger.info(f"Emotion Detection: {payload}")
+    emotion = [subinfer(p[0], p[1]) for p in zip(payload.personality, payload.video_path)]
+    return jsonable_encoder(emotion)
diff --git a/annotators/emotion_detection/service_configs/emotion-detection/environment.yml b/annotators/emotion_detection/service_configs/emotion-detection/environment.yml
@@ -0,0 +1,9 @@
+# Runtime settings of the emotion-detection service.
+SERVICE_PORT: 8040
+SERVICE_NAME: emotion_detection
+CUDA_VISIBLE_DEVICES: 0
+VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
+# Was misspelled EXT_PRETRAINED; server.py reads TEXT_PRETRAINED.
+TEXT_PRETRAINED: "bert-large-uncased"
+MULTIMODAL_MODEL: "final_model.pt"
+REDUNDANT_FEATURES: "redundant_features.txt"
+MODEL_PATH: "/data/"
+PREFIX: "Detect emotions:"
diff --git a/annotators/emotion_detection/service_configs/emotion-detection/service.yml b/annotators/emotion_detection/service_configs/emotion-detection/service.yml
@@ -0,0 +1,29 @@
+# Compose definition of the emotion-detection annotator.
+name: emotion-detection
+endpoints:
+  - model
+compose:
+  env_file:
+    - .env
+  build:
+    # Forwarded to the Dockerfile as build arguments.
+    args:
+      SERVICE_PORT: 8040
+      SERVICE_NAME: emotion_detection
+      VIDEO_PRETRAINED: "microsoft/xclip-base-patch32"
+      TEXT_PRETRAINED: "bert-large-uncased"
+      MULTIMODAL_MODEL: "final_model.pt"
+      REDUNDANT_FEATURES: "redundant_features.txt"
+      MODEL_PATH: "/data/"
+      PREFIX: "Detect emotions:"
+    context: .
+    dockerfile: ./annotators/emotion_detection/Dockerfile
+  command: uvicorn server:app --host 0.0.0.0 --port 8040
+  deploy:
+    resources:
+      # NOTE(review): confirm 1G is enough for the three pretrained models.
+      limits:
+        memory: 1G
+      reservations:
+        memory: 1G
+  environment:
+    - CUDA_VISIBLE_DEVICES=0
+  ports:
+    - 8040:8040
diff --git a/annotators/german_translation_pa/Dockerfile b/annotators/german_translation_pa/Dockerfile
@@ -0,0 +1,13 @@
+# Image for the German translation annotator (FastAPI app served by uvicorn).
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Copy the requirements first so the dependency layer is reused when only the
+# sources change.
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8181
+
+# The FastAPI instance lives in server.py, so the import string is `server:app`
+# (there is no `main` module in this service).
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8181"]
diff --git a/annotators/german_translation_pa/requirements.txt b/annotators/german_translation_pa/requirements.txt
@@ -0,0 +1,5 @@
+fastapi==0.95.2
+uvicorn==0.22.0
+requests==2.31.0
+pydantic==1.10.12
+gigachat==0.1.35
diff --git a/annotators/german_translation_pa/server.py b/annotators/german_translation_pa/server.py
@@ -0,0 +1,55 @@
+import os
+import logging
+from typing import List
+
+from fastapi import FastAPI, HTTPException
+from fastapi.encoders import jsonable_encoder
+from pydantic import BaseModel
+
+from gigachat import GigaChat
+from gigachat.models import Chat
+
+# ASGI application object; the /translate route is registered on it.
+app = FastAPI()
+
+class TextInput(BaseModel):
+    """Request body of the ``/translate`` endpoint."""
+
+    # Texts to translate; ``translate_text`` produces one translation per item.
+    sentences: List[str]
+
+if not all([os.getenv("GIGACHAT_CREDENTIAL"), os.getenv("GIGACHAT_SCOPE")]):
+    logging.error("ENV VARIABLES FOR GIGACHAT ARE NOT SET, THE SERVICE WILL NOT WORK")
+
+@app.post("/translate")
+def translate_text(payload: TextInput):
+    gigachat_api_key = os.getenv("GIGACHAT_CREDENTIAL")
+    gigachat_org = os.getenv("GIGACHAT_SCOPE")
+
+    if not all([gigachat_api_key, gigachat_org]):
+        logging.error("Gigachat credentials are not set")
+        raise HTTPException(status_code=500, detail="Gigachat credentials are not set")
+
+    translated_text = []
+
+    for msg in payload.sentences:
+        try:
+            giga = GigaChat(credentials=gigachat_api_key, verify_ssl_certs=False)
+
+            messages = [
+                {
+                    "role": "system",
+                    "content": "You are a translator that translates English text into German."
+                },
+                {
+                    "role": "user",
+                    "content": msg
+                }
+            ]
+
+            payload = Chat(messages=messages, scope=gigachat_org)
+
+            response = giga.chat(payload)
+
+            translated_text += [response.choices[0].message.content.strip()]
+            logging.info(f"Translated text: {translated_text}")
+        except Exception as e:
+            logging.exception("Error during translation")
+            raise HTTPException(status_code=500, detail=str(e))
+    return jsonable_encoder([{"batch": translated_text}])
diff --git a/annotators/kbqa/tests/test_kbqa.py b/annotators/kbqa/tests/test_kbqa.py
@@ -13,7 +13,7 @@
         ),
         (
             {"x_init": ["How old is Donald Trump?"], "entities": [["Donald Trump"]], "entity_tags": [[["per", 1.0]]]},
-            "Donald Trump is 77 years old.",
+            "Donald Trump is 78 years old.",
         ),
     ],
 )