Merge extended_conditions into main repository #35
Open
RLKRo wants to merge 185 commits into dev from merge/extended_conditions
Commits (185 total; changes shown from 90 commits):
4343dff (kudep) dev: commit all
cd218c0 (RLKRo) Add source, example, test files
9b706b0 (RLKRo) Move df_extended_conditions to dff/script/logic/extended_conditions
46cbc61 (RLKRo) Move examples to examples/extended_conditions
33a1dbe (RLKRo) Move tests to tests/extended_conditions
9a57271 (RLKRo) Replace old addon names
ea80beb (RLKRo) refactor: remove info from __init__.py
de4c2e3 (RLKRo) fix: references to files in examples
b55e5b8 (RLKRo) Add rasa docker
e4a3853 (RLKRo) Update setup.py
8ed8b1c (RLKRo) Add env variables
4d70a8a (RLKRo) test: partially fix huggingface tests
2795e73 (ruthenian8) merge moved examples
7230f62 (ruthenian8) remove utils, migrate examples to pipeline, add readme, alter…
e13e2dc (ruthenian8) reformat docs for RST, debug hf models: save dataset for hf matcher
c7a1f44 (ruthenian8) debug examples #1: introduce skip conditions
f33d657 (ruthenian8) update documentation and examples: add docstrings, module-level docs
f66d67e (ruthenian8) Merge branch 'rdev' into merge/extended_conditions
3f1563c (ruthenian8) Apply formatting:
15cc5aa (ruthenian8) update references in tests
b448d9b (ruthenian8) fix tests for remote execution
57407d5 (ruthenian8) Merge branch 'dev' into merge/extended_conditions
e3ad218 (ruthenian8) format test_dialogflow.py
fa8feee (ruthenian8) Fix CI problems:
f92b95d (ruthenian8) Alter testing options:
31512d9 (ruthenian8) Change deployment options:
b9372f3 (ruthenian8) fix tests for rasa & dialogflow
cefddaa (ruthenian8) improve coverage by removing untested lines and adding new tests
6a31168 (ruthenian8) revert rasa example
df76f85 (ruthenian8) reformat rasa example
ce5af99 (ruthenian8) debug Dataset class: allow instantiation from list
aa5197a (ruthenian8) adjust examples for doc building
4381faa (ruthenian8) rewrite examples
ef641ac (ruthenian8) merge remote dev
3c64aae (ruthenian8) adapt for Message class
2e5fa91 (ruthenian8) format file headers; alter coverage.yml
6bf99f3 (ruthenian8) add device to hf example; remove hf from .env_file
9fc1eca (ruthenian8) add parameters to BaseModel abstract class; change build_docs.yml
ff16b82 (ruthenian8) fix rasa random_seed for training uniformity
3abaa3e (ruthenian8) rasa add random_seed
abd7767 (ruthenian8) Update dockerfile_extended_conditions
7e65385 (ruthenian8) use ast.literal_eval to circumvent file creation
e0002b5 (ruthenian8) redefine skip conditions for tests; update docs
8bec534 (ruthenian8) remove skip conditions for test_dialogflow
c86939e (avsakharov) docs: fix warnings
19d0bc8 (ruthenian8) change workflow for build docs
f66229e (ruthenian8) correct typo
a7a094a (ruthenian8) fix typo
e496c37 (ruthenian8) add python hash seed to .env_file
e10ead8 (ruthenian8) rework gensim example
03de8b5 (ruthenian8) change thresholds for gensim example && remove variables from test_full
2a3ffa1 (ruthenian8) use correct url && remove unused imports
8c4e6f5 (ruthenian8) remove old code from test_dialogflow
63fe863 (ruthenian8) use word2vec format to avoid problems with pickle.load
77f4d94 (ruthenian8) employ additional skip conditions for examples; change threshold in g…
5d72701 (ruthenian8) remove sklearn dependency from conftest; check spelling; import sklea…
64f9def (ruthenian8) circumvent import errors from pyyaml; remove torch.device from type a…
7cd57de (ruthenian8) Merge branch 'dev' into merge/extended_conditions
3361577 (ruthenian8) add empty line to test_full
73b0d5c (ruthenian8) circumvent 'import joblib' error in 'test_no_deps'
677ff33 (ruthenian8) import numpy in test_sklearn after skip_conditions
64d643b (ruthenian8) change docs for modules
ef8659d (ruthenian8) apply lint
29af70a (ruthenian8) Update documentation for extra_conditions
388e5a5 (ruthenian8) document utils; change header for hf_api_model
77f6971 (ruthenian8) apply lint: invalid docs in utils
50ad934 (ruthenian8) merge dev into extended conditions
1713194 (ruthenian8) partial fix of tests
752b438 (ruthenian8) Update workflows
9a2dc1b (ruthenian8) Update setup.py
75017cd (ruthenian8) Update setup.py
b4d80dd (ruthenian8) Update setup.py
eb96c42 (ruthenian8) correct setup.py
c9e0a34 (ruthenian8) Update conftest.py
3ced545 (ruthenian8) update tutorials; use categorical_code as normal attribute
90e8f25 (ruthenian8) Update pytest markers
a58e5bd (ruthenian8) Update tutors
8794344 (ruthenian8) update test_full
25b6d2d (ruthenian8) Update docs & code comments
bd7355a (ruthenian8) require requests for extended conditions; update requirements in tuto…
4614c93 (ruthenian8) rename BaseModel to ExtrasBaseModel
f89605d (ruthenian8) set up GDF in test_full
8ef14f6 (ruthenian8) add debug print to test_tutors
881826e (ruthenian8) add realpath directives to workflows; alter transformers version
fa2d619 (ruthenian8) Update env variables
e533d5b (ruthenian8) Update imports
b9d90b5 (ruthenian8) Update hf example
98fbda2 (ruthenian8) configure softmax from dim=0 to dim=1
cdf8ec0 (ruthenian8) Update happy path
5fbc16f (RLKRo) Merge branch 'dev' into merge/extended_conditions
9488b6c (NotBioWaste) Updated extra dependencies
564601f (NotBioWaste) Added ext profile to CONTRIBUTING.md
ca02a08 (NotBioWaste) Fix typo
a0edbc6 (NotBioWaste) Merge remote-tracking branch 'origin/dev' into merge/extended_conditions
cd6c025 (NotBioWaste) Reworking namespaces and label caching
3809d57 (NotBioWaste) Moved llm_conditions to
d520d0c (NotBioWaste) Fixed models call
fd77a11 (NotBioWaste) Fixed dependecies and references to modules
d2d3680 (NotBioWaste) Fixed tests, rewriting tutorials
8ace188 (NotBioWaste905) Added caching for async API calls, working on async ExtrasBaseAPIModel
f639141 (NotBioWaste905) Removed local models, updated tutorials
99ced4d (NotBioWaste905) Fixed namespace reference
f15be68 (NotBioWaste905) Started working on llm_responses
7dd03a1 (NotBioWaste) Fixed typos in tutorials
56b7789 (NotBioWaste) Created class, created 1st tutorial
af60115 (NotBioWaste) Added dependecies for langchain
b3b79a5 (NotBioWaste) Fixed adding custom prompt for each node
6eb910d (NotBioWaste) Added image processing, updated tutorial
1f8cddc (NotBioWaste) Added typehint
74cd954 (NotBioWaste) Added llm_response, LLM_API, history management
1fd31a2 (NotBioWaste) Fixed image reading
2c48490 (NotBioWaste) Started llm condition
a1884e5 (NotBioWaste) Added message_to_langchain
61f302e (NotBioWaste) Implementing deepeval integration
38a8f8f (NotBioWaste905) Figured out how to implement DeepEval functions
592267f (NotBioWaste) Adding conditions
baccc47 (NotBioWaste) Implemented simple conditions call, added BaseMethod class, renaming,…
8e84ba1 (NotBioWaste) Fixed history extraction
2b2847b (NotBioWaste905) Delete test_bot.py
7e336ac (NotBioWaste) Fixed prompt handling, switched to AIMessage in LLM response
71babbf (NotBioWaste) Merge branch 'feat/llm_responses' of https://github.com/deeppavlov/di…
351ae06 (NotBioWaste) Fixed conditions call
e3d0d15 (NotBioWaste) Working on autotesting
0405998 (NotBioWaste) Added tests
3dbfd0c (NotBioWaste) Removed unused method
5c876ba (NotBioWaste) Added annotations
8f1932c (NotBioWaste) Added structured output support, tweaked tests
aedf47e (NotBioWaste) Reworking tutorials
adadb05 (NotBioWaste) Reworked prompt usage and hierarchy, reworked filters and methods
0288896 (NotBioWaste) No idea how to make script smaller in tutorials
67e2758 (NotBioWaste) Small fixes in tutorials and structured generation
428a9f0 (NotBioWaste) Working on user guide
5e26b4b (NotBioWaste) Fixed some tutorials, finished user guide
5dbb6cd (NotBioWaste) Bugfixes in docs
db63d1a (NotBioWaste) Lint
2b9080f (NotBioWaste) Removed type annotation that broke docs building
2bcda71 (NotBioWaste) Tests and bugfixes
d2f28ed (NotBioWaste) Deleted DeepEval references
7318c91 (NotBioWaste) Numpy versions trouble
27eae27 (NotBioWaste) Fixed dependecies
3fed1fc (NotBioWaste) Made everything asynchronous
30862ca (NotBioWaste) Added and unified docstring
06ab5bc (NotBioWaste) Added 4th tutorial, fixed message_schema parameter passing
798a77b (NotBioWaste) Bugfix, added max_size to the message_to_langchain function
3343159 (NotBioWaste) Made even more everything asynchronous
014ff7e (NotBioWaste) Remade condition, added logprob check
761bd81 (NotBioWaste) Async bugfix, added model_result_to_text, working on message_schema f…
90a811e (NotBioWaste) Minor fixes, tinkering tests
5bff191 (RLKRo) Merge branch 'refs/heads/dev' into feat/llm_responses
8b88ba6 (RLKRo) update lock file
20c4afd (RLKRo) Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
0139421 (NotBioWaste905) Merge remote-tracking branch 'origin/master' into feat/llm_responses
9bb0cba (NotBioWaste905) Updating to v1.0
f2d6b68 (NotBioWaste905) Finished tests, finished update
6fddaea (NotBioWaste905) lint
e06bc2b (NotBioWaste905) Started working on llm slots
22d8efc (NotBioWaste905) Resolving pydantic errors
aa735b5 (NotBioWaste905) Delete llmslot_test.py
cc91133 (NotBioWaste905) Finished LLMSlot, working on LLMGroupSlot
8756838 (NotBioWaste905) Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
f1857f6 (NotBioWaste905) Added flag to
c334ff5 (NotBioWaste905) First test attempts
8306bbb (NotBioWaste905) linting
f842776 (NotBioWaste905) Merge branch 'feat/slots_extraction_update' into feat/llm_responses
ada17ca (NotBioWaste905) Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
a45f653 (NotBioWaste905) File structure fixed
3838d30 (NotBioWaste905) Fixed naming
0e650f8 (NotBioWaste905) Create LLMCondition and LLMResponse classes
ca79f94 (NotBioWaste905) Merge branch 'dev' into merge/extended_conditions
015cb4f (NotBioWaste905) Debugging flattening
b6e5eeb (NotBioWaste905) Bugfix
b20137e (NotBioWaste905) Added return_type property for LLMSlot
25f5b04 (NotBioWaste905) Changed return_type from Any to type
b651087 (NotBioWaste905) lint
284555d (NotBioWaste905) Fixed dependency namings
354b51d (NotBioWaste905) Fixed singledispatch
640aeb3 (NotBioWaste905) Removed Dataset and ExtrasBaseModel, created HasLabel and HasMatch co…
ee7d5e2 (NotBioWaste905) Removed deprecated files
492239d (NotBioWaste905) Deleted synchronous variants, removed property models_labels from Con…
474cd7f (NotBioWaste905) Deleted unused modules, merged classes with their abstract variants
1b5a77b (NotBioWaste905) removed deprecated from_script from tutorials
c18d375 (NotBioWaste905) Fixed LLMCondition class
e884494 (NotBioWaste905) Removed inner functions, fixed signatures in conditions
459f7fc (NotBioWaste905) Fixed missing 'models' field in Pipeline, updated tutorials
57a2d9d (NotBioWaste905) Merge branch 'feat/llm_responses' into merge/extended_conditions
@@ -35,6 +35,16 @@ jobs:
         with:
           pandoc-version: '3.1.6'
 
+      - name: Create gdf_account.json
+        uses: jsdaniell/create-json@v1.2.2  # action reference was e-mail-obfuscated in the capture; the exact version tag is an assumption
+        with:
+          name: "gdf_account.json"
+          json: ${{ secrets.GDF_ACCOUNT_JSON }}
+
+      - name: write realpath to env
+        run: |
+          echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV
+
       - name: install dependencies
         run: |
           make venv
@@ -45,6 +55,8 @@ jobs:
           TG_API_ID: ${{ secrets.TG_API_ID }}
           TG_API_HASH: ${{ secrets.TG_API_HASH }}
           TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
+          GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
+          HF_API_KEY: ${{ secrets.HF_API_KEY }}
         run: |
           make doc
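The two added steps use a standard GitHub Actions pattern: materialize a secret into a file, then append a KEY=value line to the file named by $GITHUB_ENV so that every later step in the job sees the value as an ordinary environment variable. A minimal sketch of the mechanism (step and variable names are illustrative, not from this PR):

      - name: export a path
        run: echo "MY_JSON=$(realpath ./my_file.json)" >> $GITHUB_ENV

      - name: use it in a later step
        run: cat "$MY_JSON"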
@@ -50,11 +50,23 @@ jobs:
           rm -rf /tmp/backup
           touch venv # disable venv target
 
+      - name: Create gdf_account.json
+        uses: jsdaniell/create-json@v1.2.2
+        with:
+          name: "gdf_account.json"
+          json: ${{ secrets.GDF_ACCOUNT_JSON }}
+
+      - name: write realpath to env
+        run: |
+          echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV
+
       - name: run tests
         env:
           TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
           TG_API_ID: ${{ secrets.TG_API_ID }}
           TG_API_HASH: ${{ secrets.TG_API_HASH }}
           TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
+          HF_API_KEY: ${{ secrets.HF_API_KEY }}
+          GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
         run: |
           make test TEST_ALLOW_SKIP=telegram
@@ -43,18 +43,30 @@ jobs:
           python -m pip install -e .[test_full]
         shell: bash
 
+      - name: Create gdf_account.json
+        uses: jsdaniell/create-json@v1.2.2
+        with:
+          name: "gdf_account.json"
+          json: ${{ secrets.GDF_ACCOUNT_JSON }}
+
+      - name: write realpath to env
+        run: |
+          echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV
+
       - name: run pytest
         env:
           TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
           TG_API_ID: ${{ secrets.TG_API_ID }}
           TG_API_HASH: ${{ secrets.TG_API_HASH }}
           TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
+          HF_API_KEY: ${{ secrets.HF_API_KEY }}
+          GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
             source <(cat .env_file | sed 's/=/=/' | sed 's/^/export /')
             pytest --tb=long -vv --cache-clear --no-cov --allow-skip=telegram tests/
           else
-            pytest -m "not docker" --tb=long -vv --cache-clear --no-cov --allow-skip=telegram,docker tests/
+            pytest -m "not docker" --tb=long -vv --cache-clear --no-cov --allow-skip=telegram,docker,huggingface,rasa,dialogflow tests/
           fi
         shell: bash
   test_no_deps:
@@ -77,12 +89,24 @@ jobs:
           python -m pip install -e .[tests]
         shell: bash
 
+      - name: Create gdf_account.json
+        uses: jsdaniell/create-json@v1.2.2
+        with:
+          name: "gdf_account.json"
+          json: ${{ secrets.GDF_ACCOUNT_JSON }}
+
+      - name: write realpath to env
+        run: |
+          echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV
+
       - name: run pytest
         env:
           TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
           TG_API_ID: ${{ secrets.TG_API_ID }}
           TG_API_HASH: ${{ secrets.TG_API_HASH }}
           TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
+          HF_API_KEY: ${{ secrets.HF_API_KEY }}
+          GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
         run: |
           source <(cat .env_file | sed 's/=/=/' | sed 's/^/export /')
           pytest --tb=long -vv --cache-clear --no-cov --allow-skip=all tests/
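The test workflows also source .env_file before invoking pytest: each KEY=value line is prefixed with "export" and fed back to the shell through process substitution, so the variables become visible to pytest and its child processes. A stripped-down bash illustration (file contents invented; the no-op sed from the diff is omitted):

    echo "PYTHONHASHSEED=42" > .env_file
    source <(sed 's/^/export /' .env_file)   # rewrite each line as an export statement
    echo "$PYTHONHASHSEED"                   # prints 42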
@@ -0,0 +1,111 @@

"""
Conditions
------------

This module provides condition functions for annotation processing.
"""
from typing import Callable, Optional, List
from functools import singledispatch

try:
    from sklearn.metrics.pairwise import cosine_similarity

    sklearn_available = True
except ImportError:
    sklearn_available = False
from dff.script import Context
from dff.pipeline import Pipeline
from dff.script.extras.conditions.dataset import DatasetItem
from dff.script.extras.conditions.utils import LABEL_KEY
from dff.script.extras.conditions.models.base_model import ExtrasBaseModel


@singledispatch
def has_cls_label(label, namespace: Optional[str] = None, threshold: float = 0.9):
    """
    Use this condition when you need to check whether the probability
    of a particular label for the last annotated user utterance surpasses the threshold.

    :param label: String name or a reference to a DatasetItem object, or a collection thereof.
    :param namespace: Namespace key of a particular model that should detect the dataset_item.
        If not set, all namespaces will be searched for the required dataset_item.
    :param threshold: The minimal label probability that triggers a positive response
        from the function.
    """
    raise NotImplementedError


@has_cls_label.register(str)
def _(label, namespace: Optional[str] = None, threshold: float = 0.9):
    def has_cls_label_inner(ctx: Context, _) -> bool:
        if LABEL_KEY not in ctx.framework_states:
            return False
        if namespace is not None:
            return ctx.framework_states[LABEL_KEY].get(namespace, {}).get(label, 0) >= threshold
        # No namespace given: accept if any model's score for the label passes the threshold.
        scores = [item.get(label, 0) for item in ctx.framework_states[LABEL_KEY].values()]
        return any(score >= threshold for score in scores)

    return has_cls_label_inner


@has_cls_label.register(DatasetItem)
def _(label, namespace: Optional[str] = None, threshold: float = 0.9) -> Callable[[Context, Pipeline], bool]:
    def has_cls_label_inner(ctx: Context, _) -> bool:
        if LABEL_KEY not in ctx.framework_states:
            return False
        if namespace is not None:
            return ctx.framework_states[LABEL_KEY].get(namespace, {}).get(label.label, 0) >= threshold
        scores = [item.get(label.label, 0) for item in ctx.framework_states[LABEL_KEY].values()]
        return any(score >= threshold for score in scores)

    return has_cls_label_inner


@has_cls_label.register(list)
def _(label, namespace: Optional[str] = None, threshold: float = 0.9):
    def has_cls_label_inner(ctx: Context, pipeline: Pipeline) -> bool:
        if LABEL_KEY not in ctx.framework_states:
            return False
        # Delegate to the scalar variants: the condition holds if any list member matches.
        return any(has_cls_label(item, namespace, threshold)(ctx, pipeline) for item in label)

    return has_cls_label_inner


def has_match(
    model: ExtrasBaseModel,
    positive_examples: List[str],
    negative_examples: Optional[List[str]] = None,
    threshold: float = 0.9,
):
    """
    Use this condition if you need to check whether the last request matches
    any of the predefined intent utterances.
    The model passed to this function should be in the fit state.

    :param model: Any model from the :py:mod:`~dff.script.extras.conditions.models.local.cosine_matchers` module.
    :param positive_examples: Utterances that the request should match. At least one is required.
    :param negative_examples: Utterances that the request should not match.
    :param threshold: Similarity threshold that triggers a positive response from the function.
    """
    if negative_examples is None:
        negative_examples = []

    def has_match_inner(ctx: Context, _) -> bool:
        if not (ctx.last_request and ctx.last_request.text):
            return False
        input_vector = model.transform(ctx.last_request.text)
        positive_vectors = [model.transform(item) for item in positive_examples]
        negative_vectors = [model.transform(item) for item in negative_examples]
        positive_sims = [cosine_similarity(input_vector, item)[0][0] for item in positive_vectors]
        negative_sims = [cosine_similarity(input_vector, item)[0][0] for item in negative_vectors]
        max_pos_sim = max(positive_sims)
        max_neg_sim = 0 if len(negative_sims) == 0 else max(negative_sims)
        # Succeed only when the best positive match clears the threshold
        # and the best negative match stays below it.
        return bool(max_pos_sim > threshold > max_neg_sim)

    return has_match_inner
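For orientation, a minimal sketch of how these conditions are meant to be wired into a script. The flow layout, the "greeting" label, and the assumption that an annotator has already populated LABEL_KEY in the context are illustrative, not part of this diff:

from dff.script import RESPONSE, TRANSITIONS, Message
from dff.script.extras.conditions.conditions import has_cls_label

script = {
    "flow": {
        "start": {
            RESPONSE: Message(text="How can I help?"),
            TRANSITIONS: {
                # Taken when any annotator scored "greeting" at or above 0.95.
                ("flow", "greet"): has_cls_label("greeting", threshold=0.95),
            },
        },
        "greet": {RESPONSE: Message(text="Hello!")},
    }
}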
@@ -0,0 +1,109 @@

"""
Dataset
--------

This module contains data structures that are required to parse items from files
and to parse requests and responses to and from various APIs.
"""
from pathlib import Path
import json
from typing import List, Dict, Union

from pydantic import BaseModel, Field, field_validator, model_validator

try:
    from yaml import load, SafeLoader

    pyyaml_available = True
except ImportError:
    pyyaml_available = False


class DatasetItem(BaseModel, arbitrary_types_allowed=True):
    """
    Data structure for storing labeled utterances.

    :param label: Raw classification label.
    :param samples: Utterance examples. At least one sentence is required.
    """

    label: str
    samples: List[Union[List[str], Dict[str, str], str]] = Field(default_factory=list, min_length=1)
    categorical_code: int = Field(default=0)


class Dataset(BaseModel, arbitrary_types_allowed=True):
    """
    Data structure for storing multiple :py:class:`~DatasetItem` objects.

    :param items: Can be initialized either with a list or with a dict
        of :py:class:`~DatasetItem` objects.
        Makes each item accessible by its label.
    """

    items: Dict[str, DatasetItem] = Field(default_factory=dict)
    flat_items: list = Field(default_factory=list)
    """`flat_items` field is populated automatically using objects from the `items` field."""

    def __getitem__(self, idx: int):
        # `flat_items` is a list, so indexing is positional.
        return self.flat_items[idx]

    def __len__(self):
        return len(self.flat_items)

    @classmethod
    def _get_path(cls, file: Union[str, Path]) -> Path:
        file_path = file if isinstance(file, Path) else Path(file)
        if not file_path.exists() or not file_path.is_file():
            raise OSError(f"File does not exist: {file}")
        return file_path

    @classmethod
    def parse_json(cls, file: Union[str, Path]):
        file_path = cls._get_path(file)
        items = json.load(file_path.open("r", encoding="utf-8"))
        return cls(items=[DatasetItem.model_validate(item) for item in items])

    @classmethod
    def parse_jsonl(cls, file: Union[str, Path]):
        file_path = cls._get_path(file)
        lines = file_path.open("r", encoding="utf-8").readlines()
        items = [DatasetItem.model_validate_json(line) for line in lines]
        return cls(items=items)

    @classmethod
    def parse_yaml(cls, file: Union[str, Path]):
        if not pyyaml_available:
            raise ImportError("`pyyaml` package missing. Try `pip install dff[ext]`.")
        file_path = cls._get_path(file)
        raw_items = load(file_path.open("r", encoding="utf-8").read(), SafeLoader)["items"]
        items = [DatasetItem.model_validate(item) for item in raw_items]
        return cls(items=items)

    @field_validator("items", mode="before")
    @classmethod
    def pre_validate_items(cls, value: Union[Dict[str, DatasetItem], List[DatasetItem]]):
        if isinstance(value, list):  # if items were passed as a list, cast them to a dict
            new_value = [DatasetItem.model_validate(item) for item in value]
            value = {item.label: item for item in new_value}
        return value

    @model_validator(mode="after")
    def post_validation(self):
        # Assign a stable categorical code to each item in insertion order.
        items: Dict[str, DatasetItem] = self.items
        for idx, key in enumerate(items.keys()):
            items[key].categorical_code = idx

        # Flatten the dataset into (sample, label) pairs for model training.
        sentences = [sentence for dataset_item in items.values() for sentence in dataset_item.samples]
        pred_labels = [
            label for dataset_item in items.values() for label in [dataset_item.label] * len(dataset_item.samples)
        ]
        self.flat_items = list(zip(sentences, pred_labels))
        return self
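A small usage sketch of the validators above (labels and samples invented): passing a list triggers pre_validate_items, which keys the dict by label, and post_validation then assigns categorical codes and builds the flattened (sample, label) pairs:

from dff.script.extras.conditions.dataset import Dataset, DatasetItem

dataset = Dataset(
    items=[
        DatasetItem(label="greeting", samples=["hi", "hello there"]),
        DatasetItem(label="goodbye", samples=["bye", "see you"]),
    ]
)

assert dataset.items["greeting"].categorical_code == 0  # codes follow insertion order
assert len(dataset) == 4                 # one (sample, label) pair per sample
assert dataset[0] == ("hi", "greeting")  # flat_items is indexed positionally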
@@ -0,0 +1,9 @@

from .local.classifiers.huggingface import HFClassifier  # noqa: F401
from .local.classifiers.regex import RegexClassifier, RegexModel  # noqa: F401
from .local.classifiers.sklearn import SklearnClassifier  # noqa: F401
from .local.cosine_matchers.gensim import GensimMatcher  # noqa: F401
from .local.cosine_matchers.huggingface import HFMatcher  # noqa: F401
from .local.cosine_matchers.sklearn import SklearnMatcher  # noqa: F401
from .remote_api.google_dialogflow_model import GoogleDialogFlowModel, AsyncGoogleDialogFlowModel  # noqa: F401
from .remote_api.rasa_model import AsyncRasaModel, RasaModel  # noqa: F401
from .remote_api.hf_api_model import AsyncHFAPIModel, HFAPIModel  # noqa: F401
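The noqa: F401 pragmas mark these as intentional re-exports: the package gathers every classifier and matcher under a single namespace, so user code can import them directly from the models package, for instance (import path inferred from this diff):

from dff.script.extras.conditions.models import RegexModel, SklearnMatcher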
I think the Path file object should support direct readlines() calls.
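For reference, the line under discussion and a close pathlib equivalent (file name hypothetical):

from pathlib import Path

p = Path("data.jsonl")
lines = p.open("r", encoding="utf-8").readlines()   # what the diff does; keeps trailing newlines
lines = p.read_text(encoding="utf-8").splitlines()  # also reads all lines, and closes the file promptly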