Merge extended_conditions into main repository #35

Open · wants to merge 194 commits into base: dev
Commits (194)
4343dff
dev: commit all
kudep Oct 24, 2022
cd218c0
Add source, example, test files
RLKRo Nov 3, 2022
9b706b0
Move df_extended_conditions to dff/script/logic/extended_conditions
RLKRo Nov 3, 2022
46cbc61
Move examples to examples/extended_conditions
RLKRo Nov 3, 2022
33a1dbe
Move tests to tests/extended_conditions
RLKRo Nov 3, 2022
9a57271
Replace old addon names
RLKRo Nov 3, 2022
ea80beb
refactor: remove info from __init__.py
RLKRo Nov 9, 2022
de4c2e3
fix: references to files in examples
RLKRo Nov 9, 2022
b55e5b8
Add rasa docker
RLKRo Nov 9, 2022
e4a3853
Update setup.py
RLKRo Nov 9, 2022
8ed8b1c
Add env variables
RLKRo Nov 9, 2022
4d70a8a
test: partially fix huggingface tests
RLKRo Nov 9, 2022
2795e73
merge moved examples
ruthenian8 Dec 1, 2022
7230f62
remove utils, migrate examples to pipeline, add readme, alter…
ruthenian8 Dec 5, 2022
e13e2dc
reformat docs for RST, debug hf models: save dataset for hf matcher
ruthenian8 Dec 7, 2022
c7a1f44
debug examples #1: introduce skip conditions
ruthenian8 Dec 7, 2022
f33d657
update documentation and examples: add docstrings, module-level docs
ruthenian8 Dec 11, 2022
f66d67e
Merge branch 'rdev' into merge/extended_conditions
ruthenian8 Dec 27, 2022
3f1563c
Apply formatting:
ruthenian8 Dec 27, 2022
15cc5aa
update references in tests
ruthenian8 Dec 27, 2022
b448d9b
fix tests for remote execution
ruthenian8 Dec 28, 2022
57407d5
Merge branch 'dev' into merge/extended_conditions
ruthenian8 Dec 28, 2022
e3ad218
format test_dialogflow.py
ruthenian8 Dec 28, 2022
fa8feee
Fix CI problems:
ruthenian8 Dec 29, 2022
f92b95d
Alter testing options:
ruthenian8 Dec 29, 2022
31512d9
Change deployment options:
ruthenian8 Dec 29, 2022
b9372f3
fix tests for rasa & dialogflow
ruthenian8 Dec 29, 2022
cefddaa
improve coverage by removing untested lines and adding new tests
ruthenian8 Dec 30, 2022
6a31168
revert rasa example
ruthenian8 Dec 30, 2022
df76f85
reformat rasa example
ruthenian8 Dec 30, 2022
ce5af99
debug Dataset class: allow instantiation from list
ruthenian8 Jan 4, 2023
aa5197a
adjust examples for doc building
ruthenian8 Jan 9, 2023
4381faa
rewrite examples
ruthenian8 Jan 13, 2023
ef641ac
merge remote dev
ruthenian8 Mar 6, 2023
3c64aae
adapt for Message class
ruthenian8 Mar 6, 2023
2e5fa91
format file headers; alter coverage.yml
ruthenian8 Mar 6, 2023
6bf99f3
add device to hf example; remove hf from .env_file
ruthenian8 Mar 6, 2023
9fc1eca
add parameters to BaseModel abstract class; change build_docs.yml
ruthenian8 Mar 6, 2023
ff16b82
fix rasa random_seed for training uniformity
ruthenian8 Mar 7, 2023
3abaa3e
rasa add random_seed
ruthenian8 Mar 7, 2023
abd7767
Update dockerfile_extended_conditions
ruthenian8 Mar 7, 2023
7e65385
use ast.literal_eval to circumvent file creation
ruthenian8 Mar 7, 2023
e0002b5
redefine skip conditions for tests; update docs
ruthenian8 Mar 7, 2023
8bec534
remove skip conditions for test_dialogflow
ruthenian8 Mar 7, 2023
c86939e
docs: fix warnings
avsakharov Mar 9, 2023
19d0bc8
change workflow for build docs
ruthenian8 Mar 10, 2023
f66229e
correct typo
ruthenian8 Mar 10, 2023
a7a094a
fix typo
ruthenian8 Mar 10, 2023
e496c37
add python hash seed to .env_file
ruthenian8 Mar 11, 2023
e10ead8
rework gensim example
ruthenian8 Mar 14, 2023
03de8b5
change thresholds for gensim example && remove variables from test_full
ruthenian8 Mar 14, 2023
2a3ffa1
use correct url && remove unused imports
ruthenian8 Mar 14, 2023
8c4e6f5
remove old code from test_dialogflow
ruthenian8 Mar 14, 2023
63fe863
use word2vec format to avoid problems with pickle.load
ruthenian8 Mar 14, 2023
77f4d94
employ additional skip conditions for examples; change threshold in g…
ruthenian8 Mar 15, 2023
5d72701
remove sklearn dependency from conftest; check spelling; import sklea…
ruthenian8 Mar 15, 2023
64f9def
circumvent import errors from pyyaml; remove torch.device from type a…
ruthenian8 Mar 15, 2023
7cd57de
Merge branch 'dev' into merge/extended_conditions
ruthenian8 Mar 15, 2023
3361577
add empty line to test_full
ruthenian8 Mar 15, 2023
73b0d5c
circumvent 'import joblib' error in 'test_no_deps'
ruthenian8 Mar 15, 2023
677ff33
import numpy in test_sklearn after skip_conditions
ruthenian8 Mar 15, 2023
64d643b
change docs for modules
ruthenian8 Mar 15, 2023
ef8659d
apply lint
ruthenian8 Mar 15, 2023
29af70a
Update documentation for extra_conditions
ruthenian8 Mar 16, 2023
388e5a5
document utils; change header for hf_api_model
ruthenian8 Mar 16, 2023
77f6971
apply lint: invalid docs in utils
ruthenian8 Mar 16, 2023
50ad934
merge dev into extended conditions
ruthenian8 Nov 29, 2023
1713194
partial fix of tests
ruthenian8 Nov 29, 2023
752b438
Update workflows
ruthenian8 Nov 29, 2023
9a2dc1b
Update setup.py
ruthenian8 Nov 30, 2023
75017cd
Update setup.py
ruthenian8 Nov 30, 2023
b4d80dd
Update setup.py
ruthenian8 Nov 30, 2023
eb96c42
correct setup.py
ruthenian8 Nov 30, 2023
c9e0a34
Update conftest.py
ruthenian8 Nov 30, 2023
3ced545
update tutorials; use categorical_code as normal attribute
ruthenian8 Nov 30, 2023
90e8f25
Update pytest markers
ruthenian8 Dec 1, 2023
a58e5bd
Update tutors
ruthenian8 Dec 1, 2023
8794344
update test_full
ruthenian8 Dec 1, 2023
25b6d2d
Update docs & code comments
ruthenian8 Dec 4, 2023
bd7355a
require requests for extended conditions; update requirements in tuto…
ruthenian8 Dec 4, 2023
4614c93
rename BaseModel to ExtrasBaseModel
ruthenian8 Dec 4, 2023
f89605d
set up GDF in test_full
ruthenian8 Dec 4, 2023
8ef14f6
add debug print to test_tutors
ruthenian8 Dec 4, 2023
881826e
add realpath directives to workflows; alter transformers version
ruthenian8 Dec 4, 2023
fa2d619
Update env variables
ruthenian8 Dec 4, 2023
e533d5b
Update imports
ruthenian8 Dec 4, 2023
b9d90b5
Update hf example
ruthenian8 Dec 4, 2023
98fbda2
configure softmax from dim=0 to dim=1
ruthenian8 Dec 4, 2023
cdf8ec0
Update happy path
ruthenian8 Dec 4, 2023
5fbc16f
Merge branch 'dev' into merge/extended_conditions
RLKRo Dec 13, 2023
9488b6c
Updated extra dependencies
NotBioWaste Jul 1, 2024
564601f
Added ext profile to CONTRIBUTING.md
NotBioWaste Jul 1, 2024
ca02a08
Fix typo
NotBioWaste Jul 1, 2024
a0edbc6
Merge remote-tracking branch 'origin/dev' into merge/extended_conditions
NotBioWaste Jul 5, 2024
cd6c025
Reworking namespaces and label caching
NotBioWaste Jul 10, 2024
3809d57
Moved llm_conditions to
NotBioWaste Jul 11, 2024
d520d0c
Fixed models call
NotBioWaste Jul 11, 2024
fd77a11
Fixed dependecies and references to modules
NotBioWaste Jul 15, 2024
d2d3680
Fixed tests, rewriting tutorials
NotBioWaste Jul 18, 2024
8ace188
Added caching for async API calls, working on async ExtrasBaseAPIModel
NotBioWaste905 Jul 19, 2024
f639141
Removed local models, updated tutorials
NotBioWaste905 Jul 19, 2024
99ced4d
Fixed namespace reference
NotBioWaste905 Jul 19, 2024
f15be68
Started working on llm_responses
NotBioWaste905 Jul 19, 2024
7dd03a1
Fixed typos in tutorials
NotBioWaste Jul 22, 2024
56b7789
Created class, created 1st tutorial
NotBioWaste Jul 22, 2024
af60115
Added dependecies for langchain
NotBioWaste Jul 22, 2024
b3b79a5
Fixed adding custom prompt for each node
NotBioWaste Jul 22, 2024
6eb910d
Added image processing, updated tutorial
NotBioWaste Jul 22, 2024
1f8cddc
Added typehint
NotBioWaste Jul 22, 2024
74cd954
Added llm_response, LLM_API, history management
NotBioWaste Jul 22, 2024
1fd31a2
Fixed image reading
NotBioWaste Jul 22, 2024
2c48490
Started llm condition
NotBioWaste Jul 24, 2024
a1884e5
Added message_to_langchain
NotBioWaste Jul 24, 2024
61f302e
Implementing deepeval integration
NotBioWaste Jul 29, 2024
38a8f8f
Figured out how to implement DeepEval functions
NotBioWaste905 Jul 30, 2024
592267f
Adding conditions
NotBioWaste Jul 31, 2024
baccc47
Implemented simple conditions call, added BaseMethod class, renaming,…
NotBioWaste Aug 1, 2024
8e84ba1
Fixed history extraction
NotBioWaste Aug 2, 2024
2b2847b
Delete test_bot.py
NotBioWaste905 Aug 2, 2024
7e336ac
Fixed prompt handling, switched to AIMessage in LLM response
NotBioWaste Aug 5, 2024
71babbf
Merge branch 'feat/llm_responses' of https://github.com/deeppavlov/di…
NotBioWaste Aug 5, 2024
351ae06
Fixed conditions call
NotBioWaste Aug 5, 2024
e3d0d15
Working on autotesting
NotBioWaste Aug 5, 2024
0405998
Added tests
NotBioWaste Aug 7, 2024
3dbfd0c
Removed unused method
NotBioWaste Aug 7, 2024
5c876ba
Added annotations
NotBioWaste Aug 7, 2024
8f1932c
Added structured output support, tweaked tests
NotBioWaste Aug 7, 2024
aedf47e
Reworking tutorials
NotBioWaste Aug 7, 2024
adadb05
Reworked prompt usage and hierarchy, reworked filters and methods
NotBioWaste Aug 12, 2024
0288896
No idea how to make script smaller in tutorials
NotBioWaste Aug 12, 2024
67e2758
Small fixes in tutorials and structured generation
NotBioWaste Aug 13, 2024
428a9f0
Working on user guide
NotBioWaste Aug 14, 2024
5e26b4b
Fixed some tutorials, finished user guide
NotBioWaste Aug 14, 2024
5dbb6cd
Bugfixes in docs
NotBioWaste Aug 14, 2024
db63d1a
Lint
NotBioWaste Aug 14, 2024
2b9080f
Removed type annotation that broke docs building
NotBioWaste Aug 14, 2024
2bcda71
Tests and bugfixes
NotBioWaste Aug 15, 2024
d2f28ed
Deleted DeepEval references
NotBioWaste Aug 15, 2024
7318c91
Numpy versions trouble
NotBioWaste Aug 15, 2024
27eae27
Fixed dependecies
NotBioWaste Aug 16, 2024
3fed1fc
Made everything asynchronous
NotBioWaste Aug 16, 2024
30862ca
Added and unified docstring
NotBioWaste Aug 16, 2024
06ab5bc
Added 4th tutorial, fixed message_schema parameter passing
NotBioWaste Aug 16, 2024
798a77b
Bugfix, added max_size to the message_to_langchain function
NotBioWaste Aug 20, 2024
3343159
Made even more everything asynchronous
NotBioWaste Aug 21, 2024
014ff7e
Remade condition, added logprob check
NotBioWaste Aug 21, 2024
761bd81
Async bugfix, added model_result_to_text, working on message_schema f…
NotBioWaste Aug 22, 2024
90a811e
Minor fixes, tinkering tests
NotBioWaste Aug 23, 2024
5bff191
Merge branch 'refs/heads/dev' into feat/llm_responses
RLKRo Aug 23, 2024
8b88ba6
update lock file
RLKRo Aug 23, 2024
20c4afd
Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
RLKRo Aug 23, 2024
0139421
Merge remote-tracking branch 'origin/master' into feat/llm_responses
NotBioWaste905 Sep 18, 2024
9bb0cba
Updating to v1.0
NotBioWaste905 Sep 23, 2024
f2d6b68
Finished tests, finished update
NotBioWaste905 Sep 26, 2024
6fddaea
lint
NotBioWaste905 Sep 26, 2024
e06bc2b
Started working on llm slots
NotBioWaste905 Sep 26, 2024
22d8efc
Resolving pydantic errors
NotBioWaste905 Sep 27, 2024
aa735b5
Delete llmslot_test.py
NotBioWaste905 Sep 27, 2024
cc91133
Finished LLMSlot, working on LLMGroupSlot
NotBioWaste905 Sep 27, 2024
8756838
Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
NotBioWaste905 Sep 27, 2024
f1857f6
Added flag to
NotBioWaste905 Oct 1, 2024
c334ff5
First test attempts
NotBioWaste905 Oct 1, 2024
8306bbb
linting
NotBioWaste905 Oct 1, 2024
f842776
Merge branch 'feat/slots_extraction_update' into feat/llm_responses
NotBioWaste905 Oct 1, 2024
ada17ca
Merge remote-tracking branch 'origin/feat/llm_responses' into feat/ll…
NotBioWaste905 Oct 1, 2024
a45f653
File structure fixed
NotBioWaste905 Oct 3, 2024
3838d30
Fixed naming
NotBioWaste905 Oct 3, 2024
0e650f8
Create LLMCondition and LLMResponse classes
NotBioWaste905 Oct 3, 2024
ca79f94
Merge branch 'dev' into merge/extended_conditions
NotBioWaste905 Oct 9, 2024
015cb4f
Debugging flattening
NotBioWaste905 Oct 23, 2024
b6e5eeb
Bugfix
NotBioWaste905 Oct 23, 2024
b20137e
Added return_type property for LLMSlot
NotBioWaste905 Oct 23, 2024
25f5b04
Changed return_type from Any to type
NotBioWaste905 Oct 23, 2024
b651087
lint
NotBioWaste905 Oct 23, 2024
284555d
Fixed dependency namings
NotBioWaste905 Oct 23, 2024
354b51d
Fixed singledispatch
NotBioWaste905 Oct 24, 2024
640aeb3
Removed Dataset and ExtrasBaseModel, created HasLabel and HasMatch co…
NotBioWaste905 Oct 28, 2024
ee7d5e2
Removed deprecated files
NotBioWaste905 Oct 28, 2024
492239d
Deleted synchronous variants, removed property models_labels from Con…
NotBioWaste905 Oct 28, 2024
474cd7f
Deleted unused modules, merged classes with their abstract variants
NotBioWaste905 Oct 30, 2024
1b5a77b
removed deprecated from_script from tutorials
NotBioWaste905 Nov 2, 2024
c18d375
Fixed LLMCondition class
NotBioWaste905 Nov 2, 2024
e884494
Removed inner functions, fixed signatures in conditions
NotBioWaste905 Nov 2, 2024
459f7fc
Fixed missing 'models' field in Pipeline, updated tutorials
NotBioWaste905 Nov 6, 2024
57a2d9d
Merge branch 'feat/llm_responses' into merge/extended_conditions
NotBioWaste905 Nov 6, 2024
96545dc
Revert "Merge branch 'feat/llm_responses' into merge/extended_conditi…
NotBioWaste905 Feb 19, 2025
29815b6
Moved models from remote_api directory
NotBioWaste905 Feb 19, 2025
074bd4e
Removed unused methods
NotBioWaste905 Feb 19, 2025
463a309
lint
NotBioWaste905 Feb 19, 2025
bb9aa58
Merge remote-tracking branch 'origin/dev' into merge/extended_conditions
NotBioWaste905 Feb 19, 2025
feab6c0
Refactor Rasa integration: update model classes and remove unused uti…
NotBioWaste905 Feb 20, 2025
a2c5290
Reworked a tutorial
NotBioWaste905 Feb 27, 2025
d7746e7
Now models are stored in pipeline
NotBioWaste905 Feb 27, 2025
b4a43a9
Working on models_labels field
NotBioWaste905 Feb 27, 2025
Files changed
4 changes: 3 additions & 1 deletion .env_file
@@ -17,10 +17,12 @@ MON_PORT=8765
YDB_ENDPOINT=grpc://localhost:2136
YDB_DATABASE=/local
YDB_ANONYMOUS_CREDENTIALS=1
RASA_API_KEY=rasa
PYTHONHASHSEED=42
CLICKHOUSE_DB=test
CLICKHOUSE_USER=username
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1
CLICKHOUSE_PASSWORD=pass
SUPERSET_USERNAME=superset
SUPERSET_PASSWORD=superset
SUPERSET_METADATA_PORT=5433
SUPERSET_METADATA_PORT=5433
12 changes: 12 additions & 0 deletions .github/workflows/test_coverage.yml
@@ -35,9 +35,21 @@ jobs:
python -m pip install --upgrade pip poetry==1.8.5
python -m poetry install --with test,tutorials --all-extras --no-ansi --no-interaction

- name: Create gdf_account.json
uses: jsdaniell/[email protected]
with:
name: "gdf_account.json"
json: ${{ secrets.GDF_ACCOUNT_JSON }}

- name: write realpath to env
run: |
echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV

- name: run tests
env:
TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
HF_API_KEY: ${{ secrets.HF_API_KEY }}
GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
run: |
python -m poetry run poe test_all
24 changes: 24 additions & 0 deletions .github/workflows/test_full.yml
@@ -37,10 +37,22 @@ jobs:
python -m pip install --upgrade pip poetry==1.8.5
python -m poetry install --with test,tutorials --all-extras --no-ansi --no-interaction

- name: Create gdf_account.json
uses: jsdaniell/[email protected]
with:
name: "gdf_account.json"
json: ${{ secrets.GDF_ACCOUNT_JSON }}

- name: write realpath to env
run: |
echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV

- name: run pytest
env:
TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
HF_API_KEY: ${{ secrets.HF_API_KEY }}
GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
run: |
python -m poetry run poe test_no_cov

Collaborator:
Leave new lines be, please.
@@ -59,9 +71,21 @@ jobs:
python -m pip install --upgrade pip poetry==1.8.5
python -m poetry install --with test --no-ansi --no-interaction

- name: Create gdf_account.json
uses: jsdaniell/[email protected]
with:
name: "gdf_account.json"
json: ${{ secrets.GDF_ACCOUNT_JSON }}

- name: write realpath to env
run: |
echo "GDF_ACCOUNT_JSON=$(realpath ./gdf_account.json)" >> $GITHUB_ENV

- name: run pytest
env:
TG_BOT_TOKEN: ${{ secrets.TG_BOT_TOKEN }}
TG_BOT_USERNAME: ${{ secrets.TG_BOT_USERNAME }}
HF_API_KEY: ${{ secrets.HF_API_KEY }}
GDF_ACCOUNT_JSON: ${{ env.GDF_ACCOUNT_JSON }}
run: |
python -m poetry run poe test_no_deps
11 changes: 9 additions & 2 deletions CONTRIBUTING.md
@@ -147,6 +147,7 @@ Tests are configured via [`.env_file`](.env_file).
Chatsky uses docker images for three purposes:
1. Database images for integration testing.
2. Images for statistics collection.
3. Setting up the Rasa framework for working with extended conditions.

The first group can be launched via

@@ -164,9 +165,15 @@ docker compose --profile stats up

This will download and launch Superset Dashboard, Clickhouse, OpenTelemetry Collector.

To launch both groups run
The third group can be launched via

```bash
docker compose --profile ext up
```

To launch all groups run
```bash
docker compose --profile context_storage --profile stats up
docker compose --profile context_storage --profile stats --profile ext up
```

This will be done automatically when running `poetry run poe test_all`.
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
![Chatsky](https://raw.githubusercontent.com/deeppavlov/chatsky/master/docs/source/_static/images/Chatsky-full-dark.svg)
![Chatsky](docs/source/_static/images/Chatsky-full-dark.svg)

[![Documentation Status](https://github.com/deeppavlov/chatsky/workflows/build_and_publish_docs/badge.svg?branch=dev)](https://deeppavlov.github.io/chatsky)
[![Codestyle](https://github.com/deeppavlov/chatsky/workflows/codestyle/badge.svg?branch=dev)](https://github.com/deeppavlov/chatsky/actions/workflows/codestyle.yml)
2 changes: 2 additions & 0 deletions chatsky/__rebuild_pydantic_models__.py
@@ -10,6 +10,8 @@
from chatsky.core.ctx_utils import ServiceState, FrameworkData, ContextMainInfo
from chatsky.core.service import PipelineComponent
from chatsky.llm import LLM_API
from chatsky.ml.models.base_model import ExtrasBaseAPIModel
from chatsky.ml.models.hf_api_model import HFAPIModel

ContextMainInfo.model_rebuild()
ContextDict.model_rebuild()
85 changes: 85 additions & 0 deletions chatsky/conditions/ml.py
@@ -0,0 +1,85 @@
"""
Conditions
------------

This module provides conditions that rely on labels predicted by annotation (label-scoring) models.
"""

from typing import Optional, List

try:
# !!! remove sklearn, use pure python instead
from sklearn.metrics.pairwise import cosine_similarity

sklearn_available = True
except ImportError:
sklearn_available = False
from chatsky import Context
from chatsky.conditions.standard import BaseCondition
from chatsky.ml.models.base_model import ExtrasBaseAPIModel


class HasLabel(BaseCondition):
"""
Use this condition when you need to check whether the probability
of a particular label for the last annotated user utterance surpasses the threshold.

:param label: Name of the label whose predicted score should be checked.
:param model_name: Name of the model in ``Pipeline.models`` that should predict the label.
:param threshold: The minimal label probability that triggers a positive response
from the function.
"""

label: str
model_name: str
threshold: float = 0.9

async def call(self, ctx: Context) -> bool:
model = ctx.pipeline.models[self.model_name]
# Predict labels for the last request
# and store them in framework_data with uuid of the model as a key
await model(ctx)
if model.model_id not in ctx.framework_data.models_labels:
return False
if model.model_id is not None:
return ctx.framework_data.models_labels.get(model.model_id, {}).get(self.label, 0) >= self.threshold
scores = [item.get(self.label, 0) for item in ctx.framework_data.models_labels.values()]
comparison_array = [item >= self.threshold for item in scores]
return any(comparison_array)


class HasMatch(BaseCondition):
"""
Use this condition if you need to check whether the last request matches
any of the pre-defined intent utterances.
The model referenced by ``model_name`` should already be fitted.

:param model_name: Name of a model in ``Pipeline.models`` that implements ``transform``.
:param positive_examples: Utterances that the request should match.
:param negative_examples: Utterances that the request should not match.
:param threshold: Similarity threshold that triggers a positive response from the function.
"""

model_name: str
positive_examples: Optional[List[str]]
negative_examples: Optional[List[str]] = []
threshold: float = 0.9

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

async def call(self, ctx: Context) -> bool:
if not (ctx.last_request and ctx.last_request.text):
return False

model = ctx.pipeline.models[self.model_name]

input_vector = model.transform(ctx.last_request.text)
positive_vectors = [model.transform(item) for item in self.positive_examples]
negative_vectors = [model.transform(item) for item in self.negative_examples]
positive_sims = [cosine_similarity(input_vector, item)[0][0] for item in positive_vectors]
negative_sims = [cosine_similarity(input_vector, item)[0][0] for item in negative_vectors]
max_pos_sim = max(positive_sims)
max_neg_sim = 0 if len(negative_sims) == 0 else max(negative_sims)
return bool(max_pos_sim > self.threshold > max_neg_sim)
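For orientation (not part of the diff), a minimal sketch of how these conditions could be wired into a script, assuming the Chatsky 1.x script API (`Transition`, `TRANSITIONS`, `RESPONSE`); the model name `"intents"`, the flow/node names, the label `"greet"`, and the credentials path are placeholders:

```python
from chatsky import Pipeline, Transition as Tr, TRANSITIONS, RESPONSE
from chatsky.conditions.ml import HasLabel
from chatsky.ml.models import GoogleDialogFlowModel

# Placeholder model: any ExtrasBaseAPIModel subclass registered in
# Pipeline.models can be referenced by name from HasLabel / HasMatch.
intent_model = GoogleDialogFlowModel.from_file("gdf_account.json", language="en")

script = {
    "flow": {
        "start": {
            RESPONSE: "Hi! How can I help?",
            TRANSITIONS: [
                # Fires when the model registered as "intents" scores the
                # "greet" label at or above 0.8 for the last request.
                Tr(dst="greeting", cnd=HasLabel(label="greet", model_name="intents", threshold=0.8)),
            ],
        },
        "greeting": {RESPONSE: "Hello there!"},
    }
}

pipeline = Pipeline(
    script=script,
    start_label=("flow", "start"),
    models={"intents": intent_model},  # exposed to conditions via ctx.pipeline.models
)
```

If `threshold` is omitted, the default of 0.9 from the class definition applies.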
5 changes: 5 additions & 0 deletions chatsky/core/ctx_utils.py
@@ -64,6 +64,11 @@ class FrameworkData(BaseModel, arbitrary_types_allowed=True):
"Enables complex stats collection across multiple turns."
slot_manager: SlotManager = Field(default_factory=SlotManager)
"Stores extracted slots."
models_labels: Dict[str, Dict[str, float]] = Field(default_factory=dict)
"""
Stores labels predicted by models.
The key is the model id, the value is a dictionary with labels and their probabilities.
"""


class ContextMainInfo(BaseModel):
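As a sketch of the layout this field introduces (the helper below is illustrative only; the `model_id` key comes from the model instance that wrote the scores):

```python
from typing import Optional

from chatsky import Context


def top_label(ctx: Context, model_id: str) -> Optional[str]:
    # models_labels maps model id -> {label: probability}, as documented above.
    scores = ctx.framework_data.models_labels.get(model_id, {})
    return max(scores, key=scores.get) if scores else None
```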
3 changes: 2 additions & 1 deletion chatsky/core/pipeline.py
@@ -32,6 +32,7 @@
from chatsky.core.script_parsing import JSONImporter, Path

if TYPE_CHECKING:
from chatsky.ml.models.base_model import ExtrasBaseAPIModel
from chatsky.llm.llm_api import LLM_API

logger = logging.getLogger(__name__)
@@ -82,7 +83,7 @@ class Pipeline(BaseModel, extra="forbid", arbitrary_types_allowed=True):
"""
Slots configuration.
"""
models: Dict[str, LLM_API] = Field(default_factory=dict)
models: Dict[str, Union[LLM_API, ExtrasBaseAPIModel]] = Field(default_factory=dict)
"""
LLM and label-scoring (ML annotation) models made available to conditions and custom functions via ``ctx.pipeline.models``.
"""
1 change: 1 addition & 0 deletions chatsky/ml/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
3 changes: 3 additions & 0 deletions chatsky/ml/models/__init__.py
@@ -0,0 +1,3 @@
from .google_dialogflow_model import GoogleDialogFlowModel # noqa: F401
from .rasa_model import RasaModel # noqa: F401
from .hf_api_model import HFAPIModel # noqa: F401
58 changes: 58 additions & 0 deletions chatsky/ml/models/base_model.py
@@ -0,0 +1,58 @@
"""
Base Model
-----------
This module defines an abstract interface for label-scoring models, :py:class:`~ExtrasBaseAPIModel`.
When defining custom label-scoring models, always inherit from this class.
"""

from copy import copy
from abc import ABC, abstractmethod

from chatsky import Context

import uuid


class ExtrasBaseAPIModel(ABC):
"""
Base class for label-scoring models running on remote server and accessed via API.
Predicted scores for labels are stored in :py:class:`~chatsky.script.Context.framework_data`.
"""

def __init__(self) -> None:
self.model_id = uuid.uuid4()

def __deepcopy__(self, *args, **kwargs):
return copy(self)

@abstractmethod
async def predict(self, request: str) -> dict:
"""
Predict the probability of one or several classes.

:param request: Target request string.
"""
raise NotImplementedError

async def transform(self, request: str):
"""
Get a numeric representation of the input data.

:param request: Target request string.
"""
raise NotImplementedError

async def __call__(self, ctx: Context):
"""
Predicts labels for the last request and saves them to
``ctx.framework_data.models_labels`` under this model's ``model_id``.
"""

if ctx.last_request and ctx.last_request.text:
labels: dict = await self.predict(ctx.last_request.text)
else:
labels = dict()

ctx.framework_data.models_labels[self.model_id] = labels

return ctx
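To illustrate the contract, here is a hypothetical minimal subclass (not part of this PR) that scores labels by keyword lookup; only `predict` has to be implemented, while `__call__` from the base class handles storing the result:

```python
from typing import Dict

from chatsky.ml.models.base_model import ExtrasBaseAPIModel


class KeywordModel(ExtrasBaseAPIModel):
    """Toy model: a label scores 1.0 whenever its keyword occurs in the request."""

    def __init__(self, keywords: Dict[str, str]) -> None:
        super().__init__()  # assigns the model_id used as the storage key
        self.keywords = keywords  # label -> keyword

    async def predict(self, request: str) -> dict:
        lowered = request.lower()
        return {label: 1.0 for label, word in self.keywords.items() if word in lowered}
```

Such an instance could then be registered in `Pipeline.models` and queried with `HasLabel`, just like the API-backed models in the following modules.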
84 changes: 84 additions & 0 deletions chatsky/ml/models/google_dialogflow_model.py
@@ -0,0 +1,84 @@
"""
Google Dialogflow Model
------------------------

The module allows you to use Google Dialogflow as a service
to gain insights about user intents.
"""

import uuid
import json
from pathlib import Path
from async_lru import alru_cache

from chatsky.ml.models.base_model import ExtrasBaseAPIModel

try:
from google.cloud import dialogflow_v2
from google.oauth2 import service_account

dialogflow_available = True
except ImportError:
dialogflow_v2 = None
service_account = None
dialogflow_available = False


class GoogleDialogFlowModel(ExtrasBaseAPIModel):
"""
This class implements an asynchronous connection to Google Dialogflow for dialog annotation.
Note, that before you use the class, you need to set up a Dialogflow project,
create intents, and train a language model, which can be easily done
using the Dialogflow web interface (see the official
`instructions <https://cloud.google.com/dialogflow/es/docs/quick/build-agent>`_).
After this is done, you should obtain a service account JSON file from Google
and pass it to this class, using :py:meth:`~from_file` method.

:param model: A parsed service account json for your dialogflow project.
Calling json.load() on the file obtained from Google is sufficient to get the
credentials object. Alternatively, use :py:meth:`~from_file` method.
:param language: Language parameter is passed to the Dialogflow wrapper.
"""

def __init__(
self,
model: dict,
*,
language: str = "en",
) -> None:
if not dialogflow_available:
raise ImportError("`google-cloud-dialogflow` package missing. Try `pip install chatsky[dialogflow]`.")
super().__init__()
self._language = language
if isinstance(model, dict):
info = model
else:
raise ValueError("Please, pass the service account credentials as dict.")

self._credentials = service_account.Credentials.from_service_account_info(info)

@classmethod
def from_file(cls, filename: str, language: str = "en"):
"""
:param filename: Path to the Dialogflow credentials saved as JSON.
:param language: The language parameter is forwarded to the underlying
Dialogflow wrapper.
"""
assert Path(filename).exists(), f"Path {filename} does not exist."
with open(filename, "r", encoding="utf-8") as file:
info = json.load(file)
return cls(model=info, language=language)

@alru_cache(maxsize=10)
async def predict(self, request: str) -> dict:
session_id = uuid.uuid4()
session_client = dialogflow_v2.SessionsAsyncClient(credentials=self._credentials)
session_path = session_client.session_path(self._credentials.project_id, session_id)
query_input = dialogflow_v2.QueryInput(text=dialogflow_v2.TextInput(text=request, language_code=self._language))
request = dialogflow_v2.DetectIntentRequest(session=session_path, query_input=query_input)
response = await session_client.detect_intent(request=request)
result: dialogflow_v2.QueryResult = response.query_result
if result.intent is not None:
return {result.intent.display_name: result.intent_detection_confidence}
return {}
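A hedged usage sketch, assuming a trained Dialogflow agent and a service-account file at the placeholder path `gdf_account.json`; the utterance and intent name are illustrative:

```python
import asyncio

from chatsky.ml.models import GoogleDialogFlowModel


async def main() -> None:
    # Requires the Dialogflow client library; see the ImportError message above.
    model = GoogleDialogFlowModel.from_file("gdf_account.json", language="en")
    scores = await model.predict("I'd like to order a pizza")
    print(scores)  # e.g. {"order.pizza": 0.93} -- top intent mapped to its confidence


if __name__ == "__main__":
    asyncio.run(main())
```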