From 2714e7c839137a9fc617270c086167d273a16f1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?=
Date: Fri, 9 Aug 2024 13:13:06 +0000
Subject: [PATCH] feat(extra): prompt tuning (#79)

---
 benchmarks/README.md                          |   4 +-
 extra/README.md                               |  13 +++
 extra/prompt_tuning/README.md                 |  42 ++++++++
 extra/prompt_tuning/config/config.yaml        |   7 ++
 .../prompt_tuning/config/data/superhero.yaml  |   4 +
 .../config/llm/claude-3-haiku.yaml            |   2 +
 .../config/llm/claude-3-opus.yaml             |   2 +
 .../config/llm/claude-3.5-sonnet.yaml         |   2 +
 .../config/llm/gpt-3.5-turbo.yaml             |   2 +
 .../prompt_tuning/config/llm/gpt-4-turbo.yaml |   2 +
 extra/prompt_tuning/config/llm/gpt-4o.yaml    |   2 +
 .../program/filtering-assessor-baseline.yaml  |   2 +
 .../program/filtering-assessor-cot.yaml       |   2 +
 extra/prompt_tuning/evaluate.py               | 101 ++++++++++++++++++
 extra/prompt_tuning/tuning/__init__.py        |   0
 extra/prompt_tuning/tuning/loaders.py         |  69 ++++++++++++
 .../prompt_tuning/tuning/metrics/__init__.py  |   3 +
 extra/prompt_tuning/tuning/metrics/iql.py     |  19 ++++
 .../prompt_tuning/tuning/programs/__init__.py |   8 ++
 extra/prompt_tuning/tuning/programs/iql.py    |  49 +++++++++
 .../tuning/signatures/__init__.py             |   3 +
 extra/prompt_tuning/tuning/signatures/iql.py  |  20 ++++
 extra/prompt_tuning/tuning/utils.py           |  43 ++++++++
 setup.cfg                                     |  16 +--
 24 files changed, 407 insertions(+), 10 deletions(-)
 create mode 100644 extra/README.md
 create mode 100644 extra/prompt_tuning/README.md
 create mode 100644 extra/prompt_tuning/config/config.yaml
 create mode 100644 extra/prompt_tuning/config/data/superhero.yaml
 create mode 100644 extra/prompt_tuning/config/llm/claude-3-haiku.yaml
 create mode 100644 extra/prompt_tuning/config/llm/claude-3-opus.yaml
 create mode 100644 extra/prompt_tuning/config/llm/claude-3.5-sonnet.yaml
 create mode 100644 extra/prompt_tuning/config/llm/gpt-3.5-turbo.yaml
 create mode 100644 extra/prompt_tuning/config/llm/gpt-4-turbo.yaml
 create mode 100644 extra/prompt_tuning/config/llm/gpt-4o.yaml
 create mode 100644 extra/prompt_tuning/config/program/filtering-assessor-baseline.yaml
 create mode 100644 extra/prompt_tuning/config/program/filtering-assessor-cot.yaml
 create mode 100644 extra/prompt_tuning/evaluate.py
 create mode 100644 extra/prompt_tuning/tuning/__init__.py
 create mode 100644 extra/prompt_tuning/tuning/loaders.py
 create mode 100644 extra/prompt_tuning/tuning/metrics/__init__.py
 create mode 100644 extra/prompt_tuning/tuning/metrics/iql.py
 create mode 100644 extra/prompt_tuning/tuning/programs/__init__.py
 create mode 100644 extra/prompt_tuning/tuning/programs/iql.py
 create mode 100644 extra/prompt_tuning/tuning/signatures/__init__.py
 create mode 100644 extra/prompt_tuning/tuning/signatures/iql.py
 create mode 100644 extra/prompt_tuning/tuning/utils.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0a549fb5..c4aa8fd1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -4,10 +4,10 @@ This folder contains scripts that produce reproducible timings and evaluation me
 
 ## Setup environment
 
-Before installing any package, make sure you have Python 3.8 or higher installed on your machine. From the root directory of the project, install the dependencies:
+Before installing any package, make sure you have Python 3.9 or higher installed on your machine. From the root directory of the project, install the dependencies:
 
 ```bash
-pip install -e '.[benchmarks]'
+pip install -e '.[dev]'
 ```
 
 ## Benchmark list
diff --git a/extra/README.md b/extra/README.md
new file mode 100644
index 00000000..b783af05
--- /dev/null
+++ b/extra/README.md
@@ -0,0 +1,13 @@
+# Extra
+
+This folder contains research scripts related to dbally. Links are provided where descriptions exist:
+
+- [`Prompt tuning`](prompt_tuning/README.md)
+
+## Setup environment
+
+Before installing any package, make sure you have Python 3.9 or higher installed on your machine. From the root directory of the project, install the dependencies:
+
+```bash
+pip install -e '.[dev]'
+```
diff --git a/extra/prompt_tuning/README.md b/extra/prompt_tuning/README.md
new file mode 100644
index 00000000..e5c86a5a
--- /dev/null
+++ b/extra/prompt_tuning/README.md
@@ -0,0 +1,42 @@
+# Prompt tuning
+
+This folder contains scripts for prompt tuning and evaluation. The following prompts (programs) used in dbally are covered:
+
+- `FILTERING_ASSESSOR` - assesses whether a question requires filtering.
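+
+  For example, "Which superheroes have blue eyes?" requires narrowing the result set down (decision: `True`), whereas "Show me all superheroes" does not (decision: `False`). These two questions are only illustrative; the evaluation questions come from the dataset described below.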
+
+All evaluations are run on the dev split of the [BIRD](https://bird-bench.github.io/) dataset. For now, one configuration is available to run the suite against the `superhero` database.
+
+## Usage
+
+Run an evaluation of the filtering assessor baseline on the `superhero` database with `gpt-3.5-turbo`:
+
+```bash
+python evaluate.py program=filtering-assessor-baseline
+```
+
+Test multiple programs:
+
+```bash
+python evaluate.py --multirun program=filtering-assessor-baseline,filtering-assessor-cot
+```
+
+Compare prompt performance on multiple LLMs:
+
+```bash
+python evaluate.py --multirun program=filtering-assessor-baseline llm=gpt-3.5-turbo,claude-3.5-sonnet
+```
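+
+Any field of the composed config can be overridden from the command line in the same way. For example, assuming the default `superhero` data config, to evaluate on the simple questions only:
+
+```bash
+python evaluate.py "data.difficulties=[simple]"
+```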
+
+### Log to Neptune
+
+Before running the evaluation with Neptune, configure the following environment variables:
+
+```bash
+export NEPTUNE_API_TOKEN="API_TOKEN"
+export NEPTUNE_PROJECT="WORKSPACE_NAME/PROJECT_NAME"
+```
+
+Export evaluation results to Neptune:
+
+```bash
+python evaluate.py neptune=True
+```
diff --git a/extra/prompt_tuning/config/config.yaml b/extra/prompt_tuning/config/config.yaml
new file mode 100644
index 00000000..9aed0232
--- /dev/null
+++ b/extra/prompt_tuning/config/config.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - data: superhero
+  - llm: gpt-3.5-turbo
+  - program: filtering-assessor-baseline
+  - _self_
+
+neptune: False
diff --git a/extra/prompt_tuning/config/data/superhero.yaml b/extra/prompt_tuning/config/data/superhero.yaml
new file mode 100644
index 00000000..23412721
--- /dev/null
+++ b/extra/prompt_tuning/config/data/superhero.yaml
@@ -0,0 +1,4 @@
+path: "micpst/bird-iql"
+split: "dev"
+db_ids: ["superhero"]
+difficulties: ["simple", "moderate", "challenging"]
diff --git a/extra/prompt_tuning/config/llm/claude-3-haiku.yaml b/extra/prompt_tuning/config/llm/claude-3-haiku.yaml
new file mode 100644
index 00000000..96ce6ae4
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/claude-3-haiku.yaml
@@ -0,0 +1,2 @@
+model_name: claude-3-haiku-20240307
+provider: Claude
diff --git a/extra/prompt_tuning/config/llm/claude-3-opus.yaml b/extra/prompt_tuning/config/llm/claude-3-opus.yaml
new file mode 100644
index 00000000..466c91cb
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/claude-3-opus.yaml
@@ -0,0 +1,2 @@
+model_name: claude-3-opus-20240229
+provider: Claude
diff --git a/extra/prompt_tuning/config/llm/claude-3.5-sonnet.yaml b/extra/prompt_tuning/config/llm/claude-3.5-sonnet.yaml
new file mode 100644
index 00000000..9b1ebec3
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/claude-3.5-sonnet.yaml
@@ -0,0 +1,2 @@
+model_name: claude-3-5-sonnet-20240620
+provider: Claude
diff --git a/extra/prompt_tuning/config/llm/gpt-3.5-turbo.yaml b/extra/prompt_tuning/config/llm/gpt-3.5-turbo.yaml
new file mode 100644
index 00000000..da52dc69
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/gpt-3.5-turbo.yaml
@@ -0,0 +1,2 @@
+model_name: gpt-3.5-turbo
+provider: OpenAI
diff --git a/extra/prompt_tuning/config/llm/gpt-4-turbo.yaml b/extra/prompt_tuning/config/llm/gpt-4-turbo.yaml
new file mode 100644
index 00000000..458b37b4
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/gpt-4-turbo.yaml
@@ -0,0 +1,2 @@
+model_name: gpt-4-turbo
+provider: OpenAI
diff --git a/extra/prompt_tuning/config/llm/gpt-4o.yaml b/extra/prompt_tuning/config/llm/gpt-4o.yaml
new file mode 100644
index 00000000..ed176a9a
--- /dev/null
+++ b/extra/prompt_tuning/config/llm/gpt-4o.yaml
@@ -0,0 +1,2 @@
+model_name: gpt-4o
+provider: OpenAI
diff --git a/extra/prompt_tuning/config/program/filtering-assessor-baseline.yaml b/extra/prompt_tuning/config/program/filtering-assessor-baseline.yaml
new file mode 100644
index 00000000..e0e6855d
--- /dev/null
+++ b/extra/prompt_tuning/config/program/filtering-assessor-baseline.yaml
@@ -0,0 +1,2 @@
+type: FILTERING_ASSESSOR
+name: FilteringAssessorBaseline
diff --git a/extra/prompt_tuning/config/program/filtering-assessor-cot.yaml b/extra/prompt_tuning/config/program/filtering-assessor-cot.yaml
new file mode 100644
index 00000000..bd7f8850
--- /dev/null
+++ b/extra/prompt_tuning/config/program/filtering-assessor-cot.yaml
@@ -0,0 +1,2 @@
+type: FILTERING_ASSESSOR
+name: FilteringAssessorCoT
diff --git a/extra/prompt_tuning/evaluate.py b/extra/prompt_tuning/evaluate.py
new file mode 100644
index 00000000..35bcf2e8
--- /dev/null
+++ b/extra/prompt_tuning/evaluate.py
@@ -0,0 +1,101 @@
+import asyncio
+import logging
+from enum import Enum
+from pathlib import Path
+
+import dspy
+import hydra
+import neptune
+from dspy.evaluate import Evaluate
+from neptune.utils import stringify_unsupported
+from omegaconf import DictConfig
+from tuning.loaders import IQLGenerationDataLoader
+from tuning.metrics import filtering_assess_acc
+from tuning.programs import PROGRAMS
+from tuning.utils import save, serialize_results
+
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("anthropic").setLevel(logging.ERROR)
+log = logging.getLogger(__name__)
+
+
+class EvaluationType(Enum):
+    """
+    Enum representing the evaluation type.
+    """
+
+    FILTERING_ASSESSOR = "FILTERING_ASSESSOR"
+
+
+EVALUATION_DATALOADERS = {
+    EvaluationType.FILTERING_ASSESSOR.value: IQLGenerationDataLoader,
+}
+
+EVALUATION_METRICS = {
+    EvaluationType.FILTERING_ASSESSOR.value: filtering_assess_acc,
+}
+
+
+async def evaluate(config: DictConfig) -> None:
+    """
+    Runs the evaluation for all datasets and evaluation tasks defined in the Hydra config.
+
+    Args:
+        config: Hydra configuration.
+    """
+    log.info("Starting evaluation: %s", config.program.name)
+
+    dataloader = EVALUATION_DATALOADERS[config.program.type](config)
+    metric = EVALUATION_METRICS[config.program.type]
+    program = PROGRAMS[config.program.name]()
+
+    dataset = await dataloader.load()
+
+    lm = getattr(dspy, config.llm.provider)(model=config.llm.model_name)
+    dspy.settings.configure(lm=lm)
+
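+    # With return_outputs=True, the evaluator returns both the aggregate metric
+    # and the per-sample (example, prediction, score) triples that
+    # serialize_results() below writes to JSON.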
+    evaluator = Evaluate(
+        devset=dataset,
+        metric=metric,
+        num_threads=32,
+        display_progress=True,
+        return_outputs=True,
+    )
+    acc, results = evaluator(program)
+
+    log.info("Evaluation finished. Saving results...")
+
+    output_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
+    results_file = output_dir / "results.json"
+    save(results_file, results=serialize_results(results))
+
+    log.info("Evaluation results saved under directory: %s", output_dir)
+
+    if config.neptune:
+        run = neptune.init_run()
+        run["sys/tags"].add(
+            [
+                config.program.type,
+                config.program.name,
+                *config.data.db_ids,
+                *config.data.difficulties,
+            ]
+        )
+        run["config"] = stringify_unsupported(config)
+        run["evaluation/metrics/ACC"] = stringify_unsupported(acc)
+        run["evaluation/results.json"].upload(results_file.as_posix())
+
+
+@hydra.main(config_path="config", config_name="config", version_base="3.2")
+def main(config: DictConfig) -> None:
+    """
+    Entry point: runs the evaluation defined in the Hydra config.
+
+    Args:
+        config: Hydra configuration.
+    """
+    asyncio.run(evaluate(config))
+
+
+if __name__ == "__main__":
+    main()  # pylint: disable=no-value-for-parameter
diff --git a/extra/prompt_tuning/tuning/__init__.py b/extra/prompt_tuning/tuning/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/extra/prompt_tuning/tuning/loaders.py b/extra/prompt_tuning/tuning/loaders.py
new file mode 100644
index 00000000..2cc7dc96
--- /dev/null
+++ b/extra/prompt_tuning/tuning/loaders.py
@@ -0,0 +1,69 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Iterable, List
+
+import dspy.datasets
+from dspy import Example
+
+
+class DataLoader(ABC):
+    """
+    Base class for data loaders.
+    """
+
+    def __init__(self, config: Dict) -> None:
+        self.config = config
+
+    @abstractmethod
+    async def load(self) -> Iterable:
+        """
+        Load the data.
+
+        Returns:
+            The loaded data.
+        """
+
+
+class HuggingFaceDataLoader(DataLoader):
+    """
+    Hugging Face data loader.
+    """
+
+    async def load(self) -> List[Example]:
+        """
+        Load the data from Hugging Face, keeping only samples that have a question
+        and match the configured database ids and difficulty levels (when set).
+
+        Returns:
+            The loaded data.
+        """
+        dataloader = dspy.datasets.DataLoader()
+        dataset = dataloader.from_huggingface(
+            dataset_name=self.config.data.path, split=self.config.data.split, input_keys=("question",)
+        )
+        # Each filter applies only when the corresponding config list is non-empty.
+        return [
+            data
+            for data in dataset
+            if data["question"]
+            and (not self.config.data.db_ids or data["db_id"] in self.config.data.db_ids)
+            and (not self.config.data.difficulties or data["difficulty"] in self.config.data.difficulties)
+        ]
+
+
+class IQLGenerationDataLoader(HuggingFaceDataLoader):
+    """
+    Data loader for IQL generation evaluation.
+    """
+
+    async def load(self) -> List[Example]:
+        """
+        Load the data from Hugging Face and filter out samples without views.
+
+        Returns:
+            The loaded data.
+        """
+        dataset = await super().load()
+        return [data for data in dataset if data["view_name"]]
diff --git a/extra/prompt_tuning/tuning/metrics/__init__.py b/extra/prompt_tuning/tuning/metrics/__init__.py
new file mode 100644
index 00000000..56615738
--- /dev/null
+++ b/extra/prompt_tuning/tuning/metrics/__init__.py
@@ -0,0 +1,3 @@
+from .iql import filtering_assess_acc
+
+__all__ = ["filtering_assess_acc"]
diff --git a/extra/prompt_tuning/tuning/metrics/iql.py b/extra/prompt_tuning/tuning/metrics/iql.py
new file mode 100644
index 00000000..d47b0689
--- /dev/null
+++ b/extra/prompt_tuning/tuning/metrics/iql.py
@@ -0,0 +1,19 @@
+from typing import Dict
+
+from dspy import Prediction
+
+
+def filtering_assess_acc(gold: Dict, pred: Prediction) -> bool:
+    """
+    IQL decision metric: checks the predicted filtering decision against the ground truth.
+
+    Args:
+        gold: The ground truth data point.
+        pred: The prediction.
+
+    Returns:
+        True if the predicted decision matches the ground truth, False otherwise.
+    """
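+    # Filtering is required when the gold example has IQL filters, or marks them
+    # as unsupported; the metric is True when the predicted decision agrees with
+    # the gold one.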
+    return ((gold["iql_filters"] is None and not gold["iql_filters_unsupported"]) and not pred.decision) or (
+        (gold["iql_filters"] is not None or gold["iql_filters_unsupported"]) and pred.decision
+    )
diff --git a/extra/prompt_tuning/tuning/programs/__init__.py b/extra/prompt_tuning/tuning/programs/__init__.py
new file mode 100644
index 00000000..1961d77d
--- /dev/null
+++ b/extra/prompt_tuning/tuning/programs/__init__.py
@@ -0,0 +1,8 @@
+from .iql import FilteringAssessorBaseline, FilteringAssessorCoT
+
+PROGRAMS = {
+    FilteringAssessorBaseline.__name__: FilteringAssessorBaseline,
+    FilteringAssessorCoT.__name__: FilteringAssessorCoT,
+}
+
+__all__ = ["PROGRAMS", "FilteringAssessorBaseline", "FilteringAssessorCoT"]
diff --git a/extra/prompt_tuning/tuning/programs/iql.py b/extra/prompt_tuning/tuning/programs/iql.py
new file mode 100644
index 00000000..2a20da47
--- /dev/null
+++ b/extra/prompt_tuning/tuning/programs/iql.py
@@ -0,0 +1,49 @@
+from dspy import ChainOfThought, Module, Predict, Prediction
+
+from ..signatures.iql import CheckQuestionFiltering
+
+
+class FilteringAssessorBaseline(Module):
+    """
+    Program that assesses whether a question requires filtering, using a plain prediction.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.decide = Predict(CheckQuestionFiltering)
+
+    def forward(self, question: str) -> Prediction:
+        """
+        Assess whether a question requires filtering.
+
+        Args:
+            question: The question to assess.
+
+        Returns:
+            The prediction.
+        """
+        # The decision field is free text; only a case-insensitive "true" counts as positive.
+        decision = self.decide(question=question).decision
+        return Prediction(decision=decision.lower() == "true")
+
+
+class FilteringAssessorCoT(Module):
+    """
+    Program that assesses whether a question requires filtering, using chain-of-thought prompting.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.decide = ChainOfThought(CheckQuestionFiltering)
+
+    def forward(self, question: str) -> Prediction:
+        """
+        Assess whether a question requires filtering.
+
+        Args:
+            question: The question to assess.
+
+        Returns:
+            The prediction.
+        """
+        decision = self.decide(question=question).decision
+        return Prediction(decision=decision.lower() == "true")
diff --git a/extra/prompt_tuning/tuning/signatures/__init__.py b/extra/prompt_tuning/tuning/signatures/__init__.py
new file mode 100644
index 00000000..dd3be583
--- /dev/null
+++ b/extra/prompt_tuning/tuning/signatures/__init__.py
@@ -0,0 +1,3 @@
+from .iql import CheckQuestionFiltering
+
+__all__ = ["CheckQuestionFiltering"]
diff --git a/extra/prompt_tuning/tuning/signatures/iql.py b/extra/prompt_tuning/tuning/signatures/iql.py
new file mode 100644
index 00000000..273edf60
--- /dev/null
+++ b/extra/prompt_tuning/tuning/signatures/iql.py
@@ -0,0 +1,20 @@
+from dspy import InputField, OutputField, Signature
+
+
+class CheckQuestionFiltering(Signature):
+    """
+    Given a question, determine whether the answer requires initial data filtering in order to compute it.
+    Initial data filtering is a process in which the result set is reduced to only include the rows that
+    meet certain criteria specified in the question.
+    """
+
+    question = InputField(
+        prefix="Question: ",
+    )
+    decision = OutputField(
+        prefix="Decision: ",
+        desc=(
+            "indicates whether the answer to the question requires initial data filtering. "
+            "(Respond with True or False)"
+        ),
+    )
diff --git a/extra/prompt_tuning/tuning/utils.py b/extra/prompt_tuning/tuning/utils.py
new file mode 100644
index 00000000..94f06122
--- /dev/null
+++ b/extra/prompt_tuning/tuning/utils.py
@@ -0,0 +1,43 @@
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Any, List, Tuple
+
+
+def serialize_results(results: List[Tuple]) -> Any:
+    """
+    Serialize the results to a JSON-serializable format.
+
+    Args:
+        results: The results to be serialized.
+
+    Returns:
+        The serialized results.
+    """
+    return [
+        {
+            "question": example["question"],
+            # score is True when the prediction matches the gold label, so
+            # score == prediction.decision reconstructs the gold decision.
+            "reference": score == prediction.decision,
+            "prediction": prediction.decision,
+        }
+        for (example, prediction, score) in results
+    ]
+
+
+def save(file_path: Path, **data: Any) -> None:
+    """
+    Save the data to a file. Adds the current timestamp, Python version, and
+    interpreter path to the data.
+
+    Args:
+        file_path: The path to the file.
+        data: The data to be saved.
+    """
+    current_time = datetime.now()
+
+    data["_timestamp"] = current_time.isoformat()
+    data["_python_version"] = sys.version
+    data["_interpreter_path"] = sys.executable
+
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file, indent=4)
diff --git a/setup.cfg b/setup.cfg
index 8ea3dff2..b6552e0e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,14 +51,6 @@ chromadb =
     tenacity~=8.3.0
 langsmith=
     langsmith~=0.1.57
-examples =
-    pydantic~=2.6.0
-    pydantic_settings~=2.1.0
-    psycopg2-binary~=2.9.9
-benchmarks =
-    datasets~=2.20.0
-    hydra-core~=1.3.2
-    neptune~=1.6.3
 elasticsearch =
     elasticsearch~=8.13.1
 gradio =
@@ -68,6 +60,14 @@ local =
     accelerate~=0.31.0
     torch~=2.2.1
    transformers~=4.41.2
+dev =
+    datasets~=2.20.0
+    dspy-ai~=2.4.13
+    hydra-core~=1.3.2
+    neptune~=1.6.3
+    pydantic~=2.6.0
+    pydantic_settings~=2.1.0
+    psycopg2-binary~=2.9.9
 
 [options.packages.find]
 where = src