feat: simplification of Document Search Evaluation interface (#258)
Co-authored-by: Michał Pstrąg <[email protected]>
kdziedzic68 and micpst authored Jan 17, 2025
1 parent 0fa1308 commit f62a084
Showing 66 changed files with 900 additions and 769 deletions.
2 changes: 1 addition & 1 deletion docs/how-to/evaluate/custom_dataloader.md
@@ -1,6 +1,6 @@
# How to create custom DataLoader for Ragbits evaluation

-Ragbits provides a base interface for data loading, `ragbits.evaluate.loaders.base.DataLoader`, designed specifically for evaluation purposes. A ready-to-use implementation, `ragbits.evaluate.loaders.hf.HFLoader`, is available for handling datasets in huggingface format.
+Ragbits provides a base interface for data loading, `ragbits.evaluate.dataloaders.base.DataLoader`, designed specifically for evaluation purposes. A ready-to-use implementation, `ragbits.evaluate.dataloaders.hf.HFLoader`, is available for handling datasets in huggingface format.

To create a custom DataLoader for your specific needs, you need to implement the `load` method in a class that inherits from the `DataLoader` interface.

4 changes: 2 additions & 2 deletions docs/how-to/evaluate/optimize.md
@@ -5,7 +5,7 @@ Ragbits provides a feature that allows users to automatically configure hyperparameters
- The optimized pipeline must inherit from `ragbits.evaluate.pipelines.base.EvaluationPipeline`.
- The definition of optimized metrics must adhere to the `ragbits.evaluate.metrics.base.Metric` interface.
- These metrics should be gathered into an instance of `ragbits.evaluate.metrics.base.MetricSet`.
-- An instance of a class inheriting from `ragbits.evaluate.metrics.loader.base.DataLoader` must be provided as the data source for optimization.
+- An instance of a class inheriting from `ragbits.evaluate.dataloaders.base.DataLoader` must be provided as the data source for optimization.

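The pieces in the list above fit together roughly as follows. The `Metric` and `MetricSet` classes below are minimal stand-ins for the named ragbits interfaces (so the example runs anywhere), and `HitRate` is a hypothetical metric, not part of ragbits:

```python
from abc import ABC, abstractmethod


# Minimal stand-in for ragbits.evaluate.metrics.base.Metric.
class Metric(ABC):
    @abstractmethod
    def compute(self, results: list[dict]) -> dict[str, float]: ...


# Minimal stand-in for ragbits.evaluate.metrics.base.MetricSet:
# gathers metrics and merges their computed values into one report.
class MetricSet:
    def __init__(self, *metrics: Metric) -> None:
        self.metrics = metrics

    def compute(self, results: list[dict]) -> dict[str, float]:
        merged: dict[str, float] = {}
        for metric in self.metrics:
            merged.update(metric.compute(results))
        return merged


# Hypothetical retrieval metric: fraction of queries whose expected
# passage appears among the retrieved ones.
class HitRate(Metric):
    def compute(self, results: list[dict]) -> dict[str, float]:
        hits = sum(1 for r in results if r["expected"] in r["retrieved"])
        return {"hit_rate": hits / len(results)}
```

The real ragbits interfaces carry more structure (typed results, async execution); this only sketches how metrics are gathered into a set.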
## Supported Parameter Types

@@ -69,7 +69,7 @@ Next, we define the data loader. We'll use Ragbits generation stack to create an


```python
-from ragbits.evaluate.loaders.base import DataLoader, DataT
+from ragbits.evaluate.dataloaders.base import DataLoader, DataT
from ragbits.core.llms.litellm import LiteLLM
from ragbits.core.prompt import Prompt
from pydantic import BaseModel
@@ -1,35 +1,39 @@
# Document Search Evaluation

-## Ingest
+## Evaluation
+
+### Evaluation on ingested data

```sh
-uv run ingest.py
+uv run evaluate.py
```

```sh
-uv run ingest.py +experiments=chunking-250
+uv run evaluate.py +experiments=chunking-250
```

```sh
-uv run ingest.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
+uv run evaluate.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
```

-## Evaluate
+### Logging

```sh
-uv run evaluate.py
+uv run evaluate.py logger.local=True
```

```sh
-uv run evaluate.py +experiments=chunking-250
+uv run evaluate.py logger.neptune=True
```

+## Optimization

```sh
-uv run evaluate.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
+uv run optimize.py
```

-### Log to Neptune
+### Monitoring

```sh
-uv run evaluate.py neptune.run=True
+uv run optimize.py neptune_callback=True
```
@@ -0,0 +1,4 @@
type: ragbits.evaluate.dataloaders.hf:HFDataLoader
config:
path: "micpst/hf-docs-retrieval"
split: "train"
@@ -0,0 +1,21 @@
# @package _global_

task:
name: chunking-1000

pipeline:
config:
providers:
txt:
config:
chunking_kwargs:
max_characters: 1000
new_after_n_chars: 200
md:
config:
chunking_kwargs:
max_characters: 1000
new_after_n_chars: 200
vector_store:
config:
index_name: chunk-1000
@@ -0,0 +1,21 @@
# @package _global_

task:
name: chunking-250

pipeline:
config:
providers:
txt:
config:
chunking_kwargs:
max_characters: 250
new_after_n_chars: 50
md:
config:
chunking_kwargs:
max_characters: 250
new_after_n_chars: 50
vector_store:
config:
index_name: chunk-250
@@ -0,0 +1,21 @@
# @package _global_

task:
name: chunking-500

pipeline:
config:
providers:
txt:
config:
chunking_kwargs:
max_characters: 500
new_after_n_chars: 100
md:
config:
chunking_kwargs:
max_characters: 500
new_after_n_chars: 100
vector_store:
config:
index_name: chunk-500
@@ -0,0 +1,7 @@
precision_recall_f1:
type: ragbits.evaluate.metrics.document_search:DocumentSearchPrecisionRecallF1
config:
matching_strategy:
type: RougeChunkMatch
config:
threshold: 0.5
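The `matching_strategy` decides when a retrieved chunk counts as a hit against a reference chunk. A self-contained toy matcher in the same spirit — unigram-overlap F1 compared against a threshold — is sketched below; it is an illustration only, not ragbits' actual `RougeChunkMatch` implementation:

```python
# Toy threshold matcher: a retrieved chunk "matches" a reference chunk
# when their unigram-overlap F1 (a rough stand-in for a ROUGE-1 score)
# clears the configured threshold. NOT ragbits' RougeChunkMatch.
def unigram_f1(candidate: str, reference: str) -> float:
    cand, ref = set(candidate.lower().split()), set(reference.lower().split())
    if not cand or not ref:
        return 0.0
    common = len(cand & ref)
    if common == 0:
        return 0.0
    precision, recall = common / len(cand), common / len(ref)
    return 2 * precision * recall / (precision + recall)


def chunks_match(candidate: str, reference: str, threshold: float = 0.5) -> bool:
    return unigram_f1(candidate, reference) >= threshold
```

Raising `threshold` makes matching stricter, which trades recall for precision in the resulting metric.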
@@ -0,0 +1,7 @@
ranked_retrieval:
type: ragbits.evaluate.metrics.document_search:DocumentSearchRankedRetrievalMetrics
config:
matching_strategy:
type: RougeChunkMatch
config:
threshold: 0.5
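Ranked-retrieval metrics additionally weight hits by their position in the result list. A minimal, self-contained sketch of one such metric, mean reciprocal rank, is shown below; it is illustrative only, not ragbits' `DocumentSearchRankedRetrievalMetrics`:

```python
# Mean Reciprocal Rank over a batch of queries: for each query, take the
# reciprocal of the 1-based rank of the first relevant chunk (0 if it is
# never retrieved), then average across queries.
def mean_reciprocal_rank(results: list[dict]) -> float:
    total = 0.0
    for r in results:
        for rank, chunk in enumerate(r["retrieved"], start=1):
            if chunk == r["expected"]:
                total += 1.0 / rank
                break
    return total / len(results)
```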
@@ -0,0 +1,14 @@
defaults:
- [email protected]: hf
- [email protected]: document_search_optimization
- [email protected]:
- precision_recall_f1
- ranked_retrieval
- _self_

optimizer:
direction: maximize
n_trials: 5
max_retries_for_trial: 1

neptune_callback: False
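The `direction` and `n_trials` fields control the search loop: run a fixed number of trials and keep the best-scoring configuration. A self-contained toy sketch of those semantics follows; `run_trials` is hypothetical, it uses a deterministic grid sweep in place of real trial sampling, and `max_retries_for_trial` is not modeled:

```python
# Toy sketch of the optimizer knobs above: run `n_trials` trials over a
# search space and keep the best score according to `direction`.
# Mirrors the config semantics only; NOT ragbits' Optimizer.
def run_trials(objective, search_space: list, n_trials: int, direction: str = "maximize"):
    better = (lambda a, b: a > b) if direction == "maximize" else (lambda a, b: a < b)
    best_params, best_score = None, None
    for trial in range(n_trials):
        params = search_space[trial % len(search_space)]  # simple grid sweep
        score = objective(params)
        if best_score is None or better(score, best_score):
            best_params, best_score = params, score
    return best_params, best_score
```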
@@ -0,0 +1,10 @@
defaults:
- [email protected]: litellm
- [email protected]_store: chroma
- [email protected]: noop
- [email protected]: noop
- [email protected]: unstructured
- [email protected]: hf
- _self_

type: ragbits.evaluate.pipelines.document_search:DocumentSearchPipeline
@@ -0,0 +1,10 @@
defaults:
- [email protected]: litellm_optimization
- [email protected]_store: chroma
- [email protected]: noop
- [email protected]: noop
- [email protected]: unstructured_optimization
- [email protected]: hf
- _self_

type: ragbits.evaluate.pipelines.document_search:DocumentSearchPipeline
@@ -1,6 +1,6 @@
type: ragbits.core.embeddings.litellm:LiteLLMEmbeddings
config:
model: "text-embedding-3-small"
-options:
+default_options:
dimensions: 768
encoding_format: float
@@ -3,15 +3,15 @@ config:
optimize: true
choices:
- model: "text-embedding-3-small"
-options:
+default_options:
dimensions:
optimize: true
range:
- 32
- 512
encoding_format: float
- model: "text-embedding-3-large"
-options:
+default_options:
dimensions:
optimize: true
range:
@@ -0,0 +1,4 @@
type: ragbits.document_search.documents.sources:HuggingFaceSource
config:
path: "micpst/hf-docs"
split: "train[:5]"
@@ -1,10 +1,8 @@
type: ragbits.core.vector_stores.chroma:ChromaVectorStore
config:
client:
-type: PersistentClient
-config:
-path: chroma
-index_name: default
+type: EphemeralClient
+index_name: baseline
distance_method: l2
default_options:
k: 3
11 changes: 11 additions & 0 deletions examples/evaluation/document-search/advanced/config/retrieval.yaml
@@ -0,0 +1,11 @@
defaults:
- dataloader: hf
- pipeline: document_search
- metrics:
- precision_recall_f1
- ranked_retrieval
- _self_

logger:
local: True
neptune: False
56 changes: 56 additions & 0 deletions examples/evaluation/document-search/advanced/evaluate.py
@@ -0,0 +1,56 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-core[chroma]",
# "ragbits-document-search[huggingface]",
# "ragbits-evaluate[relari]",
# ]
# ///
import asyncio
import logging
from typing import cast

import hydra
from omegaconf import DictConfig, OmegaConf

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.utils import log_evaluation_to_file, log_evaluation_to_neptune

logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)


async def evaluate(config: DictConfig) -> None:
"""
Document search evaluation runner.
Args:
config: Hydra configuration.
"""
print("Starting evaluation...")

evaluator_config = cast(dict, OmegaConf.to_container(config))
results = await Evaluator.run_from_config(evaluator_config)

if config.logger.local:
output_dir = log_evaluation_to_file(results)
print(f"Evaluation results saved under directory: {output_dir}")

if config.logger.neptune:
log_evaluation_to_neptune(results, config)
print("Evaluation results uploaded to Neptune")


@hydra.main(config_path="config", config_name="retrieval", version_base="3.2")
def main(config: DictConfig) -> None:
"""
Runs the evaluation process.
Args:
config: Hydra configuration.
"""
asyncio.run(evaluate(config))


if __name__ == "__main__":
main()
40 changes: 40 additions & 0 deletions examples/evaluation/document-search/advanced/optimize.py
@@ -0,0 +1,40 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-core[chroma]",
# "ragbits-document-search[huggingface]",
# "ragbits-evaluate[relari]",
# ]
# ///
import logging
from typing import cast

import hydra
from omegaconf import DictConfig, OmegaConf

from ragbits.evaluate.optimizer import Optimizer
from ragbits.evaluate.utils import log_optimization_to_file

logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)


@hydra.main(config_path="config", config_name="optimization", version_base="3.2")
def main(config: DictConfig) -> None:
"""
Runs the optimization process.
Args:
config: Hydra configuration.
"""
print("Starting optimization...")

optimizer_config = cast(dict, OmegaConf.to_container(config))
configs_with_scores = Optimizer.run_from_config(optimizer_config)

output_dir = log_optimization_to_file(configs_with_scores)
print(f"Optimization results saved under directory: {output_dir}")


if __name__ == "__main__":
main()