diff --git a/packages/ragbits-evaluate/examples/document-search/config/config.yaml b/packages/ragbits-evaluate/examples/document-search/config/config.yaml
index 505fd2327..04eb09201 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/config.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/config.yaml
@@ -3,6 +3,9 @@ defaults:
   - setup: baseline
   - _self_
 
+task:
+  name: document-search
+
 neptune:
   project: ragbits
   run: False
diff --git a/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml b/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
index 4763cdbba..27ce786f5 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/data/hf-docs.yaml
@@ -1,2 +1,6 @@
-path: "m-ric/huggingface_doc_qa_eval"
-split: "train"
+ingest:
+  path: "micpst/hf-docs"
+  split: "train"
+eval:
+  path: "micpst/hf-docs-retrieval"
+  split: "train"
diff --git a/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml b/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
index 2dd3c3398..f6caad81d 100644
--- a/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
+++ b/packages/ragbits-evaluate/examples/document-search/config/setup/baseline.yaml
@@ -1 +1 @@
-name: BASELINE
+name: Baseline
diff --git a/packages/ragbits-evaluate/examples/document-search/evaluate.py b/packages/ragbits-evaluate/examples/document-search/evaluate.py
index 620a212b3..d805cbcbc 100644
--- a/packages/ragbits-evaluate/examples/document-search/evaluate.py
+++ b/packages/ragbits-evaluate/examples/document-search/evaluate.py
@@ -3,16 +3,15 @@
 from pathlib import Path
 
 import hydra
-import neptune
 from hydra.core.hydra_config import HydraConfig
-from neptune.utils import stringify_unsupported
 from omegaconf import DictConfig
 
 from ragbits.evaluate.evaluator import Evaluator
 from ragbits.evaluate.loaders import HuggingFaceDataLoader
-from ragbits.evaluate.metrics import MetricSet
-from ragbits.evaluate.pipelines import DocumentSearchEvaluationPipeline
-from ragbits.evaluate.utils import save
+from ragbits.evaluate.metrics import DocumentSearchPrecision, DocumentSearchRecall, MetricSet
+from ragbits.evaluate.metrics.document_search import DocumentSearchF1
+from ragbits.evaluate.pipelines import DocumentSearchPipeline
+from ragbits.evaluate.utils import log_to_file, log_to_neptune
 
 logging.getLogger("LiteLLM").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)
@@ -29,10 +28,14 @@ async def bench(config: DictConfig) -> None:
     log.info("Starting evaluation: %s", config.setup.name)
 
     dataloader = HuggingFaceDataLoader(config)
-    pipeline = DocumentSearchEvaluationPipeline(config)
-    metrics = MetricSet()(config)
-
-    evaluator = Evaluator(task="document_search")
+    pipeline = DocumentSearchPipeline(config)
+    metrics = MetricSet(
+        DocumentSearchPrecision,
+        DocumentSearchRecall,
+        DocumentSearchF1,
+    )(config)
+
+    evaluator = Evaluator()
     results = await evaluator.compute(
         pipeline=pipeline,
         dataloader=dataloader,
@@ -42,23 +45,13 @@ async def bench(config: DictConfig) -> None:
     log.info("Evaluation finished. Saving results...")
 
     output_dir = Path(HydraConfig.get().runtime.output_dir)
-    metrics_file = output_dir / "metrics.json"
-    results_file = output_dir / "results.json"
+    log_to_file(results, output_dir)
 
-    save(metrics_file, metrics=results["metrics"], time_perf=results["time_perf"])
-    save(results_file, results=results["results"])
+    if config.neptune.run:
+        log_to_neptune(config, results, output_dir)
 
     log.info("Evaluation results saved under directory: %s", output_dir)
 
-    if config.neptune.run:
-        run = neptune.init_run(project=config.neptune.project)
-        run["sys/tags"].add(config.setup.name)
-        run["config"] = stringify_unsupported(config)
-        run["evaluation/metrics"] = stringify_unsupported(results["metrics"])
-        run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"])
-        run["evaluation/metrics.json"].upload(metrics_file.as_posix())
-        run["evaluation/results.json"].upload(results_file.as_posix())
-
 
 @hydra.main(config_path="config", config_name="config", version_base="3.2")
 def main(config: DictConfig) -> None:
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
index 875c2ea6a..f11b38d0e 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/__init__.py
@@ -1,8 +1,9 @@
 from .evaluator import Evaluator
 from .loaders import DataLoader, HuggingFaceDataLoader
-from .metrics import Metric, MetricSet
-from .pipelines import DocumentSearchEvaluationPipeline
-from .utils import save
+from .metrics.base import Metric, MetricSet
+from .pipelines.base import EvaluationPipeline, EvaluationResult
+from .pipelines.document_search import DocumentSearchPipeline, DocumentSearchResult
+from .utils import log_to_file, log_to_neptune
 
 __all__ = [
     "Evaluator",
@@ -10,6 +11,10 @@
     "HuggingFaceDataLoader",
     "MetricSet",
     "Metric",
-    "DocumentSearchEvaluationPipeline",
-    "save",
+    "EvaluationPipeline",
+    "DocumentSearchPipeline",
+    "EvaluationResult",
+    "DocumentSearchResult",
+    "log_to_file",
+    "log_to_neptune",
 ]
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py b/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
index def383e56..ff410233f 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py
@@ -14,15 +14,6 @@ class Evaluator:
     Evaluator class.
     """
 
-    def __init__(self, task: str) -> None:
-        """
-        Constructs the evaluator.
-
-        Args:
-            task: The task for the evaluator.
-        """
-        self.task = task
-
     async def compute(
         self,
         pipeline: EvaluationPipeline,
@@ -41,6 +32,7 @@ async def compute(
             The evaluation results.
         """
         dataset = await dataloader.load()
+        await pipeline.prepare()
         results, perf_results = await self._call_pipeline(pipeline, dataset)
         computed_metrics = self._compute_metrics(metrics, results)
         processed_results = self._results_processor(results)
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py b/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
index 50b592f7d..e5b61af71 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/loaders.py
@@ -39,6 +39,6 @@ async def load(self) -> HFData:
             The loaded data.
         """
         return load_dataset(
-            path=self.config.data.path,
-            split=self.config.data.split,
+            path=self.config.data.eval.path,
+            split=self.config.data.eval.split,
         )
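The data config is now split into an `ingest` section (the corpus the pipeline indexes) and an `eval` section (the query/passage pairs fed to the evaluator). A minimal sketch of how the new layout resolves, using the dataset paths from `hf-docs.yaml` above:

```python
from omegaconf import OmegaConf

# Mirror of the new nested data config from hf-docs.yaml.
config = OmegaConf.create(
    {
        "data": {
            "ingest": {"path": "micpst/hf-docs", "split": "train"},
            "eval": {"path": "micpst/hf-docs-retrieval", "split": "train"},
        }
    }
)

# HuggingFaceDataLoader now reads the evaluation split...
assert config.data.eval.path == "micpst/hf-docs-retrieval"
# ...while DocumentSearchPipeline ingests from the ingestion split.
assert config.data.ingest.path == "micpst/hf-docs"
```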
""" return load_dataset( - path=self.config.data.path, - split=self.config.data.split, + path=self.config.data.eval.path, + split=self.config.data.eval.split, ) diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py index 8a8b6e240..c4d10448c 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/__init__.py @@ -1,3 +1,4 @@ from .base import Metric, MetricSet +from .document_search import DocumentSearchPrecision, DocumentSearchRecall -__all__ = ["Metric", "MetricSet"] +__all__ = ["Metric", "MetricSet", "DocumentSearchPrecision", "DocumentSearchRecall"] diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py index ce7ba2eb6..0ebf7235a 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/base.py @@ -1,13 +1,15 @@ from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Generic, Optional, TypeVar from omegaconf import DictConfig from typing_extensions import Self from ..pipelines import EvaluationResult +ResultT = TypeVar("ResultT", bound=EvaluationResult) -class Metric(ABC): + +class Metric(Generic[ResultT], ABC): """ Base class for metrics. """ @@ -20,10 +22,10 @@ def __init__(self, config: Optional[DictConfig] = None) -> None: config: The metric configuration. """ super().__init__() - self.config = config or {} + self.config = config or DictConfig({}) @abstractmethod - def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: + def compute(self, results: list[ResultT]) -> dict[str, Any]: """ Compute the metric. @@ -35,12 +37,12 @@ def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: """ -class MetricSet: +class MetricSet(Generic[ResultT]): """ Represents a set of metrics. """ - def __init__(self, *metrics: type[Metric]) -> None: + def __init__(self, *metrics: type[Metric[ResultT]]) -> None: """ Initializes the metric set. @@ -48,7 +50,7 @@ def __init__(self, *metrics: type[Metric]) -> None: metrics: The metrics. """ self._metrics = metrics - self.metrics: list[Metric] = [] + self.metrics: list[Metric[ResultT]] = [] def __call__(self, config: Optional[DictConfig] = None) -> Self: """ @@ -63,7 +65,7 @@ def __call__(self, config: Optional[DictConfig] = None) -> Self: self.metrics = [metric(config) for metric in self._metrics] return self - def compute(self, results: list[EvaluationResult]) -> dict[str, Any]: + def compute(self, results: list[ResultT]) -> dict[str, Any]: """ Compute the metrics. diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py new file mode 100644 index 000000000..000f3014d --- /dev/null +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py @@ -0,0 +1,109 @@ +from typing import Any, Optional + +from omegaconf import DictConfig + +from ragbits.evaluate.metrics.base import Metric +from ragbits.evaluate.pipelines.document_search import DocumentSearchResult + + +class DocumentSearchPrecision(Metric[DocumentSearchResult]): + """ + Precision measures the accuracy of the retrieved documents. It is the ratio of the number of relevant + documents retrieved to the total number of documents retrieved. 
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py
new file mode 100644
index 000000000..000f3014d
--- /dev/null
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/metrics/document_search.py
@@ -0,0 +1,109 @@
+from typing import Any, Optional
+
+from omegaconf import DictConfig
+
+from ragbits.evaluate.metrics.base import Metric
+from ragbits.evaluate.pipelines.document_search import DocumentSearchResult
+
+
+class DocumentSearchPrecision(Metric[DocumentSearchResult]):
+    """
+    Precision measures the accuracy of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of documents retrieved.
+
+    Precision = Total Number of Relevant Documents Retrieved / Total Number of Documents Retrieved
+
+    Precision evaluates: "Out of all the documents that the system retrieved, how many were actually relevant?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_documents_retrieved = sum(len(set(result.predicted_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/PRECISION": (total_relevant_documents_retrieved / total_documents_retrieved)
+            if total_documents_retrieved > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchRecall(Metric[DocumentSearchResult]):
+    """
+    Recall measures the comprehensiveness of the retrieved documents. It is the ratio of the number of relevant
+    documents retrieved to the total number of relevant documents in the database for the given query.
+
+    Recall = Total Number of Relevant Documents Retrieved / Total Number of Relevant Documents in the Database
+
+    Recall evaluates: "Out of all the relevant documents that exist in the database,
+    how many did the system manage to retrieve?"
+    """
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        total_relevant_documents_retrieved = sum(
+            len(set(result.reference_passages) & set(result.predicted_passages)) for result in results
+        )
+        total_relevant_documents = sum(len(set(result.reference_passages)) for result in results)
+
+        return {
+            "DOCUMENT_SEARCH/RECALL": (total_relevant_documents_retrieved / total_relevant_documents)
+            if total_relevant_documents > 0
+            else 0.0,
+        }
+
+
+class DocumentSearchF1(Metric[DocumentSearchResult]):
+    """
+    F1 Score is the harmonic mean of precision and recall, balancing both measures in a single score.
+
+    F1 = 2 * (Precision * Recall) / (Precision + Recall)
+    """
+
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the metric.
+
+        Args:
+            config: The metric configuration.
+        """
+        super().__init__(config)
+        self.precision = DocumentSearchPrecision(config)
+        self.recall = DocumentSearchRecall(config)
+
+    def compute(self, results: list[DocumentSearchResult]) -> dict[str, Any]:
+        """
+        Compute the metric.
+
+        Args:
+            results: The evaluation results.
+
+        Returns:
+            The computed metric.
+        """
+        precision = self.precision.compute(results)["DOCUMENT_SEARCH/PRECISION"]
+        recall = self.recall.compute(results)["DOCUMENT_SEARCH/RECALL"]
+
+        return {
+            "DOCUMENT_SEARCH/F1": (2 * (precision * recall) / (precision + recall))
+            if (precision + recall) > 0
+            else 0.0,
+        }
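A worked example of the three metrics on a single query, assuming both `ragbits-evaluate` and `ragbits-document-search` are installed; the passage strings are made up for illustration:

```python
from ragbits.evaluate.metrics.document_search import (
    DocumentSearchF1,
    DocumentSearchPrecision,
    DocumentSearchRecall,
)
from ragbits.evaluate.pipelines.document_search import DocumentSearchResult

# Two passages retrieved, one of them relevant; one relevant passage missed.
results = [
    DocumentSearchResult(
        question="What is RAG?",
        reference_passages=["passage-a", "passage-b"],
        predicted_passages=["passage-a", "passage-c"],
    )
]

print(DocumentSearchPrecision().compute(results))  # {'DOCUMENT_SEARCH/PRECISION': 0.5}
print(DocumentSearchRecall().compute(results))     # {'DOCUMENT_SEARCH/RECALL': 0.5}
# Harmonic mean: 2 * (0.5 * 0.5) / (0.5 + 0.5) = 0.5
print(DocumentSearchF1().compute(results))         # {'DOCUMENT_SEARCH/F1': 0.5}
```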
+ """ + precision = self.precision.compute(results)["DOCUMENT_SEARCH/PRECISION"] + recall = self.recall.compute(results)["DOCUMENT_SEARCH/RECALL"] + + return { + "DOCUMENT_SEARCH/F1": (2 * (precision * recall) / (precision + recall)) + if (precision + recall) > 0 + else 0.0, + } diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py index 9ba40746a..eacb77a26 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/__init__.py @@ -1,8 +1,9 @@ from .base import EvaluationPipeline, EvaluationResult -from .document_search import DocumentSearchEvaluationPipeline +from .document_search import DocumentSearchPipeline, DocumentSearchResult __all__ = [ - "DocumentSearchEvaluationPipeline", + "DocumentSearchPipeline", + "DocumentSearchResult", "EvaluationPipeline", "EvaluationResult", ] diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py index 991871aae..9a790041f 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/base.py @@ -6,7 +6,7 @@ @dataclass -class EvaluationResult: +class EvaluationResult(ABC): """ Represents the result of a single evaluation. """ @@ -25,7 +25,12 @@ def __init__(self, config: Optional[DictConfig] = None) -> None: config: The evaluation pipeline configuration. """ super().__init__() - self.config = config or {} + self.config = config or DictConfig({}) + + async def prepare(self) -> None: + """ + Prepares the document search evaluation pipeline. + """ @abstractmethod async def __call__(self, data: dict[str, Any]) -> EvaluationResult: diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py index 1afa86f72..d0dffee03 100644 --- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py +++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py @@ -1,14 +1,90 @@ -from typing import Any +from dataclasses import dataclass +from functools import cached_property +from typing import Any, Optional + +from datasets import load_dataset +from omegaconf import DictConfig +from tqdm.asyncio import tqdm + +from ragbits.core.embeddings import LiteLLMEmbeddings +from ragbits.core.vector_store import InMemoryVectorStore +from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.element import TextElement + +try: + from ragbits.document_search import DocumentSearch +except ImportError: + HAS_RAGBITS_DOCUMENT_SEARCH = False +else: + HAS_RAGBITS_DOCUMENT_SEARCH = True from .base import EvaluationPipeline, EvaluationResult -class DocumentSearchEvaluationPipeline(EvaluationPipeline): +@dataclass +class DocumentSearchResult(EvaluationResult): + """ + Represents the result of a single evaluation. + """ + + question: str + reference_passages: list[str] + predicted_passages: list[str] + + +class DocumentSearchPipeline(EvaluationPipeline): """ Document search evaluation pipeline. """ - async def __call__(self, data: dict[str, Any]) -> EvaluationResult: + def __init__(self, config: Optional[DictConfig] = None) -> None: + """ + Initializes the document search evaluation pipeline. 
diff --git a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
index 1afa86f72..d0dffee03 100644
--- a/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
+++ b/packages/ragbits-evaluate/src/ragbits/evaluate/pipelines/document_search.py
@@ -1,14 +1,90 @@
-from typing import Any
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Any, Optional
+
+from datasets import load_dataset
+from omegaconf import DictConfig
+from tqdm.asyncio import tqdm
+
+from ragbits.core.embeddings import LiteLLMEmbeddings
+from ragbits.core.vector_store import InMemoryVectorStore
+
+try:
+    from ragbits.document_search import DocumentSearch
+    from ragbits.document_search.documents.document import DocumentMeta
+    from ragbits.document_search.documents.element import TextElement
+except ImportError:
+    HAS_RAGBITS_DOCUMENT_SEARCH = False
+else:
+    HAS_RAGBITS_DOCUMENT_SEARCH = True
 
 from .base import EvaluationPipeline, EvaluationResult
 
 
-class DocumentSearchEvaluationPipeline(EvaluationPipeline):
+@dataclass
+class DocumentSearchResult(EvaluationResult):
+    """
+    Represents the result of a single evaluation.
+    """
+
+    question: str
+    reference_passages: list[str]
+    predicted_passages: list[str]
+
+
+class DocumentSearchPipeline(EvaluationPipeline):
     """
     Document search evaluation pipeline.
     """
 
-    async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
+    def __init__(self, config: Optional[DictConfig] = None) -> None:
+        """
+        Initializes the document search evaluation pipeline.
+
+        Raises:
+            ImportError: If the ragbits-document-search package is not installed.
+        """
+        super().__init__(config)
+        if not HAS_RAGBITS_DOCUMENT_SEARCH:
+            raise ImportError("You need to install the 'ragbits-document-search' package to use this pipeline.")
+
+    @cached_property
+    def documents(self) -> list[DocumentMeta]:
+        """
+        Returns the documents to be ingested.
+
+        Returns:
+            The documents to be ingested.
+        """
+        # TODO: Implement HF doc loader.
+        docs = load_dataset(
+            path=self.config.data.ingest.path,
+            split=self.config.data.ingest.split,
+        )
+        return [DocumentMeta.create_text_document_from_literal(doc["content"]) for doc in docs]
+
+    @cached_property
+    def document_search(self) -> DocumentSearch:
+        """
+        Returns the document search instance.
+
+        Returns:
+            The document search instance.
+        """
+        return DocumentSearch(
+            embedder=LiteLLMEmbeddings(),
+            vector_store=InMemoryVectorStore(),
+        )
+
+    async def prepare(self) -> None:
+        """
+        Prepares the document search evaluation pipeline.
+        """
+        await tqdm.gather(
+            *[self.document_search.ingest_document(document) for document in self.documents], desc="Ingestion"
+        )
+
+    async def __call__(self, data: dict[str, Any]) -> DocumentSearchResult:
         """
         Runs the document search evaluation pipeline.
 
@@ -18,4 +94,9 @@ async def __call__(self, data: dict[str, Any]) -> EvaluationResult:
         Returns:
             The evaluation result.
         """
-        return EvaluationResult()
+        elements = await self.document_search.search(data["question"])
+        return DocumentSearchResult(
+            question=data["question"],
+            reference_passages=data["passages"],
+            predicted_passages=[element.content for element in elements if isinstance(element, TextElement)],
+        )
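Putting the pieces together, the pipeline can also be driven without Hydra. A standalone sketch, under the assumption that the datasets above are reachable and that `LiteLLMEmbeddings` can find its API credentials in the environment:

```python
import asyncio

from omegaconf import OmegaConf

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders import HuggingFaceDataLoader
from ragbits.evaluate.metrics import DocumentSearchPrecision, DocumentSearchRecall, MetricSet
from ragbits.evaluate.pipelines import DocumentSearchPipeline

config = OmegaConf.create(
    {
        "data": {
            "ingest": {"path": "micpst/hf-docs", "split": "train"},
            "eval": {"path": "micpst/hf-docs-retrieval", "split": "train"},
        }
    }
)


async def main() -> None:
    # prepare() ingests the corpus once, then each eval row runs through __call__.
    results = await Evaluator().compute(
        pipeline=DocumentSearchPipeline(config),
        dataloader=HuggingFaceDataLoader(config),
        metrics=MetricSet(DocumentSearchPrecision, DocumentSearchRecall)(config),
    )
    print(results["metrics"])


asyncio.run(main())
```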
+ """ + run = neptune.init_run(project=config.neptune.project) + run["sys/tags"].add( + [ + config.task.name, + config.setup.name, + ] + ) + run["config"] = stringify_unsupported(config) + run["evaluation/metrics"] = stringify_unsupported(results["metrics"]) + run["evaluation/time_perf"] = stringify_unsupported(results["time_perf"]) + run["evaluation/metrics.json"].upload((output_dir / "metrics.json").as_posix()) + run["evaluation/results.json"].upload((output_dir / "results.json").as_posix()) diff --git a/uv.lock b/uv.lock index 23f2b39e5..82205c530 100644 --- a/uv.lock +++ b/uv.lock @@ -3581,10 +3581,6 @@ source = { editable = "packages/ragbits-evaluate" } dependencies = [ { name = "datasets" }, { name = "hydra-core" }, -] - -[package.optional-dependencies] -neptune = [ { name = "neptune" }, ] @@ -3600,8 +3596,8 @@ dev = [ [package.metadata] requires-dist = [ { name = "datasets", specifier = "~=3.0.1" }, - { name = "hydra-core", specifier = ">=1.3.2" }, - { name = "neptune", marker = "extra == 'neptune'", specifier = "~=1.12.0" }, + { name = "hydra-core", specifier = "~=1.3.2" }, + { name = "neptune", specifier = "~=1.12.0" }, ] [package.metadata.requires-dev]