Skip to content

Commit

Permalink
Merge branch 'main' into kc/support-for-different-metadatastores-in-vectorstore
Browse files Browse the repository at this point in the history
  • Loading branch information
konrad-czarnota-ds committed Oct 24, 2024
2 parents 4d655c3 + 99d91f9 commit e7d7fc0
Show file tree
Hide file tree
Showing 60 changed files with 2,399 additions and 152 deletions.
4 changes: 4 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### Checklist

- [ ] I have updated the documentation accordingly.
- [ ] I have updated the CHANGELOG.md file accordingly.
67 changes: 67 additions & 0 deletions .github/workflows/semantic_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Manually triggered release workflow: bumps the chosen package's version,
# creates a dated release branch, and opens a PR with the update.
name: Semantic Release

on:
  workflow_dispatch:
    inputs:
      updateType:
        description: "version update type"
        required: true
        type: choice
        default: "patch"
        options:
          - "major"
          - "minor"
          - "patch"
      packageName:
        description: "name of the package to update"
        required: true
        type: choice
        options:
          - "ragbits"
          - "ragbits-cli"
          - "ragbits-core"
          - "ragbits-document-search"

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2
        with:
          version: "0.4.10"

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      # Branch name embeds the package and today's date, e.g. release/ragbits-2024-10-24.
      - name: Create release branch
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git checkout -b release/${{ github.event.inputs.packageName }}-$(date +%Y-%m-%d)

      # Captures the version from pyproject.toml before and after the bump
      # so the PR body can show the old -> new transition.
      - name: Update packages
        id: packages_update
        run: |
          echo old_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT
          uv run scripts/update_ragbits_package.py ${{ github.event.inputs.packageName }} ${{ github.event.inputs.updateType }}
          echo new_version=`grep version packages/${{ github.event.inputs.packageName }}/pyproject.toml | cut -d \" -f2` >> $GITHUB_OUTPUT
          uv sync

      - name: Create PR with updated packages
        run: |
          COMMIT_MESSAGE="release(${{ github.event.inputs.packageName }}): update to v${{ steps.packages_update.outputs.new_version }}"
          git add .
          git commit -m "$COMMIT_MESSAGE"
          git push -u origin HEAD
          gh pr create -B main --title "$COMMIT_MESSAGE" \
            --body 'Update ${{ github.event.inputs.packageName }} version from ${{ steps.packages_update.outputs.old_version }} to ${{ steps.packages_update.outputs.new_version }}'
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
3 changes: 2 additions & 1 deletion .libraries-whitelist.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pkg_resources
tiktoken
chardet
chroma-hnswlib
chroma-hnswlib
rouge
15 changes: 12 additions & 3 deletions examples/document-search/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@

from ragbits.core.embeddings import LiteLLMEmbeddings
from ragbits.core.vector_store.chromadb_store import ChromaDBStore
from ragbits.document_search import DocumentSearch
from ragbits.document_search import DocumentSearch, SearchConfig
from ragbits.document_search.documents.document import DocumentMeta

# Sample documents to ingest into the vector store for the demo search.
documents = [
    DocumentMeta.create_text_document_from_literal("RIP boiled water. You will be mist."),
    DocumentMeta.create_text_document_from_literal(
        "Why programmers don't like to swim? Because they're scared of the floating points."
    ),
    DocumentMeta.create_text_document_from_literal("This one is completely unrelated."),
]


Expand All @@ -37,8 +38,16 @@ async def main():

await document_search.ingest(documents)

results = await document_search.search("I'm boiling my water and I need a joke")
print(results)
print()
print("All documents:")
all_documents = await vector_store.list()
print([doc.metadata["content"] for doc in all_documents])

query = "I'm boiling my water and I need a joke"
print()
print(f"Documents similar to: {query}")
results = await document_search.search(query, search_config=SearchConfig(vector_store_kwargs={"k": 2}))
print([element.get_key() for element in results])


if __name__ == "__main__":
Expand Down
12 changes: 12 additions & 0 deletions examples/document-search/from_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@
},
"reranker": {"type": "ragbits.document_search.retrieval.rerankers.noop:NoopReranker"},
"providers": {"txt": {"type": "DummyProvider"}},
"rephraser": {
"type": "LLMQueryRephraser",
"config": {
"llm": {
"type": "LiteLLM",
"config": {
"model_name": "gpt-4-turbo",
},
},
"prompt": "QueryRephraserPrompt",
},
},
}


Expand Down
35 changes: 35 additions & 0 deletions examples/evaluation/document-search/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Document Search Evaluation

## Ingest

```sh
uv run ingest.py
```

```sh
uv run ingest.py +experiments=chunking-250
```

```sh
uv run ingest.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
```

## Evaluate

```sh
uv run evaluate.py
```

```sh
uv run evaluate.py +experiments=chunking-250
```

```sh
uv run evaluate.py --multirun +experiments=chunking-250,chunking-500,chunking-1000
```

### Log to Neptune

```sh
uv run evaluate.py neptune.run=True
```
4 changes: 4 additions & 0 deletions examples/evaluation/document-search/config/data/corpus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Corpus dataset (Hugging Face) with the documents to ingest.
name: "hf-docs"
path: "micpst/hf-docs"
split: "train"
# Number of documents taken from the dataset for ingestion.
num_docs: 5
3 changes: 3 additions & 0 deletions examples/evaluation/document-search/config/data/qa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Retrieval evaluation dataset (Hugging Face).
name: "hf-docs-retrieval"
path: "micpst/hf-docs-retrieval"
split: "train"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Embedder configuration: LiteLLM-backed embeddings.
type: LiteLLMEmbeddings
config:
  model: "text-embedding-3-small"
  options:
    dimensions: 768
    encoding_format: float
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-1000

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 1000
  md:
    config:
      chunking_kwargs:
        max_characters: 1000

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-1000
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-250

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 250
  md:
    config:
      chunking_kwargs:
        max_characters: 250

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-250
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# @package _global_

task:
  name: chunking-500

# used only for ingestion
providers:
  txt:
    config:
      chunking_kwargs:
        max_characters: 500
  md:
    config:
      chunking_kwargs:
        max_characters: 500

# used for both ingestion and evaluation
vector_store:
  config:
    index_name: chunk-500
6 changes: 6 additions & 0 deletions examples/evaluation/document-search/config/ingestion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Hydra defaults composition for the ingestion run.
defaults:
  - data: corpus
  - embedder: litellm
  - providers: unstructured
  - vector_store: chroma
  - _self_
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Per-extension document providers used during ingestion.
txt:
  type: UnstructuredDefaultProvider
  config:
    use_api: false
    partition_kwargs:
      strategy: hi_res
    chunking_kwargs:
      include_orig_elements: true
      max_characters: 1000
      new_after_n_chars: 1000
      overlap: 0
      overlap_all: 0

md:
  type: UnstructuredDefaultProvider
  config:
    use_api: false
    partition_kwargs:
      strategy: hi_res
    chunking_kwargs:
      include_orig_elements: true
      max_characters: 1000
      new_after_n_chars: 1000
      overlap: 0
      overlap_all: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Query rephraser component; the Noop variant applies no rephrasing.
type: NoopQueryRephraser
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Reranker component; the Noop variant applies no reranking.
type: NoopReranker
26 changes: 26 additions & 0 deletions examples/evaluation/document-search/config/retrieval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Hydra defaults composition for the retrieval evaluation run.
defaults:
  - data: qa
  - embedder: litellm
  - providers: unstructured
  - vector_store: chroma
  - rephraser: noop
  - reranker: noop
  - _self_

task:
  name: default
  type: document-search

metrics:
  DocumentSearchPrecisionRecallF1:
    matching_strategy: RougeChunkMatch
    options:
      threshold: 0.5
  DocumentSearchRankedRetrievalMetrics:
    matching_strategy: RougeChunkMatch
    options:
      threshold: 0.5

# Neptune experiment tracking; disabled by default (enable with neptune.run=True).
neptune:
  project: ragbits
  run: False
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# ChromaDB-backed vector store with an on-disk (persistent) client.
# NOTE(review): nesting of embedding_function/index_name reconstructed from a
# whitespace-mangled source — confirm against ChromaDBStore.from_config.
type: ChromaDBStore
config:
  chroma_client:
    type: PersistentClient
    config:
      path: chroma
  embedding_function:
    type: ragbits.core.embeddings.litellm:LiteLLMEmbeddings
  index_name: default
67 changes: 67 additions & 0 deletions examples/evaluation/document-search/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "ragbits-document-search",
# "ragbits-evaluate[relari]",
# "ragbits[litellm,chromadb]",
# ]
# ///
import asyncio
import logging

import hydra
from omegaconf import DictConfig

from ragbits.evaluate.evaluator import Evaluator
from ragbits.evaluate.loaders.hf import HFDataLoader
from ragbits.evaluate.metrics.document_search import document_search_metrics
from ragbits.evaluate.pipelines.document_search import DocumentSearchPipeline
from ragbits.evaluate.utils import log_to_file, log_to_neptune, setup_neptune

# Silence chatty third-party loggers so evaluation output stays readable.
logging.getLogger("LiteLLM").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
# Module-level logger for this script's own messages.
log = logging.getLogger(__name__)


async def bench(config: DictConfig) -> None:
    """
    Run the document-search evaluation described by the Hydra configuration.

    Loads the dataset, builds the pipeline and metrics, computes the results,
    writes them to disk, and optionally logs them to Neptune.

    Args:
        config: Hydra configuration.
    """
    # Returns a run handle when neptune.run is enabled, falsy otherwise.
    run = setup_neptune(config)

    log.info("Starting evaluation...")

    loader = HFDataLoader(config.data)
    search_pipeline = DocumentSearchPipeline(config)
    eval_metrics = document_search_metrics(config.metrics)

    results = await Evaluator().compute(
        pipeline=search_pipeline,
        dataloader=loader,
        metrics=eval_metrics,
    )

    output_dir = log_to_file(results)
    if run:
        log_to_neptune(run, results, output_dir)

    log.info("Evaluation results saved under directory: %s", output_dir)


# NOTE(review): version_base="3.2" looks unusual (Hydra version_base values are
# typically "1.1"–"1.3" or None) — confirm it is intended.
@hydra.main(config_path="config", config_name="retrieval", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Hydra CLI entry point: compose the configuration and run the async benchmark.

    Args:
        config: Hydra configuration.
    """
    asyncio.run(bench(config))


if __name__ == "__main__":
    main()  # pylint: disable=no-value-for-parameter
Loading

0 comments on commit e7d7fc0

Please sign in to comment.