Skip to content

Commit

Permalink
Merge branch 'deepset-ai:main' into feat/accept-top_k-in-run-function…
Browse files Browse the repository at this point in the history
…-ElasticSearshBM25Retriever
  • Loading branch information
sahusiddharth authored Dec 22, 2023
2 parents 35eb043 + 5668b83 commit c074b0a
Show file tree
Hide file tree
Showing 19 changed files with 1,203 additions and 12 deletions.
5 changes: 5 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ integration:qdrant:
- any-glob-to-any-file: "integrations/qdrant/**/*"
- any-glob-to-any-file: ".github/workflows/qdrant.yml"

integration:pinecone:
- changed-files:
- any-glob-to-any-file: "integrations/pinecone/**/*"
- any-glob-to-any-file: ".github/workflows/pinecone.yml"

integration:unstructured-fileconverter:
- changed-files:
- any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*"
Expand Down
51 changes: 51 additions & 0 deletions .github/workflows/pinecone.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / pinecone

on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/pinecone/**"
- ".github/workflows/pinecone.yml"

concurrency:
group: pinecone-${{ github.head_ref }}
cancel-in-progress: true

env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}

jobs:
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
# Pinecone tests are time expensive, so the matrix is limited to Python 3.9 and 3.10
os: [ubuntu-latest]
python-version: ["3.9", "3.10"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install Hatch
run: pip install --upgrade hatch

- name: Lint
working-directory: integrations/pinecone
if: matrix.python-version == '3.9'
run: hatch run lint:all

- name: Run tests
working-directory: integrations/pinecone
run: hatch run cov
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,5 @@ deepset-haystack
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) |
| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml)
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
8 changes: 4 additions & 4 deletions integrations/chroma/example/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

from haystack import Pipeline
from haystack.components.file_converters import TextFileToDocument
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter

from chroma_haystack import ChromaDocumentStore
Expand All @@ -19,11 +19,11 @@
indexing.add_component("converter", TextFileToDocument())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "writer")
indexing.run({"converter": {"paths": file_paths}})
indexing.run({"converter": {"sources": file_paths}})

querying = Pipeline()
querying.add_component("retriever", ChromaQueryRetriever(document_store))
results = querying.run({"retriever": {"queries": ["Variable declarations"], "top_k": 3}})

for d in results["retriever"][0]:
print(d.metadata, d.score)
for d in results["retriever"]["documents"][0]:
print(d.meta, d.score)
7 changes: 1 addition & 6 deletions integrations/chroma/src/chroma_haystack/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,10 +247,6 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s
# if the list contains multiple items, we need an $or chain
for v in value:
where["$or"].append({field: v})
elif field == "mime_type":
# Schedule for removal the original key, we're going to change it
keys_to_remove.append(field)
where["_mime_type"] = value

for k in keys_to_remove:
del filters[k]
Expand Down Expand Up @@ -310,8 +306,7 @@ def _query_result_to_documents(self, result: QueryResult) -> List[List[Document]

# prepare metadata
if metadatas := result.get("metadatas"):
document_dict["metadata"] = dict(metadatas[i][j])
document_dict["mime_type"] = document_dict["metadata"].pop("_mime_type")
document_dict["meta"] = dict(metadatas[i][j])

if embeddings := result.get("embeddings"):
document_dict["embedding"] = np.array(embeddings[i][j])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def _bm25_retrieval(
"query": query,
"fuzziness": fuzziness,
"type": "most_fields",
"operator": "AND",
"operator": "OR",
}
}
]
Expand Down
27 changes: 27 additions & 0 deletions integrations/elasticsearch/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,33 @@ def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocume
assert "functional" in res[1].content
assert "functional" in res[2].content

def test_bm25_not_all_terms_must_match(self, document_store: ElasticsearchDocumentStore):
"""
Test that not all terms must mandatorily match for BM25 retrieval to return a result.
"""
documents = [
Document(id=1, content="There are over 7,000 languages spoken around the world today."),
Document(
id=2,
content=(
"Elephants have been observed to behave in a way that indicates a high level of self-awareness"
" such as recognizing themselves in mirrors."
),
),
Document(
id=3,
content=(
"In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness"
" the phenomenon of bioluminescent waves."
),
),
]
document_store.write_documents(documents)

res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3)
assert len(res) == 1
assert res[0].id == 2

def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore):
docs = [
Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
Expand Down
24 changes: 24 additions & 0 deletions integrations/pinecone/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml)

[![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack)

# Pinecone Document Store

Document Store for Haystack 2.x, supports Pinecone.

## Installation

```console
pip install pinecone-haystack
```

## Testing

```console
hatch run test
```

## License

`pinecone-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
186 changes: 186 additions & 0 deletions integrations/pinecone/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "pinecone_haystack"
dynamic = ["version"]
description = ''
readme = "README.md"
requires-python = ">=3.8"
license = "Apache-2.0"
keywords = []
authors = [
{ name = "deepset GmbH", email = "[email protected]" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"haystack-ai",
"pinecone-client",
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone"

[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/pinecone-v(?P<version>.*)'

[tool.hatch.version.raw-options]
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/pinecone-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = [
"coverage[toml]>=6.5",
"pytest",
"pytest-xdist",
]
[tool.hatch.envs.default.scripts]
# Pinecone tests are slow (require HTTP requests), so we run them in parallel
# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html)
test = "pytest -n auto --maxprocesses=3 {args:tests}"
test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}"
cov-report = [
"- coverage combine",
"coverage report",
]
cov = [
"test-cov",
"cov-report",
]

[[tool.hatch.envs.all.matrix]]
python = ["3.8", "3.9", "3.10", "3.11"]

[tool.hatch.envs.lint]
detached = true
dependencies = [
"black>=23.1.0",
"mypy>=1.0.0",
"ruff>=0.0.243",
"numpy",
]
[tool.hatch.envs.lint.scripts]
typing = "mypy --install-types --non-interactive {args:src/pinecone_haystack tests}"
style = [
"ruff {args:.}",
"black --check --diff {args:.}",
]
fmt = [
"black {args:.}",
"ruff --fix {args:.}",
"style",
]
all = [
"style",
"typing",
]

[tool.hatch.metadata]
allow-direct-references = true

[tool.black]
target-version = ["py37"]
line-length = 120
skip-string-normalization = true

[tool.ruff]
target-version = "py37"
line-length = 120
select = [
"A",
"ARG",
"B",
"C",
"DTZ",
"E",
"EM",
"F",
"FBT",
"I",
"ICN",
"ISC",
"N",
"PLC",
"PLE",
"PLR",
"PLW",
"Q",
"RUF",
"S",
"T",
"TID",
"UP",
"W",
"YTT",
]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105", "S106", "S107",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]

[tool.ruff.isort]
known-first-party = ["pinecone_haystack"]

[tool.ruff.flake8-tidy-imports]
ban-relative-imports = "all"

[tool.ruff.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]

[tool.coverage.run]
source_pkgs = ["pinecone_haystack", "tests"]
branch = true
parallel = true
omit = [
"example"
]

[tool.coverage.paths]
pinecone_haystack = ["src/pinecone_haystack", "*/pinecone_haystack/src/pinecone_haystack"]
tests = ["tests", "*/pinecone_haystack/tests"]

[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]

[tool.pytest.ini_options]
minversion = "6.0"
markers = [
"unit: unit tests",
"integration: integration tests"
]

[[tool.mypy.overrides]]
module = [
"pinecone.*",
"haystack.*",
"pytest.*"
]
ignore_missing_imports = true
6 changes: 6 additions & 0 deletions integrations/pinecone/src/pinecone_haystack/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from pinecone_haystack.document_store import PineconeDocumentStore

__all__ = ["PineconeDocumentStore"]
Loading

0 comments on commit c074b0a

Please sign in to comment.