Commit b69f747

Merge branch 'main' into pinecone-filters

anakin87 committed Dec 22, 2023
2 parents b9f9571 + fbdb9a0

Showing 50 changed files with 123 additions and 65 deletions.
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/new-integration-proposal.md
@@ -2,7 +2,7 @@
name: New Integration Proposal
about: Track the creation process for a new integration
title: ''
labels: New integration request
labels: new integration
assignees: ''

---
@@ -30,4 +30,4 @@ If the request is accepted, ensure the following checklist is complete before cl
- [ ] An integration tile has been added to https://github.com/deepset-ai/haystack-integrations
- [ ] The integration has been listed in the [Inventory section](https://github.com/deepset-ai/haystack-core-integrations#inventory) of this repo README
- [ ] There is an example available to demonstrate the feature
- [ ] The feature was announced through social media
- [ ] The feature was announced through social media
4 changes: 2 additions & 2 deletions .github/labeler.yml
@@ -16,7 +16,7 @@ integration:elasticsearch:

integration:google-vertex:
- changed-files:
- any-glob-to-any-file: "integrations/google-vertex/**/*"
- any-glob-to-any-file: "integrations/google_vertex/**/*"
- any-glob-to-any-file: ".github/workflows/google_vertex.yml"

integration:gradient:
@@ -26,7 +26,7 @@ integration:gradient:

integration:instructor-embedders:
- changed-files:
- any-glob-to-any-file: "integrations/instructor-embedders/**/*"
- any-glob-to-any-file: "integrations/instructor_embedders/**/*"
- any-glob-to-any-file: ".github/workflows/instructor_embedders.yml"

integration:jina:
16 changes: 16 additions & 0 deletions .github/workflows/CI_check_integration_format.yml
@@ -0,0 +1,16 @@
name: Core / Check Integration Format
on:
- pull_request

jobs:
check:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Ensure no hyphens
run: |
set +e
find -name "*-*" -type d -maxdepth 2 | grep integrations
test "$?" -eq 1 && exit 0 || echo "::error::Names of folders in ./integrations must not contain hyphens" && exit 1
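Note: the check above hinges on grep's exit status. grep exits 1 when it finds no hyphenated directory name under ./integrations, which the final test turns into success; any match fails the job with the error annotation. A rough Python sketch of the same rule, assuming it is run from the repository root:

from pathlib import Path

# Fail when any directory directly under ./integrations has a hyphen in its name.
offending = [p.name for p in Path("integrations").iterdir() if p.is_dir() and "-" in p.name]
if offending:
    raise SystemExit(f"Names of folders in ./integrations must not contain hyphens: {offending}")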
@@ -1,4 +1,4 @@
name: "Labeler"
name: Core / Labeler
on:
- pull_request_target

@@ -1,4 +1,4 @@
name: Track issues with Github project
name: Core / Add issues to Github project

on:
issues:
@@ -1,13 +1,12 @@
name: Project release on PyPi
name: Core / Project release on PyPi

# The pushed tag must be formatted like so:
# * nodes/speech2text-v1.0.0
# * stores/foo-documentstore-v1.2.3
# * integrations/<INTEGRATION_FOLDER_NAME>-v1.0.0
#
# The first part must be the path of the project being released.
# If we want to release version 1.0.0 of project text2speech
# that lives in path nodes/text2speech we'd have to push a
# nodes/text2speech-v1.0.0 tag.
# For example, if we want to release version 1.0.99
# of the google-vertex-haystack integration we'd have to push the tag:
#
# integrations/google_vertex-v1.0.99

on:
push:
6 changes: 3 additions & 3 deletions .github/workflows/google_vertex.yml
@@ -7,12 +7,12 @@ on:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/google-vertex/**"
- ".github/workflows/google-vertex.yml"
- "integrations/google_vertex/**"
- ".github/workflows/google_vertex.yml"

defaults:
run:
working-directory: integrations/google-vertex
working-directory: integrations/google_vertex

concurrency:
group: google-vertex-${{ github.head_ref }}
4 changes: 2 additions & 2 deletions .github/workflows/instructor_embedders.yml
@@ -5,12 +5,12 @@ on:
- cron: "0 0 * * *"
pull_request:
paths:
- 'integrations/instructor-embedders/**'
- 'integrations/instructor_embedders/**'
- '.github/workflows/instructor_embedders.yml'

defaults:
run:
working-directory: integrations/instructor-embedders
working-directory: integrations/instructor_embedders

jobs:
test:
4 changes: 2 additions & 2 deletions README.md
@@ -65,9 +65,9 @@ deepset-haystack
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) |
| [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) |
| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) |
| [google-vertex-haystack](integrations/google-vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) |
| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) |
| [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) |
| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
8 changes: 4 additions & 4 deletions integrations/chroma/example/example.py
@@ -3,7 +3,7 @@
from pathlib import Path

from haystack import Pipeline
from haystack.components.file_converters import TextFileToDocument
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter

from chroma_haystack import ChromaDocumentStore
@@ -19,11 +19,11 @@
indexing.add_component("converter", TextFileToDocument())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "writer")
indexing.run({"converter": {"paths": file_paths}})
indexing.run({"converter": {"sources": file_paths}})

querying = Pipeline()
querying.add_component("retriever", ChromaQueryRetriever(document_store))
results = querying.run({"retriever": {"queries": ["Variable declarations"], "top_k": 3}})

for d in results["retriever"][0]:
print(d.metadata, d.score)
for d in results["retriever"]["documents"][0]:
print(d.meta, d.score)
7 changes: 1 addition & 6 deletions integrations/chroma/src/chroma_haystack/document_store.py
@@ -247,10 +247,6 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s
# if the list contains multiple items, we need an $or chain
for v in value:
where["$or"].append({field: v})
elif field == "mime_type":
# Schedule for removal the original key, we're going to change it
keys_to_remove.append(field)
where["_mime_type"] = value

for k in keys_to_remove:
del filters[k]
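For context, the loop shown above turns a filter whose value is a list into a Chroma $or clause. A hedged sketch of that behaviour, using a hypothetical field name:

filters = {"author": ["alice", "bob"]}  # hypothetical input filter with a list value
# After normalization, the Chroma "where" clause carries an $or chain over the same field:
where = {"$or": [{"author": "alice"}, {"author": "bob"}]}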
@@ -310,8 +306,7 @@ def _query_result_to_documents(self, result: QueryResult) -> List[List[Document]

# prepare metadata
if metadatas := result.get("metadatas"):
document_dict["metadata"] = dict(metadatas[i][j])
document_dict["mime_type"] = document_dict["metadata"].pop("_mime_type")
document_dict["meta"] = dict(metadatas[i][j])

if embeddings := result.get("embeddings"):
document_dict["embedding"] = np.array(embeddings[i][j])
@@ -263,7 +263,7 @@ def _bm25_retrieval(
"query": query,
"fuzziness": fuzziness,
"type": "most_fields",
"operator": "AND",
"operator": "OR",
}
}
]
27 changes: 27 additions & 0 deletions integrations/elasticsearch/tests/test_document_store.py
@@ -182,6 +182,33 @@ def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocume
assert "functional" in res[1].content
assert "functional" in res[2].content

def test_bm25_not_all_terms_must_match(self, document_store: ElasticsearchDocumentStore):
"""
Test that not all terms must mandatorily match for BM25 retrieval to return a result.
"""
documents = [
Document(id=1, content="There are over 7,000 languages spoken around the world today."),
Document(
id=2,
content=(
"Elephants have been observed to behave in a way that indicates a high level of self-awareness"
" such as recognizing themselves in mirrors."
),
),
Document(
id=3,
content=(
"In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness"
" the phenomenon of bioluminescent waves."
),
),
]
document_store.write_documents(documents)

res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3)
assert len(res) == 1
assert res[0].id == 2

def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore):
docs = [
Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
File renamed without changes.
File renamed without changes.
@@ -29,17 +29,17 @@ dependencies = [
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex#readme"
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex"

[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/google-vertex-v(?P<version>.*)'
tag-pattern = 'integrations\/google_vertex-v(?P<version>.*)'

[tool.hatch.version.raw-options]
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/google-vertex-v[0-9]*"'
git_describe_command = 'git describe --tags --match="integrations/google_vertex-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = [
@@ -150,8 +150,8 @@ omit = [
]

[tool.coverage.paths]
google_vertex_haystack = ["src/google_vertex_haystack", "*/google-vertex-haystack/src/google_vertex_haystack"]
tests = ["tests", "*/google-vertex-haystack/tests"]
google_vertex_haystack = ["src/google_vertex_haystack", "*/google_vertex/src/google_vertex_haystack"]
tests = ["tests", "*/google_vertex_haystack/tests"]

[tool.coverage.report]
exclude_lines = [
9 changes: 6 additions & 3 deletions integrations/gradient/pyproject.toml
@@ -7,11 +7,13 @@ name = "gradient-haystack"
dynamic = ["version"]
description = ''
readme = "README.md"
requires-python = ">=3.7"
requires-python = ">=3.8"
license = "Apache-2.0"
keywords = []
authors = [
{ name = "Mateusz Haligowski", email = "[email protected]" },
{ name = "Mateusz Haligowski", email = "[email protected]" },
{ name = "Michael Feil", email = "[email protected]" },
{ name = "Hayden Wilson", email = "[email protected]" },
]
classifiers = [
"Development Status :: 4 - Beta",
@@ -26,8 +28,9 @@ classifiers = [
]
dependencies = [
"haystack-ai",
"gradientai",
"gradientai>=1.4.0",
]
optional-dependencies = { tqdm = ["tqdm"] }

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/gradient#readme"
@@ -4,9 +4,20 @@
from gradientai import Gradient
from haystack import Document, component, default_to_dict

tqdm_imported: bool = True
try:
from tqdm import tqdm
except ImportError:
tqdm_imported = False


logger = logging.getLogger(__name__)


def _alt_progress_bar(x: Any) -> Any:
return x


@component
class GradientDocumentEmbedder:
"""
@@ -31,24 +42,28 @@ def __init__(
self,
*,
model_name: str = "bge-large",
batch_size: int = 100,
batch_size: int = 32_768,
access_token: Optional[str] = None,
workspace_id: Optional[str] = None,
host: Optional[str] = None,
progress_bar: bool = True,
) -> None:
"""
Create a GradientDocumentEmbedder component.
:param model_name: The name of the model to use.
:param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs.
:param access_token: The Gradient access token. If not provided it's read from the environment
variable GRADIENT_ACCESS_TOKEN.
:param workspace_id: The Gradient workspace ID. If not provided it's read from the environment
variable GRADIENT_WORKSPACE_ID.
:param host: The Gradient host. By default it uses https://api.gradient.ai/.
:param progress_bar: Whether to show a progress bar while embedding the documents.
"""
self._batch_size = batch_size
self._host = host
self._model_name = model_name
self._progress_bar = progress_bar

self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id)

@@ -75,11 +90,17 @@ def _generate_embeddings(self, documents: List[Document], batch_size: int) -> Li
"""
Batches the documents and generates the embeddings.
"""
batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)]
if self._progress_bar and tqdm_imported:
batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)]
progress_bar = tqdm
else:
# no progress bar
progress_bar = _alt_progress_bar # type: ignore
batches = [documents]

embeddings = []
for batch in batches:
response = self._embedding_model.generate_embeddings(inputs=[{"input": doc.content} for doc in batch])
for batch in progress_bar(batches):
response = self._embedding_model.embed(inputs=[{"input": doc.content} for doc in batch])
embeddings.extend([e.embedding for e in response.embeddings])

return embeddings
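For reference, a minimal usage sketch of the document embedder after this change. The import path is hypothetical, the warm_up()/run() calls and the "documents" return key are assumptions based on the surrounding code and standard Haystack 2.x component conventions, and GRADIENT_ACCESS_TOKEN / GRADIENT_WORKSPACE_ID are expected in the environment:

from haystack import Document

# Hypothetical import path; adjust to the actual package layout.
from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder

embedder = GradientDocumentEmbedder(model_name="bge-large", progress_bar=True)
embedder.warm_up()  # assumed to initialize the embedding model, mirroring the text embedder below

result = embedder.run(documents=[Document(content="Gradient hosts embedding models as a service.")])
print(result["documents"][0].embedding[:5])  # assumption: embedded Documents come back under "documents"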
@@ -76,7 +76,7 @@ def run(self, text: str):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

result = self._embedding_model.generate_embeddings(inputs=[{"input": text}])
result = self._embedding_model.embed(inputs=[{"input": text}])

if (not result) or (result.embeddings is None) or (len(result.embeddings) == 0):
msg = "The embedding model did not return any embeddings."
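And a matching sketch for the text embedder, whose run(text=...) signature and warm_up() requirement are visible above; the import path and the shape of the return value are assumptions:

# Hypothetical import path; adjust to the actual package layout.
from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder

embedder = GradientTextEmbedder()  # credentials read from GRADIENT_ACCESS_TOKEN / GRADIENT_WORKSPACE_ID
embedder.warm_up()                 # required before run(), per the RuntimeError raised above

result = embedder.run(text="How many languages are spoken around the world?")
print(result["embedding"][:5])     # assumption: the query embedding is returned under an "embedding" key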