Skip to content

Commit

Permalink
Merge branch 'deepset-ai:main' into ci-deepset-ai#5931-isort
Browse files Browse the repository at this point in the history
  • Loading branch information
mjspeck authored Oct 18, 2023
2 parents 682ddec + 21d894d commit 49b7dd9
Show file tree
Hide file tree
Showing 130 changed files with 3,887 additions and 648 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/examples_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}

- name: Install Haystack
run: pip install .[all,dev]
run: |
pip install --upgrade pip
pip install .[inference,dev,elasticsearch,preprocessing,file-conversion]
- name: Run
run: pytest examples/
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ on:
paths:
- "**.py"
- "**/pyproject.toml"
- "!haystack/preview/**/*.py"
- "!test/preview/**/*.py"
- "!e2e/preview/**/*.py"

env:
PYTHON_VERSION: "3.8"
Expand Down
81 changes: 81 additions & 0 deletions .github/workflows/linting_preview.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# If you change this name also do it in linting_skipper.yml and ci_metrics.yml
name: Linting (Preview)

# Runs mypy and pylint on the Python files changed by a pull request that
# touches preview code (haystack/, test/ or e2e/ under preview/) or any
# pyproject.toml.
on:
  pull_request:
    paths:
      - "haystack/preview/**/*.py"
      - "test/preview/**/*.py"
      - "e2e/preview/**/*.py"
      - "**/pyproject.toml"

env:
  PYTHON_VERSION: "3.8"

jobs:
  mypy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # With the default value of 1, there are corner cases where tj-actions/changed-files
          # fails with a `no merge base` error
          fetch-depth: 0

      - name: Get changed files
        id: files
        uses: tj-actions/changed-files@v39
        with:
          files: |
            **/*.py
          files_ignore: |
            test/**
            rest_api/test/**

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
        run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'

      - name: Mypy
        if: steps.files.outputs.any_changed == 'true'
        env:
          # File names are controlled by the pull request author. Hand them to
          # the shell as data through the environment instead of expanding the
          # expression inside the script, where it would be parsed as shell code.
          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
        run: |
          mkdir -p .mypy_cache/
          # $CHANGED_FILES is intentionally unquoted: word splitting turns the
          # space-separated list into one argument per file.
          mypy --install-types --non-interactive $CHANGED_FILES --exclude=rest_api/build/ --exclude=rest_api/test/

  pylint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # With the default value of 1, there are corner cases where tj-actions/changed-files
          # fails with a `no merge base` error
          fetch-depth: 0

      - name: Get changed files
        id: files
        uses: tj-actions/changed-files@v39
        with:
          files: |
            **/*.py
          files_ignore: |
            test/**
            rest_api/test/**

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install Haystack
        run: |
          pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
          pip install ./haystack-linter

      - name: Pylint
        if: steps.files.outputs.any_changed == 'true'
        env:
          # Same reasoning as in the mypy job: keep PR-controlled file names
          # out of the script text.
          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
        run: |
          # Intentionally unquoted so each file becomes its own argument.
          pylint -ry -j 0 $CHANGED_FILES
3 changes: 3 additions & 0 deletions .github/workflows/linting_skipper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ on:
paths-ignore:
- "**.py"
- "**/pyproject.toml"
- "!haystack/preview/**/*.py"
- "!test/preview/**/*.py"
- "!e2e/preview/**/*.py"

jobs:
mypy:
Expand Down
55 changes: 55 additions & 0 deletions .github/workflows/preview_imports.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
name: Verify preview imports only preview

# Fails a pull request when a changed file under haystack/preview/ contains an
# import line that starts with `from haystack` / `import haystack` and is not
# followed by `.preview` or ` import preview`.
on:
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
      - ready_for_review
    paths:
      - "haystack/preview/**.py"

jobs:
  verify-imports:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # With the default value of 1, there are corner cases where tj-actions/changed-files
          # fails with a `no merge base` error
          fetch-depth: 0

      - name: Get changed files
        id: files
        uses: tj-actions/changed-files@v39
        with:
          files: |
            haystack/preview/**.py

      - name: Check imports
        shell: python
        env:
          # File names are controlled by the pull request author. Pass them as
          # data through the environment; expanding the expression inside the
          # script would let a crafted name escape the string literal and run
          # arbitrary Python.
          CHANGED_FILES: ${{ steps.files.outputs.all_changed_files }}
        run: |
          import os
          import re
          import sys

          # Matches `from haystack...` / `import haystack...` unless the text
          # right after `haystack` is `.preview` or ` import preview`.
          regex = r"^(from haystack|import haystack)(?!\.preview| import preview)(.*)"

          # The action emits a whitespace-separated list of paths.
          changed_files = os.environ.get("CHANGED_FILES", "").split()

          # Maps each offending path to the list of regex matches found in it.
          matches = {}
          for path in changed_files:
              with open(path, "r", encoding="utf-8") as f:
                  file_matches = []
                  for line in f:
                      file_matches.extend(re.finditer(regex, line.strip()))
                  if file_matches:
                      matches[path] = file_matches

          for path, match in matches.items():
              print(f"Bad imports in file '{path}'")
              for m in match:
                  print(m.group())
              print()

          if matches:
              # `::error::` is a workflow command that surfaces the message as an error annotation.
              print("::error:: Imports in haystack.preview can only import from haystack.preview")
              sys.exit(1)
36 changes: 32 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ on:
- "pyproject.toml"
- "!haystack/preview/**/*.py" # See tests_preview.yml
- "!test/preview/**/*.py" # See tests_preview.yml
- "!e2e/preview/**/*.py" # See e2e_preview.yml
- "!.github/**/*.py"
- "!rest_api/**/*.py"
- "!docs/**/*.py"
Expand Down Expand Up @@ -124,10 +125,10 @@ jobs:
include:
- topic: document_stores
os: ubuntu-latest
dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
- topic: document_stores
os: windows-latest
dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,onnx,beir,metrics,aws,dev
dependencies: elasticsearch8,faiss,weaviate,pinecone,opensearch,inference,crawler,preprocessing,file-conversion,pdf,ocr,metrics,dev
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -329,7 +330,7 @@ jobs:
runs-on: ${{ matrix.os }}
services:
elasticsearch:
image: elasticsearch:8.8.0
image: elasticsearch:8.10.2
env:
discovery.type: "single-node"
xpack.security.enabled: "false"
Expand All @@ -346,9 +347,36 @@ jobs:
- name: Install Haystack
run: pip install .[elasticsearch8,dev,preprocessing,inference]

- name: Make elasticsearch comfortable with a disk almost full
run: |
curl -X PUT "localhost:9200/_cluster/settings?pretty" -H 'Content-Type: application/json' -d'
{
"persistent": {
"cluster.routing.allocation.disk.watermark.low": "90%",
"cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
"cluster.routing.allocation.disk.watermark.high": "95%",
"cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
"cluster.routing.allocation.disk.watermark.flood_stage": "97%",
"cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
"cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
"cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
}
}
'
curl -X PUT "localhost:9200/*/_settings?expand_wildcards=all&pretty" -H 'Content-Type: application/json' -d'
{
"index.blocks.read_only_allow_delete": null
}
'
- name: Run tests
run: |
pytest --maxfail=5 -m "document_store and integration" test/document_stores/test_elasticsearch.py
pytest -x -m"document_store and integration" test/document_stores/test_elasticsearch.py
- name: logs
if: failure()
run: |
docker logs "${{ job.services.elasticsearch.id }}"
- name: Calculate alert data
id: calculator
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/tests_skipper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ on:
- ready_for_review
paths-ignore:
- "**.py"
- "pyproject.toml"
- "!haystack/preview/**/*.py" # See tests_preview.yml
- "!test/preview/**/*.py" # See tests_preview.yml
- "pyproject.toml"
- "!e2e/preview/**/*.py" # See e2e_preview.yml
- "!.github/**/*.py"
- "!rest_api/**/*.py"
- "!docs/**/*.py"
Expand Down
52 changes: 51 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,57 @@
| Meta | ![Discord](https://img.shields.io/discord/993534733298450452?logo=discord) ![Twitter Follow](https://img.shields.io/twitter/follow/deepset_ai) |
</div>

[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.
[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case.

## Quickstart

Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data.

Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach 👇

First, run the minimal Haystack installation:

```sh
pip install farm-haystack
```

Then, index your data into the DocumentStore, build a RAG pipeline, and ask a question about your data:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import build_pipeline, add_example_data, print_answers

# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
provider = "openai"
API_KEY = "sk-..." # ADD YOUR KEY HERE

# We support many different databases. Here, we load a simple and lightweight in-memory database.
document_store = InMemoryDocumentStore(use_bm25=True)

# Download and add Game of Thrones TXT articles to Haystack DocumentStore.
# You can also provide a folder with your local documents.
add_example_data(document_store, "data/GoT_getting_started")

# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt.
pipeline = build_pipeline(provider, API_KEY, document_store)

# Ask a question on the data you just added.
result = pipeline.run(query="Who is the father of Arya Stark?")

# For details, like which documents were used to generate the answer, look into the <result> object
print_answers(result, details="medium")
```

The output of the pipeline will reference the documents used to generate the answer:

```
'Query: Who is the father of Arya Stark?'
'Answers:'
[{'answer': 'The father of Arya Stark is Lord Eddard Stark of '
'Winterfell. [Document 1, Document 4, Document 5]'}]
```

Congratulations, you have just built your first Haystack app!

## Core Concepts

Expand Down
1 change: 1 addition & 0 deletions annotation_tool/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ services:
# DEFAULT_ADMIN_PASSWORD: "DEMO_PASSWORD"
# COOKIE_KEYS: "somesafecookiekeys"
# JWT_SECRET: "somesafesecret"
# DOMAIN_WHITELIST: "*"
ports:
- "7001:7001"
links:
Expand Down
36 changes: 25 additions & 11 deletions e2e/preview/pipelines/test_extractive_qa_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,39 @@
import json

from haystack.preview import Pipeline, Document
from haystack.preview.document_stores import MemoryDocumentStore
from haystack.preview.components.retrievers import MemoryBM25Retriever
from haystack.preview.document_stores import InMemoryDocumentStore
from haystack.preview.components.retrievers import InMemoryBM25Retriever
from haystack.preview.components.readers import ExtractiveReader


def test_extractive_qa_pipeline():
document_store = MemoryDocumentStore()
def test_extractive_qa_pipeline(tmp_path):
# Create the pipeline
qa_pipeline = Pipeline()
qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
qa_pipeline.connect("retriever", "reader")

# Draw the pipeline
qa_pipeline.draw(tmp_path / "test_extractive_qa_pipeline.png")

# Serialize the pipeline to JSON
with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f:
print(json.dumps(qa_pipeline.to_dict(), indent=4))
json.dump(qa_pipeline.to_dict(), f)

# Load the pipeline back
with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f:
qa_pipeline = Pipeline.from_dict(json.load(f))

# Populate the document store
documents = [
Document(text="My name is Jean and I live in Paris."),
Document(text="My name is Mark and I live in Berlin."),
Document(text="My name is Giorgio and I live in Rome."),
]
qa_pipeline.get_component("retriever").document_store.write_documents(documents)

document_store.write_documents(documents)

qa_pipeline = Pipeline()
qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever")
qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
qa_pipeline.connect("retriever", "reader")

# Query and assert
questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
answers_spywords = ["Jean", "Mark", "Giorgio"]

Expand Down
Loading

0 comments on commit 49b7dd9

Please sign in to comment.