Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Add support for new filters declaration #6397

Merged
merged 12 commits into from
Nov 24, 2023
2 changes: 1 addition & 1 deletion e2e/preview/pipelines/test_preprocessing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
preprocessing_pipeline.add_component(
instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router"
instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
)
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
preprocessing_pipeline.add_component(
Expand Down
41 changes: 34 additions & 7 deletions haystack/preview/components/routers/metadata_router.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Dict, List

from haystack.preview import component, Document
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert


@component
Expand All @@ -19,12 +19,36 @@ def __init__(self, rules: Dict[str, Dict]):
follow the format of filtering expressions in Haystack. For example:
```python
{
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
"edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}},
"edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}},
}
```
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
"edge_3": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
],
},
"edge_4": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
],
},
}
```
"""
self.rules = rules
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
Expand All @@ -43,6 +67,9 @@ def run(self, documents: List[Document]):
for document in documents:
cur_document_matched = False
for edge, rule in self.rules.items():
if "operator" not in rule:
# Must be a legacy filter, convert it
rule = convert(rule)
if document_matches_filter(rule, document):
output[edge].append(document)
cur_document_matched = True
Expand Down
82 changes: 15 additions & 67 deletions haystack/preview/document_stores/in_memory/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from haystack.preview.document_stores.decorator import document_store
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import DuplicatePolicy
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert
from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError
from haystack.preview.utils import expit

Expand Down Expand Up @@ -92,75 +92,15 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
"""
Returns the documents that match the filters provided.

Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.

Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.

Example:

```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```

To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.

Example:

```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.

:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
"""
if filters:
return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
if "operator" not in filters:
filters = convert(filters)
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
return list(self.storage.values())

def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
Expand Down Expand Up @@ -220,9 +160,17 @@ def bm25_retrieval(
if not query:
raise ValueError("Query should be a non-empty string")

content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}}
content_type_filter = {
"operator": "OR",
"conditions": [
{"field": "content", "operator": "!=", "value": None},
{"field": "dataframe", "operator": "!=", "value": None},
],
}
if filters:
filters = {"$and": [content_type_filter, filters]}
if "operator" not in filters:
filters = convert(filters)
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
else:
filters = content_type_filter
all_documents = self.filter_documents(filters=filters)
Expand Down
99 changes: 47 additions & 52 deletions haystack/preview/document_stores/protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,69 +51,64 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
"""
Returns the documents that match the filters provided.

Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.
Filters are defined as nested dictionaries that can be of two types:
- Comparison
- Logic

Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.
Comparison dictionaries must contain the keys:

Example:
- `field`
- `operator`
- `value`

```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
Logic dictionaries must contain the keys:

- `operator`
- `conditions`

The value of the `conditions` key must be a list of dictionaries, each of type Comparison or Logic.

To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.
The `operator` value in Comparison dictionaries must be one of:

Example:
- `==`
- `!=`
- `>`
- `>=`
- `<`
- `<=`
- `in`
- `not in`

The `operator` value in Logic dictionaries must be one of:

- `NOT`
- `OR`
- `AND`


A simple filter:
```python
filters = {"field": "meta.type", "operator": "==", "value": "article"}
```

A more complex filter:
```python
filters = {
"$or": [
"operator": "AND",
"conditions": [
{"field": "meta.type", "operator": "==", "value": "article"},
{"field": "meta.date", "operator": ">=", "value": 1420066800},
{"field": "meta.date", "operator": "<", "value": 1609455600},
{"field": "meta.rating", "operator": ">=", "value": 3},
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
"operator": "OR",
"conditions": [
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
],
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
],
}
```

:param filters: the filters to apply to the document list.
:return: a list of Documents that match the given filters.
Expand Down
25 changes: 15 additions & 10 deletions haystack/preview/testing/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def document_store(self):
@pytest.mark.unit
def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
with pytest.raises(FilterError):
with pytest.raises(ValueError):
document_store.filter_documents(filters="something odd") # type: ignore

@pytest.mark.unit
Expand Down Expand Up @@ -574,7 +574,9 @@ def document_store(self):
def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lt": 0.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0
]

@pytest.mark.unit
def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
Expand Down Expand Up @@ -614,7 +616,9 @@ def document_store(self):
def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lte": 2.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0
]

@pytest.mark.unit
def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
Expand Down Expand Up @@ -658,7 +662,8 @@ def test_filter_simple_or(self, document_store: DocumentStore, filterable_docs:
assert result == [
doc
for doc in filterable_docs
if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"])
if (doc.meta.get("number") is not None and doc.meta["number"] < 1)
or doc.meta.get("name") in ["name_0", "name_1"]
]

@pytest.mark.unit
Expand Down Expand Up @@ -733,7 +738,10 @@ def test_filter_nested_or(self, document_store: DocumentStore, filterable_docs:
assert result == [
doc
for doc in filterable_docs
if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1))
if (
doc.meta.get("name") in ["name_0", "name_1"]
or (doc.meta.get("number") is not None and doc.meta["number"] < 1)
)
]

@pytest.mark.unit
Expand Down Expand Up @@ -783,11 +791,8 @@ def test_filter_nested_or_and(self, document_store: DocumentStore, filterable_do
doc
for doc in filterable_docs
if (
("number" in doc.meta and doc.meta["number"] < 1)
or (
doc.meta.get("name") in ["name_0", "name_1"]
and ("chapter" in doc.meta and doc.meta["chapter"] != "intro")
)
(doc.meta.get("number") is not None and doc.meta["number"] < 1)
or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro"))
)
]

Expand Down
Loading