Skip to content

Commit

Permalink
community[patch]: Advanced filtering for HANA Cloud Vector Engine (#2…
Browse files Browse the repository at this point in the history
…0821)

- **Description:**
This PR adds support for advanced filtering to the integration of HANA
Vector Engine.
The newly supported filtering operators are: $eq, $ne, $gt, $gte, $lt,
$lte, $between, $in, $nin, $like, $and, $or

  - **Issue:** N/A
  - **Dependencies:** no new dependencies added

Added integration tests to:
`libs/community/tests/integration_tests/vectorstores/test_hanavector.py`

Description of the new capabilities in notebook:
`docs/docs/integrations/vectorstores/hanavector.ipynb`
  • Loading branch information
MartinKolbAtWork authored Apr 24, 2024
1 parent 12e5ec6 commit 0186e4e
Show file tree
Hide file tree
Showing 3 changed files with 448 additions and 13 deletions.
173 changes: 173 additions & 0 deletions docs/docs/integrations/vectorstores/sap_hanavector.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,179 @@
"print(len(docs))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Advanced filtering\n",
"In addition to the basic value-based filtering capabilities, it is possible to use more advanced filtering.\n",
"The table below shows the available filter operators.\n",
"\n",
"| Operator | Semantics |\n",
"|----------|-------------------------|\n",
"| `$eq` | Equality (==) |\n",
"| `$ne` | Inequality (!=) |\n",
"| `$lt` | Less than (<) |\n",
"| `$lte` | Less than or equal (<=) |\n",
"| `$gt` | Greater than (>) |\n",
"| `$gte` | Greater than or equal (>=) |\n",
"| `$in` | Contained in a set of given values (in) |\n",
"| `$nin` | Not contained in a set of given values (not in) |\n",
"| `$between` | Within the range of two boundary values (both boundaries inclusive) |\n",
"| `$like` | Text pattern matching based on the \"LIKE\" semantics in SQL (using \"%\" as wildcard) |\n",
"| `$and` | Logical \"and\", supporting 2 or more operands |\n",
"| `$or` | Logical \"or\", supporting 2 or more operands |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare some test documents\n",
"docs = [\n",
" Document(\n",
" page_content=\"First\",\n",
" metadata={\"name\": \"adam\", \"is_active\": True, \"id\": 1, \"height\": 10.0},\n",
" ),\n",
" Document(\n",
" page_content=\"Second\",\n",
" metadata={\"name\": \"bob\", \"is_active\": False, \"id\": 2, \"height\": 5.7},\n",
" ),\n",
" Document(\n",
" page_content=\"Third\",\n",
" metadata={\"name\": \"jane\", \"is_active\": True, \"id\": 3, \"height\": 2.4},\n",
" ),\n",
"]\n",
"\n",
"db = HanaDB(\n",
" connection=connection,\n",
" embedding=embeddings,\n",
" table_name=\"LANGCHAIN_DEMO_ADVANCED_FILTER\",\n",
")\n",
"\n",
"# Delete already existing documents from the table\n",
"db.delete(filter={})\n",
"db.add_documents(docs)\n",
"\n",
"\n",
"# Helper function for printing filter results\n",
"def print_filter_result(result):\n",
" if len(result) == 0:\n",
" print(\"<empty result>\")\n",
" for doc in result:\n",
" print(doc.metadata)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Filtering with `$ne`, `$gt`, `$gte`, `$lt`, `$lte`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"advanced_filter = {\"id\": {\"$ne\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"id\": {\"$gt\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"id\": {\"$gte\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"id\": {\"$lt\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"id\": {\"$lte\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Filtering with `$between`, `$in`, `$nin`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"advanced_filter = {\"id\": {\"$between\": (1, 2)}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"name\": {\"$in\": [\"adam\", \"bob\"]}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"name\": {\"$nin\": [\"adam\", \"bob\"]}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Text filtering with `$like`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"advanced_filter = {\"name\": {\"$like\": \"a%\"}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"name\": {\"$like\": \"%a%\"}}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Combined filtering with `$and`, `$or`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"advanced_filter = {\"$or\": [{\"id\": 1}, {\"name\": \"bob\"}]}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"$and\": [{\"id\": 1}, {\"id\": 2}]}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))\n",
"\n",
"advanced_filter = {\"$or\": [{\"id\": 1}, {\"id\": 2}, {\"id\": 3}]}\n",
"print(f\"Filter: {advanced_filter}\")\n",
"print_filter_result(db.similarity_search(\"just testing\", k=5, filter=advanced_filter))"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
116 changes: 104 additions & 12 deletions libs/community/langchain_community/vectorstores/hanavector.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Expand All @@ -34,6 +35,27 @@
DistanceStrategy.EUCLIDEAN_DISTANCE: ("L2DISTANCE", "ASC"),
}

# Filter operators that map one-to-one onto a binary SQL comparison.
COMPARISONS_TO_SQL = {
    "$eq": "=",
    "$ne": "<>",
    "$lt": "<",
    "$lte": "<=",
    "$gt": ">",
    "$gte": ">=",
}

# Set-membership filter operators and their SQL keywords.
IN_OPERATORS_TO_SQL = {
    "$in": "IN",
    "$nin": "NOT IN",
}

# Range operator; its operand supplies the lower and upper boundary.
BETWEEN_OPERATOR = "$between"

# Pattern operator; translated to the SQL "LIKE" keyword.
LIKE_OPERATOR = "$like"

# Logical operators that combine a list of nested filter objects.
LOGICAL_OPERATORS_TO_SQL = {
    "$and": "AND",
    "$or": "OR",
}


default_distance_strategy = DistanceStrategy.COSINE
default_table_name: str = "EMBEDDINGS"
default_content_column: str = "VEC_TEXT"
Expand Down Expand Up @@ -404,29 +426,99 @@ def similarity_search_by_vector( # type: ignore[override]
return [doc for doc, _ in docs_and_scores]

def _create_where_by_filter(self, filter): # type: ignore[no-untyped-def]
query_tuple = []
where_str = ""
if filter:
where_str, query_tuple = self._process_filter_object(filter)
where_str = " WHERE " + where_str
return where_str, query_tuple

def _process_filter_object(self, filter): # type: ignore[no-untyped-def]
query_tuple = []
where_str = ""
if filter:
for i, key in enumerate(filter.keys()):
if i == 0:
where_str += " WHERE "
else:
filter_value = filter[key]
if i != 0:
where_str += " AND "

where_str += f" JSON_VALUE({self.metadata_column}, '$.{key}') = ?"

if isinstance(filter[key], bool):
if filter[key]:
query_tuple.append("true")
# Handling of 'special' boolean operators "$and", "$or"
if key in LOGICAL_OPERATORS_TO_SQL:
logical_operator = LOGICAL_OPERATORS_TO_SQL[key]
logical_operands = filter_value
for j, logical_operand in enumerate(logical_operands):
if j != 0:
where_str += f" {logical_operator} "
(
where_str_logical,
query_tuple_logical,
) = self._process_filter_object(logical_operand)
where_str += where_str_logical
query_tuple += query_tuple_logical
continue

operator = "="
sql_param = "?"

if isinstance(filter_value, bool):
query_tuple.append("true" if filter_value else "false")
elif isinstance(filter_value, int) or isinstance(filter_value, str):
query_tuple.append(filter_value)
elif isinstance(filter_value, Dict):
# Handling of 'special' operators starting with "$"
special_op = next(iter(filter_value))
special_val = filter_value[special_op]
# "$eq", "$ne", "$lt", "$lte", "$gt", "$gte"
if special_op in COMPARISONS_TO_SQL:
operator = COMPARISONS_TO_SQL[special_op]
if isinstance(special_val, bool):
query_tuple.append("true" if filter_value else "false")
elif isinstance(special_val, float):
sql_param = "CAST(? as float)"
query_tuple.append(special_val)
else:
query_tuple.append(special_val)
# "$between"
elif special_op == BETWEEN_OPERATOR:
between_from = special_val[0]
between_to = special_val[1]
operator = "BETWEEN"
sql_param = "? AND ?"
query_tuple.append(between_from)
query_tuple.append(between_to)
# "$like"
elif special_op == LIKE_OPERATOR:
operator = "LIKE"
query_tuple.append(special_val)
# "$in", "$nin"
elif special_op in IN_OPERATORS_TO_SQL:
operator = IN_OPERATORS_TO_SQL[special_op]
if isinstance(special_val, list):
for i, list_entry in enumerate(special_val):
if i == 0:
sql_param = "("
sql_param = sql_param + "?"
if i == (len(special_val) - 1):
sql_param = sql_param + ")"
else:
sql_param = sql_param + ","
query_tuple.append(list_entry)
else:
raise ValueError(
f"Unsupported value for {operator}: {special_val}"
)
else:
query_tuple.append("false")
elif isinstance(filter[key], int) or isinstance(filter[key], str):
query_tuple.append(filter[key])
raise ValueError(f"Unsupported operator: {special_op}")
else:
raise ValueError(
f"Unsupported filter data-type: {type(filter[key])}"
f"Unsupported filter data-type: {type(filter_value)}"
)

where_str += (
f" JSON_VALUE({self.metadata_column}, '$.{key}')"
f" {operator} {sql_param}"
)

return where_str, query_tuple

def delete( # type: ignore[override]
Expand Down
Loading

0 comments on commit 0186e4e

Please sign in to comment.