Skip to content

Commit

Permalink
feat: Add advanced search API (#438)
Browse files Browse the repository at this point in the history
* feat: add advanced search on entry nodes API

* feat: add pagination on search API

* fix: add text search order in search API

* fix: fix multi-word search

* refactor: enable multi-word whitespaced search

* chore: update Client SDK

* chore: update frontend to support new API

* style: lint backend

* feat: return filters AST in search API call

* refactor: simplify filter search term parsing

* refactor: respond to comments

* chore: regenerate SDK

* style: lint

* docs: add function doc

* fix: reorder search response

* refactor: add parsed query string to search response

* docs: add code docs

* docs: add filter docs
  • Loading branch information
eric-nguyen-cs authored Mar 21, 2024
1 parent e60b525 commit 9c87d1b
Show file tree
Hide file tree
Showing 23 changed files with 909 additions and 241 deletions.
33 changes: 18 additions & 15 deletions backend/editor/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
# Required imports
# ------------------------------------------------------------------------------------#
from datetime import datetime
from typing import Optional
from typing import Annotated, Optional

# FastAPI
from fastapi import (
BackgroundTasks,
FastAPI,
Form,
HTTPException,
Query,
Request,
Response,
UploadFile,
Expand All @@ -31,7 +32,7 @@
from . import graph_db

# Controller imports
from .controllers import project_controller
from .controllers import project_controller, search_controller
from .entries import TaxonomyGraph

# Custom exceptions
Expand All @@ -40,6 +41,7 @@
# Data model imports
from .models.node_models import EntryNodeCreate, ErrorNode, Footer, Header, NodeType
from .models.project_models import Project, ProjectEdit, ProjectStatus
from .models.search_models import EntryNodeSearchResult
from .scheduler import scheduler_lifespan

# -----------------------------------------------------------------------------------#
Expand Down Expand Up @@ -231,16 +233,6 @@ async def find_one_entry_children(response: Response, branch: str, taxonomy_name
return one_entry_children


@app.get("/{taxonomy_name}/{branch}/entry")
async def find_all_entries(response: Response, branch: str, taxonomy_name: str):
"""
Get all entries within taxonomy
"""
taxonomy = TaxonomyGraph(branch, taxonomy_name)
all_entries = await taxonomy.get_all_nodes("ENTRY")
return all_entries


@app.get("/{taxonomy_name}/{branch}/synonym/{synonym}")
async def find_one_synonym(response: Response, branch: str, taxonomy_name: str, synonym: str):
"""
Expand Down Expand Up @@ -317,10 +309,21 @@ async def find_all_errors(branch: str, taxonomy_name: str) -> ErrorNode:
return result


@app.get("/{taxonomy_name}/{branch}/search")
async def search_node(response: Response, branch: str, taxonomy_name: str, query: str):
@app.get("/{taxonomy_name}/{branch}/nodes/entry")
async def search_entry_nodes(
branch: str,
taxonomy_name: str,
q: Annotated[
str,
Query(
description="The search query string to filter down the returned entry nodes.\
Example: is:root language:en not(language):fr"
),
] = "",
page: int = 1,
) -> EntryNodeSearchResult:
taxonomy = TaxonomyGraph(branch, taxonomy_name)
result = await taxonomy.full_text_search(query)
result = await search_controller.search_entry_nodes(taxonomy.project_name, q, page)
return result


Expand Down
310 changes: 310 additions & 0 deletions backend/editor/controllers/search_controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
import math
from dataclasses import dataclass

from openfoodfacts_taxonomy_parser import utils as parser_utils
from pydantic import ValidationError

from ..graph_db import get_current_transaction
from ..models.node_models import EntryNode
from ..models.search_models import (
CypherQuery,
EntryNodeSearchResult,
FilterSearchTerm,
FilterSearchTermValidator,
)


def get_query_param_name_prefix(index: int) -> str:
return f"value_{index}"


@dataclass(frozen=True)
class Query:
project_id: str
search_terms: list[str]
name_search_terms: list[str]
filter_search_terms: list[FilterSearchTerm]


def split_query_into_search_terms(query: str) -> list[str]:
"""
Queries should be split by whitespaces that are not inside quotes
"""
query = query.strip()
search_terms = []

inside_quotes = False
term_start = 0

for term_end in range(len(query)):
if query[term_end] == '"':
inside_quotes = not inside_quotes
# If we are not inside quotes and we encounter a whitespace
# we are at the end of the current search term
elif query[term_end] == " " and not inside_quotes:
# If the term is not empty, we add it to the list of search terms
if term_start != term_end:
search_term = query[term_start:term_end]
search_terms.append(search_term)
term_start = term_end + 1

search_terms.append(query[term_start:])

return search_terms


def parse_filter_search_term(search_term: str) -> FilterSearchTerm | None:
"""
Parses a filter search term of the format `filter:value` if possible
OR
Returns None
"""

if ":" not in search_term:
return None

filter_name, filter_value = search_term.split(":", maxsplit=1)

if filter_value.startswith('"') and filter_value.endswith('"'):
filter_value = filter_value[1:-1]

# If the filter value contains quotes, it is invalid
if '"' in filter_value:
return None

try:
# dispatch according to filter_name
return FilterSearchTermValidator.validate_python(
{"filter_type": filter_name, "filter_value": filter_value}
)
except ValidationError:
return None


def validate_query(project_id: str, query: str) -> Query:
"""
A query is composed of search terms separated by whitespaces.
A search term is either a name search term or a filter search term.
A filter search term is of the format `filter:value` where `filter` is a valid filter value
and `value` is a valid search value for the particular filter.
The `value` is surrounded by quotes if it contains whitespaces.
The value cannot contain quotes.
All other terms are considered name search terms.
The name search term allows for a text search on a node's tags.
The possible filters are:
- `is`: `root`, `external` and `not:external` are the only possible values.
It allows to filter on the root and external nodes.
- `language`: the value is a language code. It allows to filter on
if the language exists or not on the node.
You can negate the filter with the not:lc syntax.
- `parent`: the value is a node's id. It allows to filter on if the node is a
parent of the node with the given id.
- `child`: the value is a node's id. It allows to filter on if the node is a child of
the node with the given id.
- `ancestor`: the value is a node's id. It allows to filter on if the node is an ancestor
of the node with the given id.
- `descendant`: the value is a node's id. It allows to filter on if the node is a descendant
of the node with the given id.
- `property`: the value is a property name and an optional value (property_name:value).
It allows to filter on if the node has the given property and if the property has the
given value if it is provided. You can add the `not:inherited:` prefix to the filter to
negate it or to also search on parent nodes for inherited properties.
Examples:
- "is:root language:en not(language):fr property:inherited:vegan:en:yes"
- "is:not:external parent:"en:apple juice" descendant:en:juices "fruit concentrate""
"""

search_terms = split_query_into_search_terms(query)

parsed_search_terms = []
name_search_terms = []
filter_search_terms = []

for search_term in search_terms:
if (filter_search_term := parse_filter_search_term(search_term)) is not None:
filter_search_terms.append(filter_search_term)
parsed_search_terms.append(filter_search_term.to_query_string())
else:
name_search_terms.append(search_term)
parsed_search_terms.append(search_term)

return Query(project_id, parsed_search_terms, name_search_terms, filter_search_terms)


def _get_token_query(token: str) -> str:
"""
Returns the lucene query for a token.
The tokens are additive and the fuzziness of the search depends on the length of the token.
"""

token = "+" + token
if len(token) > 10:
return token + "~2"
elif len(token) > 4:
return token + "~1"
else:
return token


def build_lucene_name_search_query(search_value: str) -> str | None:
"""
The name search term can trigger two types of searches:
- if the search value is in the format `language_code:raw_search_value`,
it triggers a search on the tags_ids_{language_code} index
- else it triggers a search on the tags_ids index
If the `raw_search_value` is surrounded by quotes, the search will be exact.
Otherwise, the search is fuzzy when the search value is longer than 4 characters
(the edit distance depends of the length of the search value)
"""
language_code = None

# get an eventual language prefix
if len(search_value) > 2 and search_value[2] == ":" and search_value[0:2].isalpha():
language_code, search_value = search_value.split(":", maxsplit=1)
language_code = language_code.lower()

def get_search_query() -> str | None:
if search_value.startswith('"') and search_value.endswith('"'):
return search_value if len(search_value) > 2 else None

if language_code is not None:
normalized_text = parser_utils.normalize_text(search_value, language_code)
else:
normalized_text = parser_utils.normalize_text(search_value)

# If normalized text is empty, no searches are found
if normalized_text.strip() == "":
return None

tokens = normalized_text.split("-")

return "(" + " ".join(map(_get_token_query, tokens)) + ")"

search_query = get_search_query()

if search_query is None:
return None

if language_code is not None:
search_query = f"tags_ids_{language_code}:{search_query}"

return search_query


def build_cypher_query(query: Query, skip: int, limit: int) -> tuple[str, str, dict[str, str]]:
# build part of the query doing full text search
lucene_name_search_queries = list(
filter(
lambda q: q is not None, map(build_lucene_name_search_query, query.name_search_terms)
)
)

# build part of the query for filter:value members
cypher_filter_search_terms = [
term.build_cypher_query(get_query_param_name_prefix(index))
for index, term in enumerate(query.filter_search_terms)
]

full_text_search_query, order_clause = "", "WITH n ORDER BY n.is_external, n.id"
query_params = {}

if lucene_name_search_queries:
SEARCH_QUERY_PARAM_NAME = "search_query"
MIN_SEARCH_SCORE = 0.1

full_text_search_query = f"""
CALL db.index.fulltext.queryNodes("{query.project_id}_SearchTagsIds",
${SEARCH_QUERY_PARAM_NAME})
YIELD node, score
WHERE score > {MIN_SEARCH_SCORE}
WITH node.id AS nodeId
WITH COLLECT(nodeId) AS nodeIds
"""
query_params[SEARCH_QUERY_PARAM_NAME] = " AND ".join(lucene_name_search_queries)

order_clause = (
"WITH n, apoc.coll.indexOf(nodeIds, n.id) AS index ORDER BY index, n.is_external"
)

name_filter_search_term = "n.id IN nodeIds"
cypher_filter_search_terms.append(CypherQuery(name_filter_search_term))

for cypher_filter_search_term in cypher_filter_search_terms:
query_params |= cypher_filter_search_term.params

combined_filter_query = (
f"WHERE {' AND '.join([cypher_query.query for cypher_query in cypher_filter_search_terms])}"
if cypher_filter_search_terms
else ""
)

base_query = f"""
{full_text_search_query}
MATCH (n:{query.project_id}:ENTRY)
{combined_filter_query}
"""

page_subquery = f"""
{order_clause}
WITH collect(n) AS nodeList, count(n) AS nodeCount
UNWIND nodeList AS node
WITH node, nodeCount
SKIP {skip} LIMIT {limit}
WITH collect(node) AS nodeList, nodeCount
RETURN nodeList, nodeCount;
"""

count_subquery = """
RETURN count(n) AS nodeCount;
"""

page_query = base_query + page_subquery
count_query = base_query + count_subquery

return page_query, count_query, query_params


async def search_entry_nodes(project_id: str, raw_query: str, page: int) -> EntryNodeSearchResult:
"""
Search for entry nodes in the database
"""
query = validate_query(project_id, raw_query)

parsed_query_string = " ".join(query.search_terms)
# For better UX on the search bar
if parsed_query_string != "":
parsed_query_string += " "

PAGE_LENGTH = 50
skip = max(0, (page - 1) * PAGE_LENGTH)

cypher_query = build_cypher_query(query, skip, PAGE_LENGTH)

page_query, count_query, query_params = cypher_query

result = await get_current_transaction().run(page_query, query_params)
search_result = await result.single()

if search_result is None:
count_result = await get_current_transaction().run(count_query, query_params)
node_count = (await count_result.single())["nodeCount"]
return EntryNodeSearchResult(
node_count=node_count,
page_count=math.ceil(node_count / PAGE_LENGTH),
q=parsed_query_string,
filters=query.filter_search_terms,
)

node_count, nodes = search_result["nodeCount"], search_result["nodeList"]
return EntryNodeSearchResult(
node_count=node_count,
page_count=math.ceil(node_count / PAGE_LENGTH),
q=parsed_query_string,
filters=query.filter_search_terms,
nodes=[EntryNode(**node) for node in nodes],
)
Loading

0 comments on commit 9c87d1b

Please sign in to comment.