Skip to content

Commit

Permalink
Merge pull request #44 from USCbiostats/keyword-search-and-bug-fixes
Browse files Browse the repository at this point in the history
Keyword search and bug fixes
  • Loading branch information
AyaanKakkar authored Oct 2, 2024
2 parents 339e4e8 + 5d1bf11 commit 6866d57
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 96 deletions.
2 changes: 1 addition & 1 deletion src/graphql/models/annotation_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class DocCount:

@strawberry.input
class Histogram:
    # Settings for the elasticsearch histogram aggregation.
    # `interval` is forwarded as the aggregation's "interval"; it is typed as
    # float so that the fractional default is valid for the declared type.
    interval: Optional[float] = 4893.27
    # `min` / `max` are forwarded as the aggregation's "extended_bounds".
    min: Optional[int] = 10636
    max: Optional[int] = 499963

Expand Down
21 changes: 18 additions & 3 deletions src/graphql/resolvers/count_resolver.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ...config.es import es
from ...config.settings import settings
from src.graphql.models.annotation_model import FilterArgs, PageArgs
from .helper_resolver import IDs_query, annotation_query, chromosome_query, convert_hits, gene_query, get_aggregation_query, rsID_query, rsIDs_query
from src.graphql.models.annotation_model import FilterArgs
from .helper_resolver import IDs_query, annotation_query, chromosome_query, gene_query, keyword_query, rsID_query, rsIDs_query


async def get_annotations_count():
Expand Down Expand Up @@ -106,4 +106,19 @@ async def count_by_gene(gene:str, filter_args=FilterArgs):
)
return resp['count']

return 0
return 0

async def count_by_keyword(keyword: str):
    """
    Count the annotations matched by a keyword search
    Params:
        keyword: Keyword to search
    Returns: integer for count of annotations
    """
    # Same index and query as a keyword search, but sent to the count API.
    count_args = {
        "index": settings.ES_INDEX,
        "query": keyword_query(keyword),
    }
    count_response = await es.count(**count_args)
    return count_response['count']
147 changes: 98 additions & 49 deletions src/graphql/resolvers/helper_resolver.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import inspect
import json
import typing
from typing import Dict
from src.graphql.gene_pos import get_pos_from_gene_id, map_gene, chromosomal_location_dic
from src.graphql.models.snp_model import ScrollSnp, Snp, SnpAggs
from src.graphql.models.annotation_model import AggregationItem, Bucket, DocCount, Histogram
from src.graphql.models.annotation_model import AggregationItem, Bucket, DocCount, FilterArgs, Histogram

from src.utils import clean_field_name

Expand Down Expand Up @@ -224,61 +227,107 @@ def gene_query(gene, filter_args=None):

return None

def keyword_query(keyword: str):
    """
    Query for getting annotation by keyword

    Params: keyword: Keyword for search

    Returns: Query for elasticsearch (a multi_match over every field that is
             flagged as keyword searchable in ./data/anno_tree.json)
    """
    # The annotation tree is iterated entry by entry, so it is expected to be
    # a JSON array of objects that each carry a 'name' key.
    with open('./data/anno_tree.json', encoding='utf-8') as f:
        anno_tree = json.load(f)

    # The flag must be read from each entry, not from the enclosing document:
    # calling .get() on the loaded array itself raises AttributeError.
    searchable_fields = [
        elt['name'] for elt in anno_tree if elt.get('keyword_searchable', False)
    ]

    return {
        "multi_match": {
            "query": keyword,
            "fields": searchable_fields,
        }
    }


async def get_aggregation_query(aggregation_fields: list[tuple[str, list[str]]], histogram: Histogram):
"""
Query for getting aggregates of annotation
Params: aggregation_fields: List of fields for aggregation, along with their subfields
histogram: Histogram object for histogram aggregation
Returns: Query for elasticsearch
"""
results = dict()
for field, subfields in aggregation_fields:

# Check the type of the field. If it is a string, then we have to add .keyword to the field name while querying missing and frequency
# Using the pydantic model Snp, we can check the type of the field
is_text_field = typing.get_args(inspect.get_annotations(Snp)[field])[0] == str
textual_suffix = '.keyword' if is_text_field else ''

for subfield in subfields:
if subfield == 'doc_count':
results[f'{field}_doc_count'] = {
"filter" : {
"exists": {
"field": field
}
}
}

elif subfield == 'min':
results[f'{field}_min'] = {
"min": {
"field": field
}
}

elif subfield == 'max':
results[f'{field}_max'] = {
"max": {
"field": field
}
}

elif subfield == 'frequency':
results[f'{field}_frequency'] = {
"terms": {
"field": field + textual_suffix,
"min_doc_count": 0,
"size": 20
}
}

elif subfield == 'missing':
results[f'{field}_missing'] = {
"missing": {
"field": field + textual_suffix
}
}

elif subfield == 'histogram':
results[f'{field}_histogram'] = {
"histogram": {
"field": field,
"interval": histogram.interval,
"extended_bounds": {
"min": histogram.min,
"max": histogram.max
}
}
}

return results


def get_default_aggregation_fields(es_fields: list[str]) -> list[tuple[str, list[str]]]:
    """
    Get default aggregation fields for elasticsearch query
    Params: es_fields: List of fields to be returned in elasticsearch query
    Returns: List of (field, subfields) pairs for aggregation, where every field
             is paired with the single subfield 'doc_count'
    """
    # A fresh subfield list is built per field so that no two entries share one list.
    return [(field, ['doc_count']) for field in es_fields]
Loading

0 comments on commit 6866d57

Please sign in to comment.