Merge pull request #44 from USCbiostats/keyword-search-and-bug-fixes

Keyword search and bug fixes
USCbiostats · Oct 2, 2024 · 6866d57 · 6866d57
2 parents 339e4e8 + 5d1bf11
commit 6866d57
Show file tree

Hide file tree

Showing 6 changed files with 241 additions and 96 deletions.
diff --git a/src/graphql/models/annotation_model.py b/src/graphql/models/annotation_model.py
@@ -23,7 +23,7 @@ class DocCount:
 
 @strawberry.input
 class Histogram:
+    # Input for a histogram aggregation: `interval` is the bucket interval and
+    # `min`/`max` are passed as the aggregation's extended bounds.
+    # `interval` is annotated as float because its default (4893.27) is not an integer.
-    interval: Optional[int] = 4893.27
+    interval: Optional[float] = 4893.27
     min: Optional[int] = 10636
     max: Optional[int] = 499963
 

diff --git a/src/graphql/resolvers/count_resolver.py b/src/graphql/resolvers/count_resolver.py
@@ -1,7 +1,7 @@
 from ...config.es import es
 from ...config.settings import settings
-from src.graphql.models.annotation_model import FilterArgs, PageArgs
-from .helper_resolver import IDs_query, annotation_query, chromosome_query, convert_hits, gene_query, get_aggregation_query, rsID_query, rsIDs_query
+from src.graphql.models.annotation_model import FilterArgs
+from .helper_resolver import IDs_query, annotation_query, chromosome_query, gene_query, keyword_query, rsID_query, rsIDs_query
 
 
 async def get_annotations_count():
@@ -106,4 +106,19 @@ async def count_by_gene(gene:str, filter_args=FilterArgs):
         )
         return resp['count']
 
-      return 0
+      return 0
+
+async def count_by_keyword(keyword: str):
+      """
+      Count the annotations that match a keyword search
+
+      Params:
+            keyword: Keyword to search
+
+      Returns: integer for count of annotations
+      """
+      # The count request runs against the configured index using the keyword query
+      count_response = await es.count(index=settings.ES_INDEX, query=keyword_query(keyword))
+      return count_response['count']
diff --git a/src/graphql/resolvers/helper_resolver.py b/src/graphql/resolvers/helper_resolver.py
@@ -1,7 +1,10 @@
+import inspect
+import json
+import typing
 from typing import Dict
 from src.graphql.gene_pos import get_pos_from_gene_id, map_gene, chromosomal_location_dic
 from src.graphql.models.snp_model import ScrollSnp, Snp, SnpAggs
-from src.graphql.models.annotation_model import AggregationItem, Bucket, DocCount, Histogram
+from src.graphql.models.annotation_model import AggregationItem, Bucket, DocCount, FilterArgs, Histogram
 
 from src.utils import clean_field_name
 
@@ -224,61 +227,107 @@ def gene_query(gene, filter_args=None):
 
     return None
 
-async def get_aggregation_query(es_fields: list[str], histogram: Histogram):
+def keyword_query(keyword: str):
     """
-    Query for getting aggregates of annotation
+    Build the multi_match query for searching annotations by keyword
 
-    Params: es_fields: List of fields to be returned in elasticsearch query
-            histogram: Histogram object for histogram aggregation
+    Params: keyword: Keyword for search
+
+    The searched fields are read from ./data/anno_tree.json on every call: only
+    entries whose "keyword_searchable" flag is truthy contribute their "name".
 
     Returns: Query for elasticsearch
     """
-    results = dict()
-    for field in es_fields:
-
-        results[f'{field}_doc_count'] = {
-           "filter" : {
-            "exists": {
-              "field": field
-            }
-           }
-        }
 
-        results[f'{field}_min'] = {
-          "min": {
-            "field": "pos"
+    # The path is relative to the process working directory
+    with open('./data/anno_tree.json', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # The flag lives on each entry, so it is read from `elt`; reading it from the
+    # loaded container `data` fails for a list and never filters per entry.
+    searchable_fields = [elt['name'] for elt in data if elt.get('keyword_searchable', False)]
+
+    query = {
+              "multi_match": {
+                "query": keyword,
+                "fields": searchable_fields
+              }
           }
-       }
 
-        results[f'{field}_max'] = {
-            "max": {
-              "field": "pos"
-            }
-        }
+    return query
 
-        results[f'{field}_frequency'] = {
-          "terms": {
-            "field": "pos",
-            "min_doc_count": 0,
-            "size": 20
-          }
-       }
-
-        results[f'{field}_missing'] = {
-          "missing": {
-              "field": "pos"
-            }
-       }
-
-        results[f'{field}_histogram'] = {
-          "histogram": {
-            "field": "pos",
-            "interval": histogram.interval,
-            "extended_bounds": {
-              "min": histogram.min,
-              "max": histogram.max
-            }
-          }
-       }
+async def get_aggregation_query(aggregation_fields: list[tuple[str, list[str]]], histogram: Histogram):
+    """
+    Query for getting aggregates of annotation
+
+    Params: aggregation_fields: List of fields for aggregation, along with their subfields
+            histogram: Histogram object for histogram aggregation
+
+    Each requested subfield ('doc_count', 'min', 'max', 'frequency', 'missing',
+    'histogram') adds one aggregation keyed '<field>_<subfield>'; any other
+    subfield name is ignored.
+
+    Returns: Query for elasticsearch
+
+    Raises: KeyError if a field has no annotation on Snp
+    """
+    # Loop-invariant: read the Snp annotations once instead of once per field
+    snp_annotations = inspect.get_annotations(Snp)
+
+    results = dict()
+    for field, subfields in aggregation_fields:
+
+        # Check the type of the field. If it is a string, then we have to add .keyword to the field name while querying missing and frequency
+        # A field counts as text when it is annotated as str or as a generic whose first
+        # type argument is str (e.g. Optional[str]). A non-generic annotation has no type
+        # arguments, so the argument tuple must not be indexed unconditionally.
+        annotation = snp_annotations[field]
+        type_args = typing.get_args(annotation)
+        is_text_field = annotation is str or (len(type_args) > 0 and type_args[0] == str)
+        textual_suffix = '.keyword' if is_text_field else ''
+
+        for subfield in subfields:
+            if subfield == 'doc_count':
+                results[f'{field}_doc_count'] = {
+                    "filter" : {
+                        "exists": {
+                            "field": field
+                        }
+                    }
+                }
+
+            elif subfield == 'min':
+                results[f'{field}_min'] = {
+                    "min": {
+                        "field": field
+                    }
+                }
+
+            elif subfield == 'max':
+                results[f'{field}_max'] = {
+                    "max": {
+                        "field": field
+                    }
+                }
+
+            elif subfield == 'frequency':
+                results[f'{field}_frequency'] = {
+                    "terms": {
+                        "field": field + textual_suffix,
+                        "min_doc_count": 0,
+                        "size": 20
+                    }
+                }
+
+            elif subfield == 'missing':
+                results[f'{field}_missing'] = {
+                    "missing": {
+                        "field": field + textual_suffix
+                    }
+                }
+
+            elif subfield == 'histogram':
+                # Bucket interval and bounds come straight from the Histogram input
+                results[f'{field}_histogram'] = {
+                    "histogram": {
+                        "field": field,
+                        "interval": histogram.interval,
+                        "extended_bounds": {
+                            "min": histogram.min,
+                            "max": histogram.max
+                        }
+                    }
+                }
+
+    return results
+
+
+def get_default_aggregation_fields(es_fields: list[str]):
+    """
+    Build the default aggregation request for a list of elasticsearch fields
 
-    return results
+    Params: es_fields: List of fields to be returned in elasticsearch query
+
+    Returns: List of (field, ['doc_count']) tuples, one per entry of es_fields
+    """
+    default_fields = []
+    for es_field in es_fields:
+        # Every field gets its own fresh subfield list
+        default_fields.append((es_field, ['doc_count']))
+    return default_fields