add ipykernel

sanjaynagi · Sep 30, 2024 · cd93ee1 · cd93ee1
1 parent f9047aa
commit cd93ee1
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 36 deletions.
diff --git a/anoexpress/data.py b/anoexpress/data.py
@@ -87,7 +87,7 @@ def metadata(analysis, microarray=False):
 
 
 
-def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None, sort_by=None, annotations=False, pvalue_filter=None, low_count_filter=None, fraction_na_allowed=None):
+def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None, sort_by=None, annotations=False, pvalue_filter=None, low_count_filter=None, fraction_na_allowed=None, gff_method='malariagen_data'):
     """
     Load the combined data for a given analysis and sample query
 
@@ -117,6 +117,8 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,
       if provided, genes with a median count below the threshold will be removed from the dataframe. Default is None.
     fraction_na_allowed: float, optional
       fraction of missing values allowed in the data. Defaults to 0.5
+    gff_method: {"malariagen_data", "vectorbase"}, optional
+      which method to use to load the gff file. Default is 'malariagen_data'.
     
     Returns
     -------
@@ -150,7 +152,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,
 
     # subset to the gene ids of interest including reading file 
     if gene_id is not None:
-      gene_id = resolve_gene_id(gene_id=gene_id, analysis=analysis)
+      gene_id = resolve_gene_id(gene_id=gene_id, analysis=analysis, gff_method=gff_method)
       df = df.query("GeneID in @gene_id")
 
     if annotations: # add gene name and description to the dataframe as index 
@@ -161,7 +163,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,
       df = null_fold_changes(pval_df=pval_df, fc_df=df, threshold=pvalue_filter)
 
     # sort genes 
-    df = _sort_genes(df=df, analysis=analysis, sort_by=sort_by)
+    df = _sort_genes(df=df, analysis=analysis, sort_by=sort_by, gff_method=gff_method)
 
     # remove low count genes
     if low_count_filter is not None:
@@ -195,7 +197,7 @@ def add_annotations_to_array(df):
     df = df.reset_index().merge(df_annots, on="GeneID", how="left").set_index(["GeneID", "GeneName", "GeneDescription"])   
     return df 
 
-def _sort_genes(df, analysis, sort_by=None):
+def _sort_genes(df, analysis, sort_by=None, gff_method='malariagen_data'):
   if sort_by is None:
      return df.copy()
   if sort_by == 'median':
@@ -207,7 +209,7 @@ def _sort_genes(df, analysis, sort_by=None):
   elif sort_by == 'position':
     assert analysis != 'fun', "funestus cannot be sorted by position yet"
 
-    gff = load_gff()
+    gff = load_gff(method=gff_method)
     gene_ids = gff.reset_index()['GeneID'].to_list()
     ordered_genes = gff.query(f"GeneID in {gene_ids}")['GeneID'].to_list()
     sort_idxs = [np.where(df.reset_index()['GeneID'] == gene)[0][0] for gene in ordered_genes if gene in df.reset_index()['GeneID'].to_list()]

diff --git a/anoexpress/gsea.py b/anoexpress/gsea.py
@@ -5,7 +5,7 @@
 from .utils import resolve_gene_id
 
 
-def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray, low_count_filter=None):
+def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray, low_count_filter=None, gff_method='malariagen_data'):
 
     assert func is not None or gene_ids is not None, "either a ranking function (func) or gene_ids must be provided"
     assert func is None or gene_ids is None, "Only a ranking function (func) or gene_ids must be provided, not both"
@@ -20,7 +20,7 @@ def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray,
       percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
       top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)] 
     elif gene_ids:
-      top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis)
+      top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis, gff_method=gff_method)
 
     return top_geneIDs, fc_genes
 

diff --git a/anoexpress/utils.py b/anoexpress/utils.py
@@ -6,7 +6,7 @@
 gff_url =  'https://vectorbase.org/common/downloads/release-68/AgambiaePEST/gff/data/VectorBase-68_AgambiaePEST.gff'
 
 
-def resolve_gene_id(gene_id, analysis):
+def resolve_gene_id(gene_id, analysis, gff_method='malariagen_data'):
 
     if isinstance(gene_id, str):
       if gene_id.startswith(('2L', '2R', '3L', '3R', 'X', '2RL', '3RL')):
@@ -18,7 +18,7 @@ def resolve_gene_id(gene_id, analysis):
           start, end = start_end.replace(",", "").split('-')
           start, end = int(start), int(end)
 
-          gff = load_gff(query=f"contig == '{contig}' and start <= {end} and end >= {start}")
+          gff = load_gff(query=f"contig == '{contig}' and start <= {end} and end >= {start}", method=gff_method)
           gene_id = gff.GeneID.to_list()
 
       elif gene_id.endswith(('.tsv', '.txt')):
@@ -50,35 +50,46 @@ def _gene_ids_from_annotation(gene_annot_df, annotation):
     return np.unique(gene_list)
 
 
-def load_gff(type='protein_coding_gene', query=None):
+def load_gff(method='malariagen_data', override_type=None, query=None):
 
-    df = pd.concat([chunk for chunk in tqdm(pd.read_csv(gff_url, sep="\t", comment="#", chunksize=10000), desc='Loading gff data from VectorBase')])
-    df.columns = ['contig', 'source', 'type', 'start', 'end', 'na', 'strand', 'na2', 'attributes']
-    df = df.assign(contig=lambda x: x.contig.str.split("_").str.get(1))
+    if method == 'malariagen_data':
+        import malariagen_data
+        record_type = override_type if override_type else 'gene'
+
+        ag3 = malariagen_data.Ag3()
+        gff = ag3.genome_features(['2RL', '3RL', 'X'])
+        gff = gff.query(f"type == '{record_type}'").rename(columns={'ID':'GeneID'})
+
+    elif method == 'vectorbase':
+        record_type = override_type if override_type else 'protein_coding_gene'
+
+        df = pd.concat([chunk for chunk in tqdm(pd.read_csv(gff_url, sep="\t", comment="#", chunksize=10000), desc='Loading gff data from VectorBase')])
+        df.columns = ['contig', 'source', 'type', 'start', 'end', 'na', 'strand', 'na2', 'attributes']
+        df = df.assign(contig=lambda x: x.contig.str.split("_").str.get(1))
+
+        if type:
+         df = df.query(f"type == '{record_type}'")
+
+        # may only work for protein_coding_genes 
+        df = df.assign(GeneID=df.attributes.str.split(";", expand=True).iloc[:, 0].str.split("=").str.get(1))
+
+        # combine 2R and 2L, 3R and 3L
+        offset_2R = 61545105
+        offset_3R = 53200684
+
+        gffs = []
+        for contig in tqdm(['2R', '2L', '3R', '3L']):
+            df_contig = df.query("contig == @contig").copy()
+            if contig == '2L':
+                df_contig = df_contig.assign(contig='2RL', start=lambda x: x.start + offset_2R, end=lambda x: x.end + offset_2R)
+            if contig == '3L':
+                df_contig = df_contig.assign(contig='3RL', start=lambda x: x.start + offset_3R, end=lambda x: x.end + offset_3R)
+            elif contig in ['3R', '2R']:
+                df_contig = df_contig.assign(contig=lambda x: x.contig + 'L')
+            gffs.append(df_contig)
 
-    if type:
-     df = df.query(f"type == '{type}'")
-
-    # may only work for protein_coding_genes 
-    df = df.assign(GeneID=df.attributes.str.split(";", expand=True).iloc[:, 0].str.split("=").str.get(1))
-
-    # combine 2R and 2L, 3R and 3L
-    offset_2R = 61545105
-    offset_3R = 53200684
-
-    gffs = []
-    for contig in tqdm(['2R', '2L', '3R', '3L']):
-        df_contig = df.query("contig == @contig").copy()
-        if contig == '2L':
-            df_contig = df_contig.assign(contig='2RL', start=lambda x: x.start + offset_2R, end=lambda x: x.end + offset_2R)
-        if contig == '3L':
-            df_contig = df_contig.assign(contig='3RL', start=lambda x: x.start + offset_3R, end=lambda x: x.end + offset_3R)
-        elif contig in ['3R', '2R']:
-            df_contig = df_contig.assign(contig=lambda x: x.contig + 'L')
-        gffs.append(df_contig)
-
-    gff = pd.concat(gffs)
-    gff = pd.concat([gff, df]).sort_values(['contig', 'start', 'end'])
+        gff = pd.concat(gffs)
+        gff = pd.concat([gff, df]).sort_values(['contig', 'start', 'end'])
 
     if query:
         gff = gff.query(query)

diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,7 @@ license = "MIT"
 [tool.poetry.dependencies]
 python = ">=3.9, <3.12"
 malariagen_data = ">10.0.1"
+ipykernel = "*"
 scikit-allel="*"
 pandas = "*"
 seaborn = "*"