Skip to content

Commit

Permalink
add ipykernel
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjaynagi committed Sep 30, 2024
1 parent f9047aa commit cd93ee1
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 36 deletions.
12 changes: 7 additions & 5 deletions anoexpress/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def metadata(analysis, microarray=False):



def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None, sort_by=None, annotations=False, pvalue_filter=None, low_count_filter=None, fraction_na_allowed=None):
def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None, sort_by=None, annotations=False, pvalue_filter=None, low_count_filter=None, fraction_na_allowed=None, gff_method='malariagen_data'):
"""
Load the combined data for a given analysis and sample query
Expand Down Expand Up @@ -117,6 +117,8 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,
if provided, genes with a median count below the threshold will be removed from the dataframe. Default is None.
fraction_na_allowed: float, optional
fraction of missing values allowed in the data. Defaults to 0.5
gff_method: {"malariagen_data", "vectorbase"}, optional
which method to use to load the gff file. Default is 'malariagen_data'.
Returns
-------
Expand Down Expand Up @@ -150,7 +152,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,

# subset to the gene ids of interest including reading file
if gene_id is not None:
gene_id = resolve_gene_id(gene_id=gene_id, analysis=analysis)
gene_id = resolve_gene_id(gene_id=gene_id, analysis=analysis, gff_method=gff_method)
df = df.query("GeneID in @gene_id")

if annotations: # add gene name and description to the dataframe as index
Expand All @@ -161,7 +163,7 @@ def data(data_type, analysis, microarray=False, gene_id=None, sample_query=None,
df = null_fold_changes(pval_df=pval_df, fc_df=df, threshold=pvalue_filter)

# sort genes
df = _sort_genes(df=df, analysis=analysis, sort_by=sort_by)
df = _sort_genes(df=df, analysis=analysis, sort_by=sort_by, gff_method=gff_method)

# remove low count genes
if low_count_filter is not None:
Expand Down Expand Up @@ -195,7 +197,7 @@ def add_annotations_to_array(df):
df = df.reset_index().merge(df_annots, on="GeneID", how="left").set_index(["GeneID", "GeneName", "GeneDescription"])
return df

def _sort_genes(df, analysis, sort_by=None):
def _sort_genes(df, analysis, sort_by=None, gff_method='malariagen_data'):
if sort_by is None:
return df.copy()
if sort_by == 'median':
Expand All @@ -207,7 +209,7 @@ def _sort_genes(df, analysis, sort_by=None):
elif sort_by == 'position':
assert analysis != 'fun', "funestus cannot be sorted by position yet"

gff = load_gff()
gff = load_gff(method=gff_method)
gene_ids = gff.reset_index()['GeneID'].to_list()
ordered_genes = gff.query(f"GeneID in {gene_ids}")['GeneID'].to_list()
sort_idxs = [np.where(df.reset_index()['GeneID'] == gene)[0][0] for gene in ordered_genes if gene in df.reset_index()['GeneID'].to_list()]
Expand Down
4 changes: 2 additions & 2 deletions anoexpress/gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .utils import resolve_gene_id


def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray, low_count_filter=None):
def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray, low_count_filter=None, gff_method='malariagen_data'):

assert func is not None or gene_ids is not None, "either a ranking function (func) or gene_ids must be provided"
assert func is None or gene_ids is None, "Only a ranking function (func) or gene_ids must be provided, not both"
Expand All @@ -20,7 +20,7 @@ def load_genes_for_enrichment(analysis, func, gene_ids, percentile, microarray,
percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)]
elif gene_ids:
top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis)
top_geneIDs = resolve_gene_id(gene_id=gene_ids, analysis=analysis, gff_method=gff_method)

return top_geneIDs, fc_genes

Expand Down
69 changes: 40 additions & 29 deletions anoexpress/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
gff_url = 'https://vectorbase.org/common/downloads/release-68/AgambiaePEST/gff/data/VectorBase-68_AgambiaePEST.gff'


def resolve_gene_id(gene_id, analysis):
def resolve_gene_id(gene_id, analysis, gff_method='malariagen_data'):

if isinstance(gene_id, str):
if gene_id.startswith(('2L', '2R', '3L', '3R', 'X', '2RL', '3RL')):
Expand All @@ -18,7 +18,7 @@ def resolve_gene_id(gene_id, analysis):
start, end = start_end.replace(",", "").split('-')
start, end = int(start), int(end)

gff = load_gff(query=f"contig == '{contig}' and start <= {end} and end >= {start}")
gff = load_gff(query=f"contig == '{contig}' and start <= {end} and end >= {start}", method=gff_method)
gene_id = gff.GeneID.to_list()

elif gene_id.endswith(('.tsv', '.txt')):
Expand Down Expand Up @@ -50,35 +50,46 @@ def _gene_ids_from_annotation(gene_annot_df, annotation):
return np.unique(gene_list)


def load_gff(type='protein_coding_gene', query=None):
def load_gff(method='malariagen_data', override_type=None, query=None):

df = pd.concat([chunk for chunk in tqdm(pd.read_csv(gff_url, sep="\t", comment="#", chunksize=10000), desc='Loading gff data from VectorBase')])
df.columns = ['contig', 'source', 'type', 'start', 'end', 'na', 'strand', 'na2', 'attributes']
df = df.assign(contig=lambda x: x.contig.str.split("_").str.get(1))
if method == 'malariagen_data':
import malariagen_data
record_type = override_type if override_type else 'gene'

ag3 = malariagen_data.Ag3()
gff = ag3.genome_features(['2RL', '3RL', 'X'])
gff = gff.query(f"type == '{record_type}'").rename(columns={'ID':'GeneID'})

elif method == 'vectorbase':
record_type = override_type if override_type else 'protein_coding_gene'

df = pd.concat([chunk for chunk in tqdm(pd.read_csv(gff_url, sep="\t", comment="#", chunksize=10000), desc='Loading gff data from VectorBase')])
df.columns = ['contig', 'source', 'type', 'start', 'end', 'na', 'strand', 'na2', 'attributes']
df = df.assign(contig=lambda x: x.contig.str.split("_").str.get(1))

if type:
df = df.query(f"type == '{record_type}'")

# may only work for protein_coding_genes
df = df.assign(GeneID=df.attributes.str.split(";", expand=True).iloc[:, 0].str.split("=").str.get(1))

# combine 2R and 2L, 3R and 3L
offset_2R = 61545105
offset_3R = 53200684

gffs = []
for contig in tqdm(['2R', '2L', '3R', '3L']):
df_contig = df.query("contig == @contig").copy()
if contig == '2L':
df_contig = df_contig.assign(contig='2RL', start=lambda x: x.start + offset_2R, end=lambda x: x.end + offset_2R)
if contig == '3L':
df_contig = df_contig.assign(contig='3RL', start=lambda x: x.start + offset_3R, end=lambda x: x.end + offset_3R)
elif contig in ['3R', '2R']:
df_contig = df_contig.assign(contig=lambda x: x.contig + 'L')
gffs.append(df_contig)

if type:
df = df.query(f"type == '{type}'")

# may only work for protein_coding_genes
df = df.assign(GeneID=df.attributes.str.split(";", expand=True).iloc[:, 0].str.split("=").str.get(1))

# combine 2R and 2L, 3R and 3L
offset_2R = 61545105
offset_3R = 53200684

gffs = []
for contig in tqdm(['2R', '2L', '3R', '3L']):
df_contig = df.query("contig == @contig").copy()
if contig == '2L':
df_contig = df_contig.assign(contig='2RL', start=lambda x: x.start + offset_2R, end=lambda x: x.end + offset_2R)
if contig == '3L':
df_contig = df_contig.assign(contig='3RL', start=lambda x: x.start + offset_3R, end=lambda x: x.end + offset_3R)
elif contig in ['3R', '2R']:
df_contig = df_contig.assign(contig=lambda x: x.contig + 'L')
gffs.append(df_contig)

gff = pd.concat(gffs)
gff = pd.concat([gff, df]).sort_values(['contig', 'start', 'end'])
gff = pd.concat(gffs)
gff = pd.concat([gff, df]).sort_values(['contig', 'start', 'end'])

if query:
gff = gff.query(query)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ license = "MIT"
[tool.poetry.dependencies]
python = ">=3.9, <3.12"
malariagen_data = ">10.0.1"
ipykernel = "*"
scikit-allel="*"
pandas = "*"
seaborn = "*"
Expand Down

0 comments on commit cd93ee1

Please sign in to comment.