Skip to content

Commit

Permalink
update gff loader
Browse files (browse the repository at this point in the history)
  • Loading branch information
sanjaynagi committed May 23, 2024
1 parent 9487f4d commit a886bc9
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 17 deletions.
26 changes: 22 additions & 4 deletions anoexpress/anoexpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,28 @@ def load_gff(type='protein_coding_gene', query=None):
# GeneID is taken as the value of the first `key=value` field in the attributes column; this may only work for protein_coding_gene records
df = df.assign(GeneID=df.attributes.str.split(";", expand=True).iloc[:, 0].str.split("=").str.get(1))

# combine 2R and 2L into 2RL, and 3R and 3L into 3RL; the L-arm start/end coordinates are shifted by the offsets below
offset_2R = 61545105
offset_3R = 53200684

gffs = []
for contig in tqdm(['2R', '2L', '3R', '3L']):
df_contig = df.query("contig == @contig").copy()
if contig == '2L':
df_contig = df_contig.assign(contig='2RL', start=lambda x: x.start + offset_2R, end=lambda x: x.end + offset_2R)
if contig == '3L':
df_contig = df_contig.assign(contig='3RL', start=lambda x: x.start + offset_3R, end=lambda x: x.end + offset_3R)
elif contig in ['3R', '2R']:
df_contig = df_contig.assign(contig=lambda x: x.contig + 'L')
gffs.append(df_contig)

gff = pd.concat(gffs)
gff = pd.concat([gff, df]).sort_values(['contig', 'start', 'end'])

if query:
df = df.query(query)
gff = gff.query(query)

return df
return gff

def resolve_gene_id(gene_id, analysis):

Expand All @@ -107,7 +125,7 @@ def resolve_gene_id(gene_id, analysis):
else:

contig, start_end = gene_id.split(':')
start, end = start_end.split('-')
start, end = start_end.replace(",", "").split('-')

gff = load_gff(query=f"contig == '{contig}' and start <= {end} and end >= {start}")
gene_id = gff.GeneID.to_list()
Expand Down Expand Up @@ -241,7 +259,7 @@ def _sort_genes(df, analysis, sort_by=None):
elif sort_by == 'position':
assert analysis != 'fun', "funestus cannot be sorted by position yet"

gff = load_gff(query="contig in ['2L', '2R', '3L', '3R', 'X']").sort_values(['contig', 'start'])
gff = load_gff(query="contig in ['2L', '2R', '3L', '3R', 'X']")
gene_ids = gff.reset_index()['GeneID'].to_list()
ordered_genes = gff.query(f"GeneID in {gene_ids}")['GeneID'].to_list()
sort_idxs = [np.where(df.reset_index()['GeneID'] == gene)[0][0] for gene in ordered_genes]
Expand Down
13 changes: 0 additions & 13 deletions tests/test_anoexpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,19 +190,6 @@ def test_plot_gene_expression_spans(gene_id):
plot_type='strip',
)

def test_contig_expression():

xpress.plot_contig_expression(
contig='3R',
analysis='gamb_colu',
microarray=False,
y_range=(-8,10),
size=10,
step=5,
height=400,
width=600
)


def test_load_candidates():

Expand Down

0 comments on commit a886bc9

Please sign in to comment.