Skip to content

Commit

Permalink
troubleshoot filtering; faster df concatenation
Browse files Browse the repository at this point in the history
  • Loading branch information
jykr committed Oct 19, 2023
1 parent ed99b29 commit 85dcde3
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 15 deletions.
11 changes: 7 additions & 4 deletions bean/annotate/filter_alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ..framework.Edit import Allele
from ..framework.AminoAcidEdit import CodingNoncodingAllele
from ._supporting_fn import map_alleles_to_filtered
from .utils import fast_concat


def sum_column_groups(mat, column_index_list):
Expand Down Expand Up @@ -517,10 +518,10 @@ def _map_alleles_to_filtered(
.rename(columns={"allele_mapped": allele_col})
.groupby(["guide", allele_col])
.sum()
)
).reset_index()

mapped_allele_counts.append(guide_raw_counts)
res = pd.concat(mapped_allele_counts).reset_index()
res = fast_concat(mapped_allele_counts).reset_index(drop=True)
res = res.loc[res[allele_col].map(bool), :]
return res

Expand Down Expand Up @@ -579,8 +580,10 @@ def _distribute_alleles_to_filtered(
res,
index=guide_filtered_counts.index,
columns=guide_filtered_counts.columns,
)
).reset_index()
mapped_allele_counts.append(added_counts)
res = pd.concat(mapped_allele_counts).reset_index()
# @TODO: these lines are taking very long?
print("Contatenating allele counts...")
res = fast_concat(mapped_allele_counts).reset_index(drop=True)
res = res.loc[res[allele_col].map(bool), :]
return res
19 changes: 15 additions & 4 deletions bean/annotate/translate_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,8 @@ def get_aa_change(
self, allele: Allele, include_synonymous: bool = True
) -> CodingNoncodingAllele: # sourcery skip: use-named-expression
"""Finds overlapping CDS and call the same function for the CDS, else return CodingNonCodingAllele with no translated allele."""
if len(allele.edits) == 0:
return CodingNoncodingAllele.from_alleles(nt_allele=allele)
chrom, start, end = allele.get_range()
overlapping_cds = find_overlap(chrom, start, end, self.cds_ranges)
if overlapping_cds:
Expand Down Expand Up @@ -520,6 +522,18 @@ def filter_nt_alleles(cn_allele_df: pd.DataFrame, pos_include: Iterable[int]):
return alleles


def strsplit_edit(edit_str):
    """Parse an edit string into its components.

    Accepts either ``chrom:pos:ref>alt`` or ``pos:ref>alt``; when the
    chromosome field is absent, ``chrom`` is returned as ``None``.

    Args:
        edit_str: Edit description, e.g. ``"chr1:123:A>G"`` or ``"123:A>G"``.

    Returns:
        Tuple ``(chrom, pos, ref, alt)`` of strings (``chrom`` may be ``None``).

    Raises:
        ValueError: If ``edit_str`` does not match either expected format.
    """
    # Split once instead of re-splitting in every branch.
    fields = edit_str.split(":")
    if len(fields) == 3:
        chrom, pos, transition = fields
    elif len(fields) == 2:
        chrom = None
        pos, transition = fields
    else:
        raise ValueError(f"{edit_str} is not in the correct format.")
    try:
        ref, alt = transition.split(">")
    except ValueError:
        # Same exception type as the bare unpack would raise, but with a
        # message that points at the offending input.
        raise ValueError(f"{edit_str} is not in the correct format.") from None
    return chrom, pos, ref, alt


def annotate_edit(
edit_info: pd.DataFrame,
edit_col="edit",
Expand All @@ -538,10 +552,7 @@ def annotate_edit(
edit_info["group"] = ""
edit_info["int_pos"] = -1
if "pos" not in edit_info.columns:
edit_info["pos"], transition = zip(*(edit_info[edit_col].str.split(":")))
edit_info["ref"], edit_info["alt"] = zip(
*(pd.Series(transition).str.split(">"))
)
edit_info["chrom"], edit_info["pos"], edit_info["ref"], edit_info["alt"] = zip(*(edit_info[edit_col].map(strsplit_edit)))
edit_info.loc[edit_info.pos.map(lambda s: s.startswith("A")), "coding"] = "coding"
edit_info.loc[
edit_info.pos.map(lambda s: not s.startswith("A")), "coding"
Expand Down
20 changes: 19 additions & 1 deletion bean/annotate/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import sys
import requests
from typing import Optional
from typing import Optional, List
import argparse
import pandas as pd
from itertools import chain
import logging

logging.basicConfig(
Expand All @@ -19,6 +20,23 @@
info = logging.info


def fast_flatten(input_list):
    """Flatten an iterable of iterables into one flat list."""
    return [element for sublist in input_list for element in sublist]


def fast_concat(df_list: List[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate many same-schema dataframes faster than ``pd.concat``.

    Column-wise flattening trick from
    https://gist.github.com/TariqAHassan/fc77c00efef4897241f49e61ddbede9e

    Every dataframe must contain the columns of the first one; the result
    gets a fresh RangeIndex (input indices are discarded).

    Args:
        df_list: Non-empty list of dataframes with identical columns.

    Returns:
        A single dataframe holding the rows of every input, in order.

    Raises:
        ValueError: If ``df_list`` is empty.
        KeyError: If a later dataframe lacks a column of the first.
    """
    if not df_list:
        # Fail with a clear message instead of an opaque IndexError below.
        raise ValueError("df_list must contain at least one DataFrame.")
    colnames = df_list[0].columns
    # Build each output column independently. Avoids dict.fromkeys(..., [])
    # which would alias one mutable list across every key.
    df_dict = {
        col: list(chain.from_iterable(df[col] for df in df_list))
        for col in colnames
    }
    # Reindex by colnames to pin the original column order.
    return pd.DataFrame.from_dict(df_dict)[colnames]


def find_overlap(
chrom: str, start: int, end: int, range_df: pd.DataFrame
) -> Optional[str]:
Expand Down
2 changes: 2 additions & 0 deletions bean/framework/Edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ def match_str(cls, allele_str):

def get_range(self):
"""Returns genomic range of the edits in the allele"""
if len(self.edits) == 0:
return None
return (
self.chrom,
min(edit.pos for edit in self.edits),
Expand Down
11 changes: 8 additions & 3 deletions bean/framework/ReporterScreen.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,9 +850,14 @@ def write(self, out_path):
adata.uns[k]["edit"].iloc[0], Edit
):
adata.uns[k].edit = adata.uns[k].edit.map(str)
for c in [colname for colname in v.columns if "allele" in colname]:
if isinstance(v[c].iloc[0], (Allele, CodingNoncodingAllele)):
adata.uns[k].loc[:, c] = adata.uns[k][c].map(str)
try:
for c in [
colname for colname in v.columns if "allele" in str(colname)
]:
if isinstance(v[c].iloc[0], (Allele, CodingNoncodingAllele)):
adata.uns[k].loc[:, c] = adata.uns[k][c].map(str)
except TypeError as e:
raise TypeError(f"error with {e}: {k, v} cannot be written")
super(ReporterScreen, adata).write(out_path)


Expand Down
8 changes: 6 additions & 2 deletions bean/plotting/editing_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ def _add_absent_edits(


def get_edit_rates(
bdata, edit_count_key="edit_counts", add_absent=True, adjust_spacer_pos: bool = True
bdata,
edit_count_key="edit_counts",
add_absent=True,
adjust_spacer_pos: bool = True,
reporter_column: str = "reporter",
):
"""
Obtain position- and context-wise editing rate (context: base preceding the target base position).
Expand Down Expand Up @@ -113,7 +117,7 @@ def get_edit_rates(
)
edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int)
edit_rates_agg["context"] = edit_rates_agg.apply(
lambda row: bdata.guides.loc[row.guide, "Reporter"][
lambda row: bdata.guides.loc[row.guide, reporter_column][
row.rel_pos - 1 : row.rel_pos + 1
],
axis=1,
Expand Down
1 change: 1 addition & 0 deletions bin/bean-filter
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ if __name__ == "__main__":
f"Filtered down to {len(bdata.uns[f'{allele_df_keys[-1]}_spacer'])} alleles."
)
allele_df_keys.append(f"{allele_df_keys[-1]}_spacer")
bdata.write(f"{args.output_prefix}.tmp.h5ad")

if len(bdata.uns[allele_df_keys[-1]]) > 0 and args.filter_window:
info(
Expand Down
2 changes: 1 addition & 1 deletion notebooks/sample_quality_report.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
"metadata": {},
"outputs": [],
"source": [
"bdata.samples[[\"rep\", condition_label]] = bdata.samples.index.to_series().str.split(\"_\", expand=True)"
"#bdata.samples[[replicate_label, condition_label]] = bdata.samples.index.to_series().str.split(\"_\", expand=True)"
]
},
{
Expand Down

0 comments on commit 85dcde3

Please sign in to comment.