Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved doublet detection in call_lineages #225

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion cassiopeia/preprocess/doublet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Invoked through pipeline.py and supports the filter_molecule_table and
call_lineage_groups functions.
"""
from typing import Dict, Set, Tuple
from typing import Dict, Set, Tuple, List

import pandas as pd

Expand Down Expand Up @@ -168,3 +168,51 @@ def filter_inter_doublets(at: pd.DataFrame, rule: float = 0.35) -> pd.DataFrame:
n_cells = at["cellBC"].nunique()
logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells")
return at[at["cellBC"].isin(passing_cellBCs)]


def filter_doublet_lg_sets(
PIV: pd.DataFrame,
master_LGs: List[int],
master_intBCs: Dict[int, List[str]]
) -> Tuple[List[int], Dict[int, List[str]]]:
"""Filters out lineage groups that are likely doublets.

Essentially, filters out lineage groups that have a high proportion of
intBCs that are shared with other lineage groups. For every lineage group,
calculates the proportion of intBCs that are shared with every pair of two
other lineage groups. If the proportion is > .8, then the lineage group
is filtered out.

Args:
PIV: A pivot table of cellBC-intBC-allele groups to be filtered
master_LGs: A list of lineage groups to be filtered
master_intBCs: A dictionary that has mappings from the lineage group
number to the set of intBCs being used for reconstruction

Returns:
A filtered list of lineage groups and a filtered dictionary of intBCs
for each lineage group
"""
lg_sorted = (PIV.value_counts('lineageGrp')
.reset_index().sort_values('lineageGrp', ascending=False))

for lg in lg_sorted['lineageGrp']:
lg = tuple([lg])
filtered = False
lg_intBC = set(master_intBCs[lg])
for lg_i in master_LGs:
for lg_j in master_LGs:
if lg == lg_i or lg == lg_j:
continue
pair_intBC = set(master_intBCs[lg_i]).union(set(master_intBCs[lg_j]))
if len(pair_intBC.intersection(lg_intBC)) > len(lg_intBC) * .8:
master_LGs.remove(lg)
master_intBCs.pop(lg)
logger.debug(f"Filtered lineage group {lg} as a doublet"
f" of {lg_i} and {lg_j}")
filtered = True
break
if filtered:
break

return master_LGs, master_intBCs
148 changes: 86 additions & 62 deletions cassiopeia/preprocess/lineage_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from typing import Dict, List, Tuple

from collections import Counter
from matplotlib import colors, colorbar
import matplotlib.pyplot as plt
import numpy as np
Expand Down Expand Up @@ -49,26 +50,29 @@ def assign_lineage_groups(
piv_assigned = pd.DataFrame()

# Loop for iteratively assigning LGs
prev_clust_size = np.inf
remaining_intBCs = pivot_in.columns.tolist()
i = 0
while prev_clust_size > min_clust_size:
while len(remaining_intBCs) > 0 and pivot_in.shape[0] > 0:
# run function
piv_lg, piv_nolg = find_top_lg(
piv_lg, piv_nolg, intBC_top = find_top_lg(
pivot_in,
i,
min_intbc_prop=min_intbc_thresh,
kinship_thresh=kinship_thresh,
intbcs = remaining_intBCs,
)

# append returned objects to output variable
piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)

# if lineage group larger than min_clust_size
if piv_lg.shape[0] > min_clust_size:
# append returned objects to output variable
piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)
i += 1

# update pivot_in by removing assigned alignments
pivot_in = piv_nolg

prev_clust_size = piv_lg.shape[0]

i += 1
# remove intBC_top from remaining_intBCs
remaining_intBCs.remove(intBC_top)

return piv_assigned

Expand All @@ -78,6 +82,7 @@ def find_top_lg(
iteration: int,
min_intbc_prop: float = 0.2,
kinship_thresh: float = 0.2,
intbcs = List[str],
) -> Tuple[pd.DataFrame, pd.DataFrame]:

"""Algorithm to creates lineage groups from a pivot table of UMI counts
Expand All @@ -98,14 +103,15 @@ def find_top_lg(
the most frequent intBC
kinship_thresh: Determines the proportion of intBCs that a cell needs
to share with the cluster in order to included in that cluster
intbcs: A list of intBCs to consider for seeding the lineage group

Returns:
A pivot table of cells labled with lineage group assignments, and a
pivot table of the remaining unassigned cells
"""

# Calculate sum of observed intBCs, identify top intBC
intBC_sums = PIVOT_in.sum(0).sort_values(ascending=False)
intBC_sums = PIVOT_in.loc[:,intbcs].sum(0).sort_values(ascending=False)
intBC_top = intBC_sums.index[0]

# Take subset of PIVOT table that contain cells that have the top intBC
Expand Down Expand Up @@ -146,11 +152,12 @@ def find_top_lg(
PIV_LG["lineageGrp"] = iteration + 1

# Print statements
logger.debug(
f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
)
if PIV_LG.shape[0] > 0:
logger.debug(
f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
)

return PIV_LG, PIV_noLG
return PIV_LG, PIV_noLG, intBC_top


def filter_intbcs_lg_sets(
Expand Down Expand Up @@ -203,14 +210,14 @@ def filter_intbcs_lg_sets(
def score_lineage_kinships(
PIV: pd.DataFrame,
master_LGs: List[int],
master_intBCs: Dict[int, pd.DataFrame],
master_intBCs: Dict[int, List[str]],
) -> pd.DataFrame:
"""Identifies which lineage group each cell should belong to.
"""Calculates lineage group kinship for each cell.

Given a set of cells and a set of lineage groups with their intBCs sets,
identifies which lineage group each cell has the most kinship with. Kinship
is defined as the total UMI count of intBCs shared between the cell and the
intBC set of a lineage group.
calculates lineage group kinship for each cell. Kinship is defined as the
UMI count of intBCs shared between the cell and the intBC set of a lineage
group normalized by the total UMI count of the cell.

Args:
PIV: A pivot table of cells labled with lineage group assignments
Expand All @@ -220,8 +227,7 @@ def score_lineage_kinships(


Returns:
A DataFrame that contains the lineage group for each cell with the
greatest kinship
A DataFrame that contains the lineage group kinship for each cell.
"""

dfLG2intBC = pd.DataFrame()
Expand Down Expand Up @@ -253,67 +259,85 @@ def score_lineage_kinships(

# Matrix math
dfCellBC2LG = subPIVOT.dot(dfLG2intBC.T)
max_kinship = dfCellBC2LG.max(axis=1)

max_kinship_ind = dfCellBC2LG.apply(lambda x: np.argmax(x), axis=1)
max_kinship_frame = max_kinship.to_frame()

max_kinship_LG = pd.concat(
[max_kinship_frame, max_kinship_ind + 1], axis=1, sort=True
)
max_kinship_LG.columns = ["maxOverlap", "lineageGrp"]

return max_kinship_LG
return dfCellBC2LG


def annotate_lineage_groups(
dfMT: pd.DataFrame,
max_kinship_LG: pd.DataFrame,
master_intBCs: Dict[int, pd.DataFrame],
at: pd.DataFrame,
kinship_scores: pd.DataFrame,
doublet_kinship_thresh: float = 0.8,
) -> pd.DataFrame:
"""
Assign cells in the allele table to a lineage group.

Takes in an allele table and a DataFrame identifying the chosen
lineage group for each cell and annotates the lineage groups in the
original DataFrame.
Takes in an allele table and a DataFrame of kinship scores for each cell
which is used to assign cells to lineage groups. If a cell has a kinship
score above doublet_kinship_thresh, it is assigned to the lineage group
with the highest kinship score. If a cell has a kinship score below
doublet_kinship_thresh, it is assigned to the two lineage groups with the
highest kinship scores. Returns the original allele table with an
additional lineageGrp column.

Args:
dfMT: An allele table of cellBC-UMI-allele groups
max_kinship_LG: A DataFrame with the max kinship lineage group for each
at: An allele table of cellBC-UMI-allele groups
kinship_scores: A DataFrame with lineage kinship scores for each
cell, see documentation of score_lineage_kinships
master_intBCs: A dictionary relating lineage group to its set of intBCs
doublet_kinship_thresh: A float between 0 and 1 specifying the
the minimum kinship score a cell needs to be assigned to a single
lineage group.

Returns:
Original allele table with annotated lineage group assignments for cells
"""

dfMT["lineageGrp"] = 0
if doublet_kinship_thresh:
logger.info("Identifying inter-lineage group doublets with"
f" kinship threshold {doublet_kinship_thresh}...")

# Assign cells to lineage groups using kinship scores
cellBC2LG = {}
for n in max_kinship_LG.index:
cellBC2LG[n] = max_kinship_LG.loc[n, "lineageGrp"]

dfMT["lineageGrp"] = dfMT["cellBC"].map(cellBC2LG)

dfMT["lineageGrp"] = dfMT["lineageGrp"].fillna(value=0)

lg_sizes = {}
n_doublets = 0
for cellBC, scores in kinship_scores.iterrows():
sorted_scores = scores.sort_values(ascending=False)
if doublet_kinship_thresh:
if sorted_scores[0] < doublet_kinship_thresh:
cellBC2LG[cellBC] = [sorted_scores.index[0],
sorted_scores.index[1]]
n_doublets += 1
else:
cellBC2LG[cellBC] = [sorted_scores.index[0]]
else:
cellBC2LG[cellBC] = [sorted_scores.index[0]]

if doublet_kinship_thresh:
n_cells = len(cellBC2LG)
logger.debug(f"Identified {n_doublets} inter-group doublets"
f" out of {n_cells} cells")

# Rename lineage groups based on size
lg_counts = Counter([item for sublist in cellBC2LG.values() for item in sublist])
rename_lg = {}
i = 1
for lg, count in lg_counts.most_common():
rename_lg[lg] = i
i += 1

for n, g in dfMT.groupby("lineageGrp"):
if n != 0:
lg_sizes[n] = len(g["cellBC"].unique())

sorted_by_value = sorted(lg_sizes.items(), key=lambda kv: kv[1])[::-1]
for i, tup in zip(range(1, len(sorted_by_value) + 1), sorted_by_value):
rename_lg[tup[0]] = float(i)

rename_lg[0] = 0.0

dfMT["lineageGrp"] = dfMT.apply(lambda x: rename_lg[x.lineageGrp], axis=1)

return dfMT
# Rename lineage groups in cellBC2LG
for cellBC, lgs in cellBC2LG.items():
if len(lgs) == 1:
cellBC2LG[cellBC] = rename_lg[lgs[0]]
else:
if rename_lg[lgs[0]] < rename_lg[lgs[1]]:
cellBC2LG[cellBC] = (rename_lg[lgs[0]], rename_lg[lgs[1]])
else:
cellBC2LG[cellBC] = (rename_lg[lgs[1]], rename_lg[lgs[0]])

# Add lineageGrp column to allele table
at["lineageGrp"] = at["cellBC"].map(cellBC2LG)
at["lineageGrp"] = at["lineageGrp"].fillna(value=0).astype(str)

return at


def filter_intbcs_final_lineages(
Expand Down
Loading
Loading