From ba16e59dc7709d35a9bee32a04cf6c642d2e1b51 Mon Sep 17 00:00:00 2001 From: colganwi Date: Mon, 18 Sep 2023 15:26:30 -0400 Subject: [PATCH 1/4] filter umi per intbc --- cassiopeia/preprocess/pipeline.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py index 7349c171..1df66ffb 100755 --- a/cassiopeia/preprocess/pipeline.py +++ b/cassiopeia/preprocess/pipeline.py @@ -1012,6 +1012,7 @@ def call_lineage_groups( output_directory: str, min_umi_per_cell: int = 10, min_avg_reads_per_umi: float = 2.0, + min_umi_per_intbc: int = 1, min_cluster_prop: float = 0.005, min_intbc_thresh: float = 0.05, inter_doublet_threshold: float = 0.35, @@ -1046,6 +1047,8 @@ def call_lineage_groups( min_avg_reads_per_umi: The threshold specifying the minimum coverage (i.e. average) reads per UMI in a cell needed in order for that cell not to be filtered during filtering + min_umi_per_intbc: The threshold specifying the minimum number of UMIs + an intBC needs to have in order to be retained during filtering min_cluster_prop: The minimum cluster size in the putative lineage assignment step, as a proportion of the number of cells min_intbc_thresh: The threshold specifying the minimum proportion of @@ -1074,6 +1077,10 @@ def call_lineage_groups( piv = pd.pivot_table( input_df, index="cellBC", columns="intBC", values="UMI", aggfunc="count" ) + # Filter out intBCs with fewer than min_umi_per_intbc UMIs + piv[piv < min_umi_per_intbc] = np.nan + + # Normalize by total UMIs per cell piv = piv.div(piv.sum(axis=1), axis=0) # Reorder piv columns by binarized intBC frequency From 5432ffd53b497376d63609a0bd1546c1db641367 Mon Sep 17 00:00:00 2001 From: colganwi Date: Mon, 18 Sep 2023 15:39:05 -0400 Subject: [PATCH 2/4] Lineage group size not strictly decreasing --- cassiopeia/preprocess/lineage_utils.py | 34 +++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/cassiopeia/preprocess/lineage_utils.py b/cassiopeia/preprocess/lineage_utils.py index 26938259..a25162c3 100755 --- a/cassiopeia/preprocess/lineage_utils.py +++ b/cassiopeia/preprocess/lineage_utils.py @@ -49,26 +49,29 @@ def assign_lineage_groups( piv_assigned = pd.DataFrame() # Loop for iteratively assigning LGs - prev_clust_size = np.inf + remaining_intBCs = pivot_in.columns.tolist() i = 0 - while prev_clust_size > min_clust_size: + while len(remaining_intBCs) > 0 and pivot_in.shape[0] > 0: # run function - piv_lg, piv_nolg = find_top_lg( + piv_lg, piv_nolg, intBC_top = find_top_lg( pivot_in, i, min_intbc_prop=min_intbc_thresh, kinship_thresh=kinship_thresh, + intbcs = remaining_intBCs, ) - # append returned objects to output variable - piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True) - + # if lineage group larger than min_clust_size + if piv_lg.shape[0] > min_clust_size: + # append returned objects to output variable + piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True) + i += 1 + # update pivot_in by removing assigned alignments pivot_in = piv_nolg - prev_clust_size = piv_lg.shape[0] - - i += 1 + # remove intBC_top from remaining_intBCs + remaining_intBCs.remove(intBC_top) return piv_assigned @@ -78,6 +81,7 @@ def find_top_lg( iteration: int, min_intbc_prop: float = 0.2, kinship_thresh: float = 0.2, + intbcs = List[str], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Algorithm to creates lineage groups from a pivot table of UMI counts @@ -98,6 +102,7 @@ def find_top_lg( the most frequent intBC kinship_thresh: Determines the proportion of intBCs that a cell needs to share with the cluster in order to included in that cluster + intbcs: A list of intBCs to consider for seeding the lineage group Returns: A pivot table of cells labled with lineage group assignments, and a @@ -105,7 +110,7 @@ def find_top_lg( """ # Calculate sum of observed intBCs, identify top intBC - intBC_sums = PIVOT_in.sum(0).sort_values(ascending=False) + intBC_sums = PIVOT_in.loc[:,intbcs].sum(0).sort_values(ascending=False) intBC_top = intBC_sums.index[0] # Take subset of PIVOT table that contain cells that have the top intBC @@ -146,11 +151,12 @@ def find_top_lg( PIV_LG["lineageGrp"] = iteration + 1 # Print statements - logger.debug( - f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned" - ) + if PIV_LG.shape[0] > 0: + logger.debug( + f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned" + ) - return PIV_LG, PIV_noLG + return PIV_LG, PIV_noLG, intBC_top def filter_intbcs_lg_sets( From 24efa19fd1a4d20b481ef77ff9237147f2765a9b Mon Sep 17 00:00:00 2001 From: colganwi Date: Tue, 19 Sep 2023 19:05:49 -0400 Subject: [PATCH 3/4] doublet detection --- cassiopeia/preprocess/doublet_utils.py | 50 ++++++++++- cassiopeia/preprocess/lineage_utils.py | 116 ++++++++++++++----------- cassiopeia/preprocess/pipeline.py | 71 ++++++++++++--- 3 files changed, 174 insertions(+), 63 deletions(-) diff --git a/cassiopeia/preprocess/doublet_utils.py b/cassiopeia/preprocess/doublet_utils.py index 1d735209..38893ac7 100644 --- a/cassiopeia/preprocess/doublet_utils.py +++ b/cassiopeia/preprocess/doublet_utils.py @@ -3,7 +3,7 @@ Invoked through pipeline.py and supports the filter_molecule_table and call_lineage_groups functions. """ -from typing import Dict, Set, Tuple +from typing import Dict, Set, Tuple, List import pandas as pd @@ -168,3 +168,51 @@ def filter_inter_doublets(at: pd.DataFrame, rule: float = 0.35) -> pd.DataFrame: n_cells = at["cellBC"].nunique() logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells") return at[at["cellBC"].isin(passing_cellBCs)] + + +def filter_doublet_lg_sets( + PIV: pd.DataFrame, + master_LGs: List[int], + master_intBCs: Dict[int, List[str]] +) -> Tuple[List[int], Dict[int, List[str]]]: + """Filters out lineage groups that are likely doublets. + + Essentially, filters out lineage groups that have a high proportion of + intBCs that are shared with other lineage groups. For every lineage group, + calculates the proportion of intBCs that are shared with every pair of two + other lineage groups. If the proportion is > .8, then the lineage group + is filtered out. + + Args: + PIV: A pivot table of cellBC-intBC-allele groups to be filtered + master_LGs: A list of lineage groups to be filtered + master_intBCs: A dictionary that has mappings from the lineage group + number to the set of intBCs being used for reconstruction + + Returns: + A filtered list of lineage groups and a filtered dictionary of intBCs + for each lineage group + """ + lg_sorted = (PIV.value_counts('lineageGrp') + .reset_index().sort_values('lineageGrp', ascending=False)) + + for lg in lg_sorted['lineageGrp']: + lg = tuple([lg]) + filtered = False + lg_intBC = set(master_intBCs[lg]) + for lg_i in master_LGs: + for lg_j in master_LGs: + if lg == lg_i or lg == lg_j: + continue + pair_intBC = set(master_intBCs[lg_i]).union(set(master_intBCs[lg_j])) + if len(pair_intBC.intersection(lg_intBC)) > len(lg_intBC) * .8: + master_LGs.remove(lg) + master_intBCs.pop(lg) + logger.debug(f"Filtered lineage group {lg} as a doublet" + f" of {lg_i} and {lg_j}") + filtered = True + break + if filtered: + break + + return master_LGs, master_intBCs \ No newline at end of file diff --git a/cassiopeia/preprocess/lineage_utils.py b/cassiopeia/preprocess/lineage_utils.py index a25162c3..26704e0b 100755 --- a/cassiopeia/preprocess/lineage_utils.py +++ b/cassiopeia/preprocess/lineage_utils.py @@ -8,6 +8,7 @@ from typing import Dict, List, Tuple +from collections import Counter from matplotlib import colors, colorbar import matplotlib.pyplot as plt import numpy as np @@ -209,14 +210,14 @@ def filter_intbcs_lg_sets( def score_lineage_kinships( PIV: pd.DataFrame, master_LGs: List[int], - master_intBCs: Dict[int, pd.DataFrame], + master_intBCs: Dict[int, List[str]], ) -> pd.DataFrame: - """Identifies which lineage group each cell should belong to. + """Calculates lineage group kinship for each cell. Given a set of cells and a set of lineage groups with their intBCs sets, - identifies which lineage group each cell has the most kinship with. Kinship - is defined as the total UMI count of intBCs shared between the cell and the - intBC set of a lineage group. + calculates lineage group kinship for each cell. Kinship is defined as the + UMI count of intBCs shared between the cell and the intBC set of a lineage + group normalized by the total UMI count of the cell. Args: PIV: A pivot table of cells labled with lineage group assignments @@ -226,8 +227,7 @@ def score_lineage_kinships( Returns: - A DataFrame that contains the lineage group for each cell with the - greatest kinship + A DataFrame that contains the lineage group kinship for each cell. """ dfLG2intBC = pd.DataFrame() @@ -259,67 +259,85 @@ def score_lineage_kinships( # Matrix math dfCellBC2LG = subPIVOT.dot(dfLG2intBC.T) - max_kinship = dfCellBC2LG.max(axis=1) - max_kinship_ind = dfCellBC2LG.apply(lambda x: np.argmax(x), axis=1) - max_kinship_frame = max_kinship.to_frame() - - max_kinship_LG = pd.concat( - [max_kinship_frame, max_kinship_ind + 1], axis=1, sort=True - ) - max_kinship_LG.columns = ["maxOverlap", "lineageGrp"] - - return max_kinship_LG + return dfCellBC2LG def annotate_lineage_groups( - dfMT: pd.DataFrame, - max_kinship_LG: pd.DataFrame, - master_intBCs: Dict[int, pd.DataFrame], + at: pd.DataFrame, + kinship_scores: pd.DataFrame, + doublet_kinship_thresh: float = 0.8, ) -> pd.DataFrame: """ Assign cells in the allele table to a lineage group. - Takes in an allele table and a DataFrame identifying the chosen - lineage group for each cell and annotates the lineage groups in the - original DataFrame. + Takes in an allele table and a DataFrame of kinship scores for each cell + which is used to assign cells to lineage groups. If a cell has a kinship + score above doublet_kinship_thresh, it is assigned to the lineage group + with the highest kinship score. If a cell has a kinship score below + doublet_kinship_thresh, it is assigned to the two lineage groups with the + highest kinship scores. Returns the original allele table with an + additional lineageGrp column. Args: - dfMT: An allele table of cellBC-UMI-allele groups - max_kinship_LG: A DataFrame with the max kinship lineage group for each + at: An allele table of cellBC-UMI-allele groups + kinship_scores: A DataFrame with lineage kinship scores for each cell, see documentation of score_lineage_kinships - master_intBCs: A dictionary relating lineage group to its set of intBCs + doublet_kinship_thresh: A float between 0 and 1 specifying the + the minimum kinship score a cell needs to be assigned to a single + lineage group. Returns: Original allele table with annotated lineage group assignments for cells """ - dfMT["lineageGrp"] = 0 + if doublet_kinship_thresh: + logger.info("Identifying inter-lineage group doublets with" + f" kinship threshold {doublet_kinship_thresh}...") + # Assign cells to lineage groups using kinship scores cellBC2LG = {} - for n in max_kinship_LG.index: - cellBC2LG[n] = max_kinship_LG.loc[n, "lineageGrp"] - - dfMT["lineageGrp"] = dfMT["cellBC"].map(cellBC2LG) - - dfMT["lineageGrp"] = dfMT["lineageGrp"].fillna(value=0) - - lg_sizes = {} + n_doublets = 0 + for cellBC, scores in kinship_scores.iterrows(): + sorted_scores = scores.sort_values(ascending=False) + if doublet_kinship_thresh: + if sorted_scores[0] < doublet_kinship_thresh: + cellBC2LG[cellBC] = [sorted_scores.index[0], + sorted_scores.index[1]] + n_doublets += 1 + else: + cellBC2LG[cellBC] = [sorted_scores.index[0]] + else: + cellBC2LG[cellBC] = [sorted_scores.index[0]] + + if doublet_kinship_thresh: + n_cells = len(cellBC2LG) + logger.debug(f"Identified {n_doublets} inter-group doublets" + f" out of {n_cells} cells") + + # Rename lineage groups based on size + lg_counts = Counter([item for sublist in cellBC2LG.values() for item in sublist]) rename_lg = {} - - for n, g in dfMT.groupby("lineageGrp"): - if n != 0: - lg_sizes[n] = len(g["cellBC"].unique()) - - sorted_by_value = sorted(lg_sizes.items(), key=lambda kv: kv[1])[::-1] - for i, tup in zip(range(1, len(sorted_by_value) + 1), sorted_by_value): - rename_lg[tup[0]] = float(i) - - rename_lg[0] = 0.0 - - dfMT["lineageGrp"] = dfMT.apply(lambda x: rename_lg[x.lineageGrp], axis=1) - - return dfMT + i = 1 + for lg, count in lg_counts.most_common(): + rename_lg[lg] = i + i += 1 + + # Rename lineage groups in cellBC2LG + for cellBC, lgs in cellBC2LG.items(): + if len(lgs) == 1: + cellBC2LG[cellBC] = rename_lg[lgs[0]] + else: + if rename_lg[lgs[0]] < rename_lg[lgs[1]]: + cellBC2LG[cellBC] = (rename_lg[lgs[0]], rename_lg[lgs[1]]) + else: + cellBC2LG[cellBC] = (rename_lg[lgs[1]], rename_lg[lgs[0]]) + + # Add lineageGrp column to allele table + at["lineageGrp"] = at["cellBC"].map(cellBC2LG) + at["lineageGrp"] = at["lineageGrp"].fillna(value=0).astype(str) + + return at def filter_intbcs_final_lineages( diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py index 1df66ffb..81deb5ba 100755 --- a/cassiopeia/preprocess/pipeline.py +++ b/cassiopeia/preprocess/pipeline.py @@ -1015,7 +1015,9 @@ def call_lineage_groups( min_umi_per_intbc: int = 1, min_cluster_prop: float = 0.005, min_intbc_thresh: float = 0.05, - inter_doublet_threshold: float = 0.35, + inter_doublet_threshold: float = None, + doublet_kinship_thresh: float = 0.75, + keep_doublets: bool = False, kinship_thresh: float = 0.25, plot: bool = False, ) -> pd.DataFrame: @@ -1059,6 +1061,15 @@ def call_lineage_groups( inter_doublet_threshold: The threshold specifying the minimum proportion of kinship a cell shares with its assigned lineage group out of all lineage groups for it to be retained during doublet filtering + doublet_kinship_thresh: The threshold specifying the minimum kinship a + cell needs to have with a lineage group in order to be considered a + singlet. Cells with kinship scores below this threshold will be + filtered out or marked as doublets depending on the value of + `keep_doublets`. + keep_doublets: Whether or not to keep doublets in the allele table. If + True, doublets will appear in the allele table with a lineage group + of (lg1, lg2). If False, doublets will be removed from the + allele table. kinship_thresh: The threshold specifying the minimum proportion of intBCs shared between a cell and the intBC set of a lineage group needed to assign that cell to that lineage group in putative @@ -1068,16 +1079,29 @@ def call_lineage_groups( Returns: None, saves output allele table to file. """ + + if inter_doublet_threshold: + logger.warning( + "Doublet filtering with the inter_doublet_threshold parameter is" + " depreciated and will be removed in Cassiopeia 2.1.0. Please use" + " the doublet_kinship_thresh parameter instead." + ) + logger.info( f"{input_df.shape[0]} UMIs (rows), with {input_df.shape[1]} attributes (columns)" ) logger.info(str(len(input_df["cellBC"].unique())) + " Cells") + if min_umi_per_intbc > 1: + logger.info(f"Filtering out intBCs with less than " + f"{min_umi_per_intbc} UMIs...") + input_df = input_df.groupby(['cellBC',"intBC"]).filter( + lambda x: len(x) >= min_umi_per_intbc) + # Create a pivot_table piv = pd.pivot_table( input_df, index="cellBC", columns="intBC", values="UMI", aggfunc="count" ) - # Filter out intBCs with fewer than min_umi_per_intbc UMIs piv[piv < min_umi_per_intbc] = np.nan # Normalize by total UMIs per cell @@ -1108,6 +1132,13 @@ def call_lineage_groups( piv_assigned, min_intbc_thresh=min_intbc_thresh ) + logger.info( + "Redefining lineage groups by removing doublet groups..." + ) + master_LGs, master_intBCs = doublet_utils.filter_doublet_lg_sets( + piv_assigned, master_LGs, master_intBCs + ) + logger.info("Reassigning cells to refined lineage groups by kinship...") kinship_scores = lineage_utils.score_lineage_kinships( piv_assigned, master_LGs, master_intBCs @@ -1115,26 +1146,40 @@ def call_lineage_groups( logger.info("Annotating alignment table with refined lineage groups...") allele_table = lineage_utils.annotate_lineage_groups( - input_df, kinship_scores, master_intBCs + input_df, kinship_scores, doublet_kinship_thresh=doublet_kinship_thresh, ) - if inter_doublet_threshold: + + if doublet_kinship_thresh: + if not keep_doublets: + logger.info("Filtering out inter-lineage group doublets with" + f" kinship threshold {doublet_kinship_thresh}...") + allele_table = allele_table[ + ~allele_table["lineageGrp"].str.startswith("(")] + if inter_doublet_threshold: + logger.warning( + "Ignoring inter_doublet_threshold parameter since" + " doublet_kinship_thresh is set." + ) + elif inter_doublet_threshold: logger.info( - f"Filtering out inter-lineage group doublets with proportion {inter_doublet_threshold}..." + f"Filtering out inter-lineage group doublets with" + f" doublet threshold {inter_doublet_threshold}..." ) allele_table = doublet_utils.filter_inter_doublets( - allele_table, rule=inter_doublet_threshold + allele_table, rule=inter_doublet_threshold, + keep_doublets=keep_doublets ) logger.info( "Filtering out low proportion intBCs in finalized lineage groups..." ) - filtered_lgs = lineage_utils.filter_intbcs_final_lineages( - allele_table, min_intbc_thresh=min_intbc_thresh - ) + #filtered_lgs = lineage_utils.filter_intbcs_final_lineages( + # allele_table, min_intbc_thresh=min_intbc_thresh + #) - allele_table = lineage_utils.filtered_lineage_group_to_allele_table( - filtered_lgs - ) + #allele_table = lineage_utils.filtered_lineage_group_to_allele_table( + # filtered_lgs + #) logger.debug("Final lineage group assignments:") for n, g in allele_table.groupby(["lineageGrp"]): @@ -1146,7 +1191,7 @@ def call_lineage_groups( min_umi_per_cell=int(min_umi_per_cell), min_avg_reads_per_umi=min_avg_reads_per_umi, ) - allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(int) + allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(str) if plot: logger.info("Producing Plots...") From 209e9d83b08cc588ddb6b51a5239e9e4ed3f3e42 Mon Sep 17 00:00:00 2001 From: colganwi Date: Tue, 19 Sep 2023 19:37:21 -0400 Subject: [PATCH 4/4] removed testing comments: --- cassiopeia/preprocess/pipeline.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py index 81deb5ba..9f0bc7bf 100755 --- a/cassiopeia/preprocess/pipeline.py +++ b/cassiopeia/preprocess/pipeline.py @@ -1029,11 +1029,12 @@ def call_lineage_groups( many intBCs they share with each intBC group (kinship). 2. Refines these putative groups by removing non-informative intBCs - and reassigning cells through kinship. + and doublet lineage groups and reassigning cells through kinship. - 3. Removes all inter-lineage doublets, defined as cells that have - relatively equal kinship scores across multiple lineages and whose - assignments are therefore ambigious. + 3. Identifies all inter-lineage doublets, defined as cells that have + max kinship scores below a threshold. If `keep_doublets` is True, + these doublets will be retained in the final allele table with a + lineage group of (lg1, lg2). 4. Finally, performs one more round of filtering non-informative intBCs and cellBCs with low UMI counts before returning a final table of @@ -1173,13 +1174,13 @@ def call_lineage_groups( logger.info( "Filtering out low proportion intBCs in finalized lineage groups..." ) - #filtered_lgs = lineage_utils.filter_intbcs_final_lineages( - # allele_table, min_intbc_thresh=min_intbc_thresh - #) + filtered_lgs = lineage_utils.filter_intbcs_final_lineages( + allele_table, min_intbc_thresh=min_intbc_thresh + ) - #allele_table = lineage_utils.filtered_lineage_group_to_allele_table( - # filtered_lgs - #) + allele_table = lineage_utils.filtered_lineage_group_to_allele_table( + filtered_lgs + ) logger.debug("Final lineage group assignments:") for n, g in allele_table.groupby(["lineageGrp"]):