YosefLab · colganwi · Sep 18, 2023 · Sep 18, 2023 · Sep 19, 2023 · Sep 19, 2023
diff --git a/cassiopeia/preprocess/doublet_utils.py b/cassiopeia/preprocess/doublet_utils.py
@@ -3,7 +3,7 @@
 Invoked through pipeline.py and supports the filter_molecule_table and
 call_lineage_groups functions.
 """
-from typing import Dict, Set, Tuple
+from typing import Dict, Set, Tuple, List
 
 import pandas as pd
 
@@ -168,3 +168,51 @@ def filter_inter_doublets(at: pd.DataFrame, rule: float = 0.35) -> pd.DataFrame:
     n_cells = at["cellBC"].nunique()
     logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells")
     return at[at["cellBC"].isin(passing_cellBCs)]
+
+
+def filter_doublet_lg_sets(
+        PIV: pd.DataFrame,
+        master_LGs: List[int], 
+        master_intBCs: Dict[int, List[str]]
+) -> Tuple[List[int], Dict[int, List[str]]]:
+    """Filters out lineage groups that are likely doublets.
+
+    Essentially, filters out lineage groups that have a high proportion of
+    intBCs that are shared with other lineage groups. For every lineage group,
+    calculates the proportion of intBCs that are shared with every pair of two
+    other lineage groups. If the proportion is > .8, then the lineage group
+    is filtered out.
+
+    Args:
+        PIV: A pivot table of cellBC-intBC-allele groups to be filtered
+        master_LGs: A list of lineage groups to be filtered
+        master_intBCs: A dictionary that has mappings from the lineage group
+            number to the set of intBCs being used for reconstruction
+
+    Returns:
+        A filtered list of lineage groups and a filtered dictionary of intBCs
+            for each lineage group
+    """
+    lg_sorted = (PIV.value_counts('lineageGrp')
+        .reset_index().sort_values('lineageGrp', ascending=False))
+
+    for lg in lg_sorted['lineageGrp']:
+        lg = tuple([lg])
+        filtered = False
+        lg_intBC = set(master_intBCs[lg])
+        for lg_i in master_LGs:
+            for lg_j in master_LGs:
+                if lg == lg_i or lg == lg_j:
+                    continue
+                pair_intBC = set(master_intBCs[lg_i]).union(set(master_intBCs[lg_j]))
+                if len(pair_intBC.intersection(lg_intBC)) > len(lg_intBC) * .8:
+                    master_LGs.remove(lg)
+                    master_intBCs.pop(lg)
+                    logger.debug(f"Filtered lineage group {lg} as a doublet"
+                                 f" of {lg_i} and {lg_j}")
+                    filtered = True
+                    break
+            if filtered:
+                break
+
+    return master_LGs, master_intBCs
diff --git a/cassiopeia/preprocess/lineage_utils.py b/cassiopeia/preprocess/lineage_utils.py
@@ -8,6 +8,7 @@
 
 from typing import Dict, List, Tuple
 
+from collections import Counter
 from matplotlib import colors, colorbar
 import matplotlib.pyplot as plt
 import numpy as np
@@ -49,26 +50,29 @@ def assign_lineage_groups(
     piv_assigned = pd.DataFrame()
 
     # Loop for iteratively assigning LGs
-    prev_clust_size = np.inf
+    remaining_intBCs = pivot_in.columns.tolist()
     i = 0
-    while prev_clust_size > min_clust_size:
+    while len(remaining_intBCs) > 0 and pivot_in.shape[0] > 0:
         # run function
-        piv_lg, piv_nolg = find_top_lg(
+        piv_lg, piv_nolg, intBC_top = find_top_lg(
             pivot_in,
             i,
             min_intbc_prop=min_intbc_thresh,
             kinship_thresh=kinship_thresh,
+            intbcs = remaining_intBCs,
         )
 
-        # append returned objects to output variable
-        piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)
-
+        # if lineage group larger than min_clust_size
+        if piv_lg.shape[0] > min_clust_size:
+             # append returned objects to output variable
+            piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)
+            i += 1
+
         # update pivot_in by removing assigned alignments
         pivot_in = piv_nolg
 
-        prev_clust_size = piv_lg.shape[0]
-
-        i += 1
+        # remove intBC_top from remaining_intBCs
+        remaining_intBCs.remove(intBC_top) 
 
     return piv_assigned
 
@@ -78,6 +82,7 @@ def find_top_lg(
     iteration: int,
     min_intbc_prop: float = 0.2,
     kinship_thresh: float = 0.2,
+    intbcs = List[str],
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
 
     """Algorithm to creates lineage groups from a pivot table of UMI counts
@@ -98,14 +103,15 @@ def find_top_lg(
             the most frequent intBC
         kinship_thresh: Determines the proportion of intBCs that a cell needs
             to share with the cluster in order to included in that cluster
+        intbcs: A list of intBCs to consider for seeding the lineage group
 
     Returns:
         A pivot table of cells labled with lineage group assignments, and a
             pivot table of the remaining unassigned cells
     """
 
     # Calculate sum of observed intBCs, identify top intBC
-    intBC_sums = PIVOT_in.sum(0).sort_values(ascending=False)
+    intBC_sums = PIVOT_in.loc[:,intbcs].sum(0).sort_values(ascending=False)
     intBC_top = intBC_sums.index[0]
 
     # Take subset of PIVOT table that contain cells that have the top intBC
@@ -146,11 +152,12 @@ def find_top_lg(
     PIV_LG["lineageGrp"] = iteration + 1
 
     # Print statements
-    logger.debug(
-        f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
-    )
+    if PIV_LG.shape[0] > 0:
+        logger.debug(
+            f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
+        )
 
-    return PIV_LG, PIV_noLG
+    return PIV_LG, PIV_noLG, intBC_top
 
 
 def filter_intbcs_lg_sets(
@@ -203,14 +210,14 @@ def filter_intbcs_lg_sets(
 def score_lineage_kinships(
     PIV: pd.DataFrame,
     master_LGs: List[int],
-    master_intBCs: Dict[int, pd.DataFrame],
+    master_intBCs: Dict[int, List[str]],
 ) -> pd.DataFrame:
-    """Identifies which lineage group each cell should belong to.
+    """Calculates lineage group kinship for each cell.
 
     Given a set of cells and a set of lineage groups with their intBCs sets,
-    identifies which lineage group each cell has the most kinship with. Kinship
-    is defined as the total UMI count of intBCs shared between the cell and the
-    intBC set of a lineage group.
+    calculates lineage group kinship for each cell. Kinship is defined as the 
+    UMI count of intBCs shared between the cell and the intBC set of a lineage 
+    group normalized by the total UMI count of the cell.
 
     Args:
         PIV: A pivot table of cells labled with lineage group assignments
@@ -220,8 +227,7 @@ def score_lineage_kinships(
 
 
     Returns:
-        A DataFrame that contains the lineage group for each cell with the
-            greatest kinship
+        A DataFrame that contains the lineage group kinship for each cell.
     """
 
     dfLG2intBC = pd.DataFrame()
@@ -253,67 +259,85 @@ def score_lineage_kinships(
 
     # Matrix math
     dfCellBC2LG = subPIVOT.dot(dfLG2intBC.T)
-    max_kinship = dfCellBC2LG.max(axis=1)
-
-    max_kinship_ind = dfCellBC2LG.apply(lambda x: np.argmax(x), axis=1)
-    max_kinship_frame = max_kinship.to_frame()
 
-    max_kinship_LG = pd.concat(
-        [max_kinship_frame, max_kinship_ind + 1], axis=1, sort=True
-    ) 
-    max_kinship_LG.columns = ["maxOverlap", "lineageGrp"]
-
-    return max_kinship_LG
+    return dfCellBC2LG
 
 
 def annotate_lineage_groups(
-    dfMT: pd.DataFrame,
-    max_kinship_LG: pd.DataFrame,
-    master_intBCs: Dict[int, pd.DataFrame],
+    at: pd.DataFrame,
+    kinship_scores: pd.DataFrame,
+    doublet_kinship_thresh: float = 0.8,
 ) -> pd.DataFrame:
     """
     Assign cells in the allele table to a lineage group.
 
-    Takes in an allele table and a DataFrame identifying the chosen
-    lineage group for each cell and annotates the lineage groups in the
-    original DataFrame.
+    Takes in an allele table and a DataFrame of kinship scores for each cell
+    which is used to assign cells to lineage groups. If a cell has a kinship
+    score above doublet_kinship_thresh, it is assigned to the lineage group
+    with the highest kinship score. If a cell has a kinship score below
+    doublet_kinship_thresh, it is assigned to the two lineage groups with the
+    highest kinship scores. Returns the original allele table with an
+    additional lineageGrp column.
 
     Args:
-        dfMT: An allele table of cellBC-UMI-allele groups
-        max_kinship_LG: A DataFrame with the max kinship lineage group for each
+        at: An allele table of cellBC-UMI-allele groups
+        kinship_scores: A DataFrame with lineage kinship scores for each
             cell, see documentation of score_lineage_kinships
-        master_intBCs: A dictionary relating lineage group to its set of intBCs
+        doublet_kinship_thresh: A float between 0 and 1 specifying the
+            the minimum kinship score a cell needs to be assigned to a single
+            lineage group.
 
     Returns:
         Original allele table with annotated lineage group assignments for cells
     """
 
-    dfMT["lineageGrp"] = 0
+    if doublet_kinship_thresh:
+        logger.info("Identifying inter-lineage group doublets with"
+                    f" kinship threshold {doublet_kinship_thresh}...")
 
+    # Assign cells to lineage groups using kinship scores
     cellBC2LG = {}
-    for n in max_kinship_LG.index:
-        cellBC2LG[n] = max_kinship_LG.loc[n, "lineageGrp"]
-
-    dfMT["lineageGrp"] = dfMT["cellBC"].map(cellBC2LG)
-
-    dfMT["lineageGrp"] = dfMT["lineageGrp"].fillna(value=0)
-
-    lg_sizes = {}
+    n_doublets = 0
+    for cellBC, scores in kinship_scores.iterrows():
+        sorted_scores = scores.sort_values(ascending=False)
+        if doublet_kinship_thresh:
+            if sorted_scores[0] < doublet_kinship_thresh:
+                cellBC2LG[cellBC] = [sorted_scores.index[0],
+                                     sorted_scores.index[1]]
+                n_doublets += 1
+            else:
+                cellBC2LG[cellBC] = [sorted_scores.index[0]]
+        else:
+            cellBC2LG[cellBC] = [sorted_scores.index[0]]
+
+    if doublet_kinship_thresh:
+        n_cells = len(cellBC2LG)
+        logger.debug(f"Identified {n_doublets} inter-group doublets"
+                    f" out of {n_cells} cells")
+
+    # Rename lineage groups based on size
+    lg_counts = Counter([item for sublist in cellBC2LG.values() for item in sublist])
     rename_lg = {}
+    i = 1
+    for lg, count in lg_counts.most_common():
+        rename_lg[lg] = i
+        i += 1
 
-    for n, g in dfMT.groupby("lineageGrp"):
-        if n != 0:
-            lg_sizes[n] = len(g["cellBC"].unique())
-
-    sorted_by_value = sorted(lg_sizes.items(), key=lambda kv: kv[1])[::-1]
-    for i, tup in zip(range(1, len(sorted_by_value) + 1), sorted_by_value):
-        rename_lg[tup[0]] = float(i)
-
-    rename_lg[0] = 0.0
-
-    dfMT["lineageGrp"] = dfMT.apply(lambda x: rename_lg[x.lineageGrp], axis=1)
-
-    return dfMT
+    # Rename lineage groups in cellBC2LG
+    for cellBC, lgs in cellBC2LG.items():
+        if len(lgs) == 1:
+            cellBC2LG[cellBC] = rename_lg[lgs[0]]
+        else:
+            if rename_lg[lgs[0]] < rename_lg[lgs[1]]:
+                cellBC2LG[cellBC] = (rename_lg[lgs[0]], rename_lg[lgs[1]])
+            else:
+                cellBC2LG[cellBC] = (rename_lg[lgs[1]], rename_lg[lgs[0]])
+
+    # Add lineageGrp column to allele table
+    at["lineageGrp"] = at["cellBC"].map(cellBC2LG)
+    at["lineageGrp"] = at["lineageGrp"].fillna(value=0).astype(str)
+
+    return at
 
 
 def filter_intbcs_final_lineages(