From ba16e59dc7709d35a9bee32a04cf6c642d2e1b51 Mon Sep 17 00:00:00 2001
From: colganwi <colgan.william@gmail.com>
Date: Mon, 18 Sep 2023 15:26:30 -0400
Subject: [PATCH 1/4] filter umi per intbc

---
 cassiopeia/preprocess/pipeline.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py
index 7349c171..1df66ffb 100755
--- a/cassiopeia/preprocess/pipeline.py
+++ b/cassiopeia/preprocess/pipeline.py
@@ -1012,6 +1012,7 @@ def call_lineage_groups(
     output_directory: str,
     min_umi_per_cell: int = 10,
     min_avg_reads_per_umi: float = 2.0,
+    min_umi_per_intbc: int = 1,
     min_cluster_prop: float = 0.005,
     min_intbc_thresh: float = 0.05,
     inter_doublet_threshold: float = 0.35,
@@ -1046,6 +1047,8 @@ def call_lineage_groups(
         min_avg_reads_per_umi: The threshold specifying the minimum coverage
             (i.e. average) reads per UMI in a cell needed in order for that
             cell not to be filtered during filtering
+        min_umi_per_intbc: The threshold specifying the minimum number of UMIs
+            an intBC needs to have in order to be retained during filtering
         min_cluster_prop: The minimum cluster size in the putative lineage
             assignment step, as a proportion of the number of cells
         min_intbc_thresh: The threshold specifying the minimum proportion of
@@ -1074,6 +1077,10 @@ def call_lineage_groups(
     piv = pd.pivot_table(
         input_df, index="cellBC", columns="intBC", values="UMI", aggfunc="count"
     )
+    # Filter out intBCs with fewer than min_umi_per_intbc UMIs
+    piv[piv < min_umi_per_intbc] = np.nan
+
+    # Normalize by total UMIs per cell
     piv = piv.div(piv.sum(axis=1), axis=0)
 
     # Reorder piv columns by binarized intBC frequency

From 5432ffd53b497376d63609a0bd1546c1db641367 Mon Sep 17 00:00:00 2001
From: colganwi <colgan.william@gmail.com>
Date: Mon, 18 Sep 2023 15:39:05 -0400
Subject: [PATCH 2/4] Lineage group size not strictly decreasing

---
 cassiopeia/preprocess/lineage_utils.py | 34 +++++++++++++++-----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/cassiopeia/preprocess/lineage_utils.py b/cassiopeia/preprocess/lineage_utils.py
index 26938259..a25162c3 100755
--- a/cassiopeia/preprocess/lineage_utils.py
+++ b/cassiopeia/preprocess/lineage_utils.py
@@ -49,26 +49,29 @@ def assign_lineage_groups(
     piv_assigned = pd.DataFrame()
 
     # Loop for iteratively assigning LGs
-    prev_clust_size = np.inf
+    remaining_intBCs = pivot_in.columns.tolist()
     i = 0
-    while prev_clust_size > min_clust_size:
+    while len(remaining_intBCs) > 0 and pivot_in.shape[0] > 0:
         # run function
-        piv_lg, piv_nolg = find_top_lg(
+        piv_lg, piv_nolg, intBC_top = find_top_lg(
             pivot_in,
             i,
             min_intbc_prop=min_intbc_thresh,
             kinship_thresh=kinship_thresh,
+            intbcs = remaining_intBCs,
         )
 
-        # append returned objects to output variable
-        piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)
-
+        # if lineage group larger than min_clust_size
+        if piv_lg.shape[0] > min_clust_size:
+             # append returned objects to output variable
+            piv_assigned = pd.concat([piv_assigned, piv_lg], sort=True)
+            i += 1
+            
         # update pivot_in by removing assigned alignments
         pivot_in = piv_nolg
 
-        prev_clust_size = piv_lg.shape[0]
-
-        i += 1
+        # remove intBC_top from remaining_intBCs
+        remaining_intBCs.remove(intBC_top) 
 
     return piv_assigned
 
@@ -78,6 +81,7 @@ def find_top_lg(
     iteration: int,
     min_intbc_prop: float = 0.2,
     kinship_thresh: float = 0.2,
+    intbcs = List[str],
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
 
     """Algorithm to creates lineage groups from a pivot table of UMI counts
@@ -98,6 +102,7 @@ def find_top_lg(
             the most frequent intBC
         kinship_thresh: Determines the proportion of intBCs that a cell needs
             to share with the cluster in order to included in that cluster
+        intbcs: A list of intBCs to consider for seeding the lineage group
 
     Returns:
         A pivot table of cells labled with lineage group assignments, and a
@@ -105,7 +110,7 @@ def find_top_lg(
     """
 
     # Calculate sum of observed intBCs, identify top intBC
-    intBC_sums = PIVOT_in.sum(0).sort_values(ascending=False)
+    intBC_sums = PIVOT_in.loc[:,intbcs].sum(0).sort_values(ascending=False)
     intBC_top = intBC_sums.index[0]
 
     # Take subset of PIVOT table that contain cells that have the top intBC
@@ -146,11 +151,12 @@ def find_top_lg(
     PIV_LG["lineageGrp"] = iteration + 1
 
     # Print statements
-    logger.debug(
-        f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
-    )
+    if PIV_LG.shape[0] > 0:
+        logger.debug(
+            f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned"
+        )
 
-    return PIV_LG, PIV_noLG
+    return PIV_LG, PIV_noLG, intBC_top
 
 
 def filter_intbcs_lg_sets(

From 24efa19fd1a4d20b481ef77ff9237147f2765a9b Mon Sep 17 00:00:00 2001
From: colganwi <colgan.william@gmail.com>
Date: Tue, 19 Sep 2023 19:05:49 -0400
Subject: [PATCH 3/4] doublet detection

---
 cassiopeia/preprocess/doublet_utils.py |  50 ++++++++++-
 cassiopeia/preprocess/lineage_utils.py | 116 ++++++++++++++-----------
 cassiopeia/preprocess/pipeline.py      |  71 ++++++++++++---
 3 files changed, 174 insertions(+), 63 deletions(-)

diff --git a/cassiopeia/preprocess/doublet_utils.py b/cassiopeia/preprocess/doublet_utils.py
index 1d735209..38893ac7 100644
--- a/cassiopeia/preprocess/doublet_utils.py
+++ b/cassiopeia/preprocess/doublet_utils.py
@@ -3,7 +3,7 @@
 Invoked through pipeline.py and supports the filter_molecule_table and
 call_lineage_groups functions.
 """
-from typing import Dict, Set, Tuple
+from typing import Dict, Set, Tuple, List
 
 import pandas as pd
 
@@ -168,3 +168,51 @@ def filter_inter_doublets(at: pd.DataFrame, rule: float = 0.35) -> pd.DataFrame:
     n_cells = at["cellBC"].nunique()
     logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells")
     return at[at["cellBC"].isin(passing_cellBCs)]
+
+
+def filter_doublet_lg_sets(
+        PIV: pd.DataFrame,
+        master_LGs: List[int], 
+        master_intBCs: Dict[int, List[str]]
+) -> Tuple[List[int], Dict[int, List[str]]]:
+    """Filters out lineage groups that are likely doublets.
+
+    Essentially, filters out lineage groups that have a high proportion of
+    intBCs that are shared with other lineage groups. For every lineage group,
+    calculates the proportion of intBCs that are shared with every pair of two
+    other lineage groups. If the proportion is > .8, then the lineage group
+    is filtered out.
+
+    Args:
+        PIV: A pivot table of cellBC-intBC-allele groups to be filtered
+        master_LGs: A list of lineage groups to be filtered
+        master_intBCs: A dictionary that has mappings from the lineage group
+            number to the set of intBCs being used for reconstruction
+
+    Returns:
+        A filtered list of lineage groups and a filtered dictionary of intBCs
+            for each lineage group
+    """
+    lg_sorted = (PIV.value_counts('lineageGrp')
+        .reset_index().sort_values('lineageGrp', ascending=False))
+
+    for lg in lg_sorted['lineageGrp']:
+        lg = tuple([lg])
+        filtered = False
+        lg_intBC = set(master_intBCs[lg])
+        for lg_i in master_LGs:
+            for lg_j in master_LGs:
+                if lg == lg_i or lg == lg_j:
+                    continue
+                pair_intBC = set(master_intBCs[lg_i]).union(set(master_intBCs[lg_j]))
+                if len(pair_intBC.intersection(lg_intBC)) > len(lg_intBC) * .8:
+                    master_LGs.remove(lg)
+                    master_intBCs.pop(lg)
+                    logger.debug(f"Filtered lineage group {lg} as a doublet"
+                                 f" of {lg_i} and {lg_j}")
+                    filtered = True
+                    break
+            if filtered:
+                break
+
+    return master_LGs, master_intBCs
\ No newline at end of file
diff --git a/cassiopeia/preprocess/lineage_utils.py b/cassiopeia/preprocess/lineage_utils.py
index a25162c3..26704e0b 100755
--- a/cassiopeia/preprocess/lineage_utils.py
+++ b/cassiopeia/preprocess/lineage_utils.py
@@ -8,6 +8,7 @@
 
 from typing import Dict, List, Tuple
 
+from collections import Counter
 from matplotlib import colors, colorbar
 import matplotlib.pyplot as plt
 import numpy as np
@@ -209,14 +210,14 @@ def filter_intbcs_lg_sets(
 def score_lineage_kinships(
     PIV: pd.DataFrame,
     master_LGs: List[int],
-    master_intBCs: Dict[int, pd.DataFrame],
+    master_intBCs: Dict[int, List[str]],
 ) -> pd.DataFrame:
-    """Identifies which lineage group each cell should belong to.
+    """Calculates lineage group kinship for each cell.
 
     Given a set of cells and a set of lineage groups with their intBCs sets,
-    identifies which lineage group each cell has the most kinship with. Kinship
-    is defined as the total UMI count of intBCs shared between the cell and the
-    intBC set of a lineage group.
+    calculates lineage group kinship for each cell. Kinship is defined as the 
+    UMI count of intBCs shared between the cell and the intBC set of a lineage 
+    group normalized by the total UMI count of the cell.
 
     Args:
         PIV: A pivot table of cells labled with lineage group assignments
@@ -226,8 +227,7 @@ def score_lineage_kinships(
 
 
     Returns:
-        A DataFrame that contains the lineage group for each cell with the
-            greatest kinship
+        A DataFrame that contains the lineage group kinship for each cell.
     """
 
     dfLG2intBC = pd.DataFrame()
@@ -259,67 +259,85 @@ def score_lineage_kinships(
 
     # Matrix math
     dfCellBC2LG = subPIVOT.dot(dfLG2intBC.T)
-    max_kinship = dfCellBC2LG.max(axis=1)
 
-    max_kinship_ind = dfCellBC2LG.apply(lambda x: np.argmax(x), axis=1)
-    max_kinship_frame = max_kinship.to_frame()
-
-    max_kinship_LG = pd.concat(
-        [max_kinship_frame, max_kinship_ind + 1], axis=1, sort=True
-    ) 
-    max_kinship_LG.columns = ["maxOverlap", "lineageGrp"]
-
-    return max_kinship_LG
+    return dfCellBC2LG
 
 
 def annotate_lineage_groups(
-    dfMT: pd.DataFrame,
-    max_kinship_LG: pd.DataFrame,
-    master_intBCs: Dict[int, pd.DataFrame],
+    at: pd.DataFrame,
+    kinship_scores: pd.DataFrame,
+    doublet_kinship_thresh: float = 0.8,
 ) -> pd.DataFrame:
     """
     Assign cells in the allele table to a lineage group.
 
-    Takes in an allele table and a DataFrame identifying the chosen
-    lineage group for each cell and annotates the lineage groups in the
-    original DataFrame.
+    Takes in an allele table and a DataFrame of kinship scores for each cell
+    which is used to assign cells to lineage groups. If a cell has a kinship
+    score above doublet_kinship_thresh, it is assigned to the lineage group
+    with the highest kinship score. If a cell has a kinship score below
+    doublet_kinship_thresh, it is assigned to the two lineage groups with the
+    highest kinship scores. Returns the original allele table with an
+    additional lineageGrp column.
 
     Args:
-        dfMT: An allele table of cellBC-UMI-allele groups
-        max_kinship_LG: A DataFrame with the max kinship lineage group for each
+        at: An allele table of cellBC-UMI-allele groups
+        kinship_scores: A DataFrame with lineage kinship scores for each
             cell, see documentation of score_lineage_kinships
-        master_intBCs: A dictionary relating lineage group to its set of intBCs
+        doublet_kinship_thresh: A float between 0 and 1 specifying the
+            the minimum kinship score a cell needs to be assigned to a single
+            lineage group.
 
     Returns:
         Original allele table with annotated lineage group assignments for cells
     """
 
-    dfMT["lineageGrp"] = 0
+    if doublet_kinship_thresh:
+        logger.info("Identifying inter-lineage group doublets with"
+                    f" kinship threshold {doublet_kinship_thresh}...")
 
+    # Assign cells to lineage groups using kinship scores
     cellBC2LG = {}
-    for n in max_kinship_LG.index:
-        cellBC2LG[n] = max_kinship_LG.loc[n, "lineageGrp"]
-
-    dfMT["lineageGrp"] = dfMT["cellBC"].map(cellBC2LG)
-
-    dfMT["lineageGrp"] = dfMT["lineageGrp"].fillna(value=0)
-
-    lg_sizes = {}
+    n_doublets = 0
+    for cellBC, scores in kinship_scores.iterrows():
+        sorted_scores = scores.sort_values(ascending=False)
+        if doublet_kinship_thresh:
+            if sorted_scores[0] < doublet_kinship_thresh:
+                cellBC2LG[cellBC] = [sorted_scores.index[0],
+                                     sorted_scores.index[1]]
+                n_doublets += 1
+            else:
+                cellBC2LG[cellBC] = [sorted_scores.index[0]]
+        else:
+            cellBC2LG[cellBC] = [sorted_scores.index[0]]
+
+    if doublet_kinship_thresh:
+        n_cells = len(cellBC2LG)
+        logger.debug(f"Identified {n_doublets} inter-group doublets"
+                    f" out of {n_cells} cells")
+
+    # Rename lineage groups based on size
+    lg_counts = Counter([item for sublist in cellBC2LG.values() for item in sublist])
     rename_lg = {}
-
-    for n, g in dfMT.groupby("lineageGrp"):
-        if n != 0:
-            lg_sizes[n] = len(g["cellBC"].unique())
-
-    sorted_by_value = sorted(lg_sizes.items(), key=lambda kv: kv[1])[::-1]
-    for i, tup in zip(range(1, len(sorted_by_value) + 1), sorted_by_value):
-        rename_lg[tup[0]] = float(i)
-
-    rename_lg[0] = 0.0
-
-    dfMT["lineageGrp"] = dfMT.apply(lambda x: rename_lg[x.lineageGrp], axis=1)
-
-    return dfMT
+    i = 1
+    for lg, count in lg_counts.most_common():
+        rename_lg[lg] = i
+        i += 1
+
+    # Rename lineage groups in cellBC2LG
+    for cellBC, lgs in cellBC2LG.items():
+        if len(lgs) == 1:
+            cellBC2LG[cellBC] = rename_lg[lgs[0]]
+        else:
+            if rename_lg[lgs[0]] < rename_lg[lgs[1]]:
+                cellBC2LG[cellBC] = (rename_lg[lgs[0]], rename_lg[lgs[1]])
+            else:
+                cellBC2LG[cellBC] = (rename_lg[lgs[1]], rename_lg[lgs[0]])
+
+    # Add lineageGrp column to allele table
+    at["lineageGrp"] = at["cellBC"].map(cellBC2LG)
+    at["lineageGrp"] = at["lineageGrp"].fillna(value=0).astype(str)
+
+    return at
 
 
 def filter_intbcs_final_lineages(
diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py
index 1df66ffb..81deb5ba 100755
--- a/cassiopeia/preprocess/pipeline.py
+++ b/cassiopeia/preprocess/pipeline.py
@@ -1015,7 +1015,9 @@ def call_lineage_groups(
     min_umi_per_intbc: int = 1,
     min_cluster_prop: float = 0.005,
     min_intbc_thresh: float = 0.05,
-    inter_doublet_threshold: float = 0.35,
+    inter_doublet_threshold: float = None,
+    doublet_kinship_thresh: float = 0.75,
+    keep_doublets: bool = False,
     kinship_thresh: float = 0.25,
     plot: bool = False,
 ) -> pd.DataFrame:
@@ -1059,6 +1061,15 @@ def call_lineage_groups(
         inter_doublet_threshold: The threshold specifying the minimum proportion
             of kinship a cell shares with its assigned lineage group out of all
             lineage groups for it to be retained during doublet filtering
+        doublet_kinship_thresh: The threshold specifying the minimum kinship a 
+            cell needs to have with a lineage group in order to be considered a
+            singlet. Cells with kinship scores below this threshold will be 
+            filtered out or marked as doublets depending on the value of
+            `keep_doublets`.
+        keep_doublets: Whether or not to keep doublets in the allele table. If 
+            True, doublets will appear in the allele table with a lineage group
+            of (lg1, lg2). If False, doublets will be removed from the 
+            allele table.
         kinship_thresh: The threshold specifying the minimum proportion of
             intBCs shared between a cell and the intBC set of a lineage group
             needed to assign that cell to that lineage group in putative
@@ -1068,16 +1079,29 @@ def call_lineage_groups(
     Returns:
         None, saves output allele table to file.
     """
+
+    if inter_doublet_threshold:
+        logger.warning(
+        "Doublet filtering with the inter_doublet_threshold parameter is"
+        " depreciated and will be removed in Cassiopeia 2.1.0. Please use"
+        " the doublet_kinship_thresh parameter instead."
+        )
+
     logger.info(
         f"{input_df.shape[0]} UMIs (rows), with {input_df.shape[1]} attributes (columns)"
     )
     logger.info(str(len(input_df["cellBC"].unique())) + " Cells")
 
+    if min_umi_per_intbc > 1:
+        logger.info(f"Filtering out intBCs with less than "
+                    f"{min_umi_per_intbc} UMIs...")
+        input_df = input_df.groupby(['cellBC',"intBC"]).filter(
+            lambda x: len(x) >= min_umi_per_intbc)
+
     # Create a pivot_table
     piv = pd.pivot_table(
         input_df, index="cellBC", columns="intBC", values="UMI", aggfunc="count"
     )
-    # Filter out intBCs with fewer than min_umi_per_intbc UMIs
     piv[piv < min_umi_per_intbc] = np.nan
 
     # Normalize by total UMIs per cell
@@ -1108,6 +1132,13 @@ def call_lineage_groups(
         piv_assigned, min_intbc_thresh=min_intbc_thresh
     )
 
+    logger.info(
+        "Redefining lineage groups by removing doublet groups..."
+    )
+    master_LGs, master_intBCs = doublet_utils.filter_doublet_lg_sets(
+        piv_assigned, master_LGs, master_intBCs
+    )
+
     logger.info("Reassigning cells to refined lineage groups by kinship...")
     kinship_scores = lineage_utils.score_lineage_kinships(
         piv_assigned, master_LGs, master_intBCs
@@ -1115,26 +1146,40 @@ def call_lineage_groups(
 
     logger.info("Annotating alignment table with refined lineage groups...")
     allele_table = lineage_utils.annotate_lineage_groups(
-        input_df, kinship_scores, master_intBCs
+        input_df, kinship_scores, doublet_kinship_thresh=doublet_kinship_thresh,
     )
-    if inter_doublet_threshold:
+
+    if doublet_kinship_thresh:
+        if not keep_doublets:
+            logger.info("Filtering out inter-lineage group doublets with"
+                        f" kinship threshold {doublet_kinship_thresh}...")
+            allele_table = allele_table[
+                ~allele_table["lineageGrp"].str.startswith("(")]
+        if inter_doublet_threshold:
+            logger.warning(
+                "Ignoring inter_doublet_threshold parameter since"
+                " doublet_kinship_thresh is set."
+            )
+    elif inter_doublet_threshold:
         logger.info(
-            f"Filtering out inter-lineage group doublets with proportion {inter_doublet_threshold}..."
+            f"Filtering out inter-lineage group doublets with"
+            f" doublet threshold {inter_doublet_threshold}..."
         )
         allele_table = doublet_utils.filter_inter_doublets(
-            allele_table, rule=inter_doublet_threshold
+            allele_table, rule=inter_doublet_threshold,
+            keep_doublets=keep_doublets
         )
 
     logger.info(
         "Filtering out low proportion intBCs in finalized lineage groups..."
     )
-    filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
-        allele_table, min_intbc_thresh=min_intbc_thresh
-    )
+    #filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
+    #    allele_table, min_intbc_thresh=min_intbc_thresh
+    #)
 
-    allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
-        filtered_lgs
-    )
+    #allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
+    #    filtered_lgs
+    #)
 
     logger.debug("Final lineage group assignments:")
     for n, g in allele_table.groupby(["lineageGrp"]):
@@ -1146,7 +1191,7 @@ def call_lineage_groups(
         min_umi_per_cell=int(min_umi_per_cell),
         min_avg_reads_per_umi=min_avg_reads_per_umi,
     )
-    allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(int)
+    allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(str)
 
     if plot:
         logger.info("Producing Plots...")

From 209e9d83b08cc588ddb6b51a5239e9e4ed3f3e42 Mon Sep 17 00:00:00 2001
From: colganwi <colgan.william@gmail.com>
Date: Tue, 19 Sep 2023 19:37:21 -0400
Subject: [PATCH 4/4] removed testing comments:

---
 cassiopeia/preprocess/pipeline.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/cassiopeia/preprocess/pipeline.py b/cassiopeia/preprocess/pipeline.py
index 81deb5ba..9f0bc7bf 100755
--- a/cassiopeia/preprocess/pipeline.py
+++ b/cassiopeia/preprocess/pipeline.py
@@ -1029,11 +1029,12 @@ def call_lineage_groups(
         many intBCs they share with each intBC group (kinship).
 
         2. Refines these putative groups by removing non-informative intBCs
-        and reassigning cells through kinship.
+        and doublet lineage groups and reassigning cells through kinship.
 
-        3. Removes all inter-lineage doublets, defined as cells that have
-        relatively equal kinship scores across multiple lineages and whose
-        assignments are therefore ambigious.
+        3. Identifies all inter-lineage doublets, defined as cells that have
+        max kinship scores below a threshold. If `keep_doublets` is True,
+        these doublets will be retained in the final allele table with a
+        lineage group of (lg1, lg2).
 
         4. Finally, performs one more round of filtering non-informative intBCs
         and cellBCs with low UMI counts before returning a final table of
@@ -1173,13 +1174,13 @@ def call_lineage_groups(
     logger.info(
         "Filtering out low proportion intBCs in finalized lineage groups..."
     )
-    #filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
-    #    allele_table, min_intbc_thresh=min_intbc_thresh
-    #)
+    filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
+        allele_table, min_intbc_thresh=min_intbc_thresh
+    )
 
-    #allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
-    #    filtered_lgs
-    #)
+    allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
+        filtered_lgs
+    )
 
     logger.debug("Final lineage group assignments:")
     for n, g in allele_table.groupby(["lineageGrp"]):