icbi-lab · redst4r · Sep 25, 2022 · Oct 15, 2022 · Oct 17, 2022 · Oct 17, 2022
diff --git a/src/infercnvpy/tl/_infercnv.py b/src/infercnvpy/tl/_infercnv.py
@@ -4,6 +4,7 @@
 from typing import Sequence, Tuple, Union
 
 import numpy as np
+import pandas as pd
 import scipy.ndimage
 import scipy.sparse
 from anndata import AnnData
@@ -30,7 +31,7 @@ def infercnv(
     inplace: bool = True,
     layer: Union[str, None] = None,
     key_added: str = "cnv",
-) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix]]:
+) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix, pd.DataFrame]]:
     """
     Infer Copy Number Variation (CNV) by averaging gene expression over genomic regions.
 
@@ -111,7 +112,7 @@ def infercnv(
 
     var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]]  # type: ignore
 
-    chr_pos, chunks = zip(
+    chr_pos, chunks, convolved_dfs = zip(
         *process_map(
             _infercnv_chunk,
             [expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
@@ -126,13 +127,21 @@ def infercnv(
         )
     )
     res = scipy.sparse.vstack(chunks)
+    convolved_dfs = convolved_dfs[0]  # since each chunk returns the same df
+    chr_pos = chr_pos[0]
+
+    # annotate the genomic range of each window
+    start_dict = var["start"].to_dict()
+    stop_dict = var["end"].to_dict()
+    convolved_dfs["start"] = convolved_dfs["genes"].apply(lambda x: start_dict[x[0]])  # start of the first gene
+    convolved_dfs["end"] = convolved_dfs["genes"].apply(lambda x: stop_dict[x[-1]])  # stop of the last gene
 
     if inplace:
         adata.obsm[f"X_{key_added}"] = res
-        adata.uns[key_added] = {"chr_pos": chr_pos[0]}
+        adata.uns[key_added] = {"chr_pos": chr_pos, "df": convolved_dfs}
 
     else:
-        return chr_pos[0], res
+        return chr_pos, res, convolved_dfs
 
 
 def _natural_sort(l: Sequence):
@@ -166,11 +175,15 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
     """
     if n < x.shape[1]:  # regular convolution: the filter is smaller than the #genes
         r = np.arange(1, n + 1)
-
         pyramid = np.minimum(r, r[::-1])
-        smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
-            pyramid
-        )
+        smoothed_x = np.apply_along_axis(
+            lambda row: np.convolve(
+                row,
+                pyramid,
+            ),
+            axis=1,
+            arr=x,
+        ) / np.sum(pyramid)
         return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
 
     else:  # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
@@ -181,7 +194,7 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
         return smoothed_x
 
 
-def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray]:
+def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray, pd.DataFrame]:
     """Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.
 
     Parameters
@@ -207,13 +220,23 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.
     def _running_mean_for_chromosome(chr):
         genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
         tmp_x = expr[:, var.index.get_indexer(genes)]
-        return _running_mean(tmp_x, n=window_size, step=step)
+        x_conv = _running_mean(tmp_x, n=window_size, step=step)
+        convolved_gene_names = _gene_list_convolve(genes, window_size=window_size - 1, step=step, mode="same")
+        assert len(convolved_gene_names) == x_conv.shape[1], f"{len(convolved_gene_names)} vs {x_conv.shape[1]}"
+        # DataFrame containing all the genes that go into a specific position
+        convolved_df = pd.DataFrame({"genes": convolved_gene_names, "chromosome": chr})
+
+        return x_conv, convolved_df
 
     running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]
+    running_means, convolved_dfs = zip(*running_means)
 
-    chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))}
+    convolved_dfs = pd.concat(convolved_dfs)  # since its a list of dfs before
+    convolved_dfs.index.name = "relative_position"
+    convolved_dfs.reset_index(inplace=True)
 
-    return chr_start_pos, np.hstack(running_means)
+    chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))}
+    return chr_start_pos, np.hstack(running_means), convolved_dfs
 
 
 def _get_reference(
@@ -289,7 +312,7 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
     # Step 2 - clip log fold changes
     x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
     # Step 3 - smooth by genomic position
-    chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
+    chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
     # Step 4 - center by cell
     x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
 
@@ -302,4 +325,25 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
 
     x_res = scipy.sparse.csr_matrix(x_res)
 
-    return chr_pos, x_res
+    return chr_pos, x_res, conv_df
+
+
+def _gene_list_convolve(gene_list, window_size, step, mode):
+    """
+    emulate what happens with the convolution on th expression, just pretending to convolve the gene_list
+    i.e. we group together the genes that get convolved at each position
+    """
+    gene_dict = {}
+
+    len_threshold = (
+        0 if mode == "same" else window_size - 1
+    )  # towards the end, the gene list will get shorter due to lack of overlap
+    # convolving with "same", the gene list will gradually get shorter until 0. for mode==valid, the last convole will still have len==windowlength
+
+    for i in range(len(gene_list)):
+        start = i * step
+        stop = start + window_size
+        x = gene_list[start:stop]
+        if len(x) > len_threshold:
+            gene_dict[i] = x
+    return pd.Series(gene_dict)