diff --git a/src/infercnvpy/tl/_infercnv.py b/src/infercnvpy/tl/_infercnv.py index 28d764f..22b663a 100644 --- a/src/infercnvpy/tl/_infercnv.py +++ b/src/infercnvpy/tl/_infercnv.py @@ -4,6 +4,7 @@ from typing import Sequence, Tuple, Union import numpy as np +import pandas as pd import scipy.ndimage import scipy.sparse from anndata import AnnData @@ -30,7 +31,7 @@ def infercnv( inplace: bool = True, layer: Union[str, None] = None, key_added: str = "cnv", -) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix]]: +) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix, pd.DataFrame]]: """ Infer Copy Number Variation (CNV) by averaging gene expression over genomic regions. @@ -111,7 +112,7 @@ def infercnv( var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]] # type: ignore - chr_pos, chunks = zip( + chr_pos, chunks, convolved_dfs = zip( *process_map( _infercnv_chunk, [expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)], @@ -126,13 +127,21 @@ def infercnv( ) ) res = scipy.sparse.vstack(chunks) + convolved_dfs = convolved_dfs[0] # since each chunk returns the same df + chr_pos = chr_pos[0] + + # annotate the genomic range of each window + start_dict = var["start"].to_dict() + stop_dict = var["end"].to_dict() + convolved_dfs["start"] = convolved_dfs["genes"].apply(lambda x: start_dict[x[0]]) # start of the first gene + convolved_dfs["end"] = convolved_dfs["genes"].apply(lambda x: stop_dict[x[-1]]) # stop of the last gene if inplace: adata.obsm[f"X_{key_added}"] = res - adata.uns[key_added] = {"chr_pos": chr_pos[0]} + adata.uns[key_added] = {"chr_pos": chr_pos, "df": convolved_dfs} else: - return chr_pos[0], res + return chr_pos, res, convolved_dfs def _natural_sort(l: Sequence): @@ -166,11 +175,15 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step """ if n < x.shape[1]: # regular convolution: the filter is smaller than the #genes r = np.arange(1, n + 1) - pyramid = np.minimum(r, r[::-1]) - smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum( - pyramid - ) + smoothed_x = np.apply_along_axis( + lambda row: np.convolve( + row, + pyramid, + ), + axis=1, + arr=x, + ) / np.sum(pyramid) return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)] else: # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding) @@ -181,7 +194,7 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step return smoothed_x -def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray]: +def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray, pd.DataFrame]: """Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome. Parameters @@ -207,13 +220,23 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np. def _running_mean_for_chromosome(chr): genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values tmp_x = expr[:, var.index.get_indexer(genes)] - return _running_mean(tmp_x, n=window_size, step=step) + x_conv = _running_mean(tmp_x, n=window_size, step=step) + convolved_gene_names = _gene_list_convolve(genes, window_size=window_size - 1, step=step, mode="same") + assert len(convolved_gene_names) == x_conv.shape[1], f"{len(convolved_gene_names)} vs {x_conv.shape[1]}" + # DataFrame containing all the genes that go into a specific position + convolved_df = pd.DataFrame({"genes": convolved_gene_names, "chromosome": chr}) + + return x_conv, convolved_df running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes] + running_means, convolved_dfs = zip(*running_means) - chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))} + convolved_dfs = pd.concat(convolved_dfs) # since its a list of dfs before + convolved_dfs.index.name = "relative_position" + convolved_dfs.reset_index(inplace=True) - return chr_start_pos, np.hstack(running_means) + chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))} + return chr_start_pos, np.hstack(running_means), convolved_dfs def _get_reference( @@ -289,7 +312,7 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t # Step 2 - clip log fold changes x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap) # Step 3 - smooth by genomic position - chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step) + chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step) # Step 4 - center by cell x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis] @@ -302,4 +325,25 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t x_res = scipy.sparse.csr_matrix(x_res) - return chr_pos, x_res + return chr_pos, x_res, conv_df + + +def _gene_list_convolve(gene_list, window_size, step, mode): + """ + emulate what happens with the convolution on th expression, just pretending to convolve the gene_list + i.e. we group together the genes that get convolved at each position + """ + gene_dict = {} + + len_threshold = ( + 0 if mode == "same" else window_size - 1 + ) # towards the end, the gene list will get shorter due to lack of overlap + # convolving with "same", the gene list will gradually get shorter until 0. for mode==valid, the last convole will still have len==windowlength + + for i in range(len(gene_list)): + start = i * step + stop = start + window_size + x = gene_list[start:stop] + if len(x) > len_threshold: + gene_dict[i] = x + return pd.Series(gene_dict)