Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding annotation to the convolution windows #58

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 58 additions & 14 deletions src/infercnvpy/tl/_infercnv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Sequence, Tuple, Union

import numpy as np
import pandas as pd
import scipy.ndimage
import scipy.sparse
from anndata import AnnData
Expand All @@ -30,7 +31,7 @@ def infercnv(
inplace: bool = True,
layer: Union[str, None] = None,
key_added: str = "cnv",
) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix]]:
) -> Union[None, Tuple[dict, scipy.sparse.csr_matrix, pd.DataFrame]]:
"""
Infer Copy Number Variation (CNV) by averaging gene expression over genomic regions.

Expand Down Expand Up @@ -111,7 +112,7 @@ def infercnv(

var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]] # type: ignore

chr_pos, chunks = zip(
chr_pos, chunks, convolved_dfs = zip(
*process_map(
_infercnv_chunk,
[expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
Expand All @@ -126,13 +127,21 @@ def infercnv(
)
)
res = scipy.sparse.vstack(chunks)
convolved_dfs = convolved_dfs[0] # since each chunk returns the same df
chr_pos = chr_pos[0]

# annotate the genomic range of each window
start_dict = var["start"].to_dict()
stop_dict = var["end"].to_dict()
convolved_dfs["start"] = convolved_dfs["genes"].apply(lambda x: start_dict[x[0]]) # start of the first gene
convolved_dfs["end"] = convolved_dfs["genes"].apply(lambda x: stop_dict[x[-1]]) # stop of the last gene

if inplace:
adata.obsm[f"X_{key_added}"] = res
adata.uns[key_added] = {"chr_pos": chr_pos[0]}
adata.uns[key_added] = {"chr_pos": chr_pos, "df": convolved_dfs}

else:
return chr_pos[0], res
return chr_pos, res, convolved_dfs


def _natural_sort(l: Sequence):
Expand Down Expand Up @@ -166,11 +175,15 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
"""
if n < x.shape[1]: # regular convolution: the filter is smaller than the #genes
r = np.arange(1, n + 1)

pyramid = np.minimum(r, r[::-1])
smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
pyramid
)
smoothed_x = np.apply_along_axis(
lambda row: np.convolve(
row,
pyramid,
),
axis=1,
arr=x,
grst marked this conversation as resolved.
Show resolved Hide resolved
) / np.sum(pyramid)
return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]

else: # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
Expand All @@ -181,7 +194,7 @@ def _running_mean(x: Union[np.ndarray, scipy.sparse.spmatrix], n: int = 50, step
return smoothed_x


def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray]:
def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.ndarray, pd.DataFrame]:
"""Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.

Parameters
Expand All @@ -207,13 +220,23 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> Tuple[dict, np.
def _running_mean_for_chromosome(chr):
genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
tmp_x = expr[:, var.index.get_indexer(genes)]
return _running_mean(tmp_x, n=window_size, step=step)
x_conv = _running_mean(tmp_x, n=window_size, step=step)
convolved_gene_names = _gene_list_convolve(genes, window_size=window_size - 1, step=step, mode="same")
assert len(convolved_gene_names) == x_conv.shape[1], f"{len(convolved_gene_names)} vs {x_conv.shape[1]}"
# DataFrame containing all the genes that go into a specific position
convolved_df = pd.DataFrame({"genes": convolved_gene_names, "chromosome": chr})

return x_conv, convolved_df

running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]
running_means, convolved_dfs = zip(*running_means)

chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))}
convolved_dfs = pd.concat(convolved_dfs) # since its a list of dfs before
convolved_dfs.index.name = "relative_position"
convolved_dfs.reset_index(inplace=True)

return chr_start_pos, np.hstack(running_means)
chr_start_pos = {chr: i for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]))}
return chr_start_pos, np.hstack(running_means), convolved_dfs


def _get_reference(
Expand Down Expand Up @@ -289,7 +312,7 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
# Step 2 - clip log fold changes
x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
# Step 3 - smooth by genomic position
chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
# Step 4 - center by cell
x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]

Expand All @@ -302,4 +325,25 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t

x_res = scipy.sparse.csr_matrix(x_res)

return chr_pos, x_res
return chr_pos, x_res, conv_df


def _gene_list_convolve(gene_list, window_size, step, mode):
"""
emulate what happens with the convolution on th expression, just pretending to convolve the gene_list
i.e. we group together the genes that get convolved at each position
"""
gene_dict = {}

len_threshold = (
0 if mode == "same" else window_size - 1
) # towards the end, the gene list will get shorter due to lack of overlap
# convolving with "same", the gene list will gradually get shorter until 0. for mode==valid, the last convole will still have len==windowlength

for i in range(len(gene_list)):
start = i * step
stop = start + window_size
x = gene_list[start:stop]
if len(x) > len_threshold:
gene_dict[i] = x
return pd.Series(gene_dict)