Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding an example with real data #180

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,6 @@ dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea
114 changes: 114 additions & 0 deletions examples/plot_real_data_pydeseq2_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""
Pydeseq2 with real data
======================================================

This experiment aims to run pydeseq2 on real data.

"""

import matplotlib.pyplot as plt
import numpy as np

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

# %%
# Data loading
# ------------
#
# Let's first download and load the real data.

counts_ori = load_example_data(
modality="raw_counts",
dataset="real",
debug=False,
)

metadata = load_example_data(
modality="metadata",
dataset="real",
debug=False,
)

print(counts_ori.tail())

# %%
# In this example, the counts data contains both EnsemblIDs
# and gene symbols (gene names).
# We also see that some EnsemblID do not map to any gene symbol.
# We decide to stay with EnsemblID rather than gene symbol and
# continue with some processing step.

# %%
# Data filtering read counts
# ^^^^^^^^^^^^^^^^^^^^^^^^^^

counts_df = counts_ori.drop(columns="Gene Name")
# remove the gene name for now
counts_df = counts_df.set_index("Gene ID").T
# now we have a matrice of shape n_samples x n_genes

# %%
# Further processing might be needed because some genes
# have very high counts on average.

# %%
print(metadata)

# %%
# Data filtering metadata
# ^^^^^^^^^^^^^^^^^^^^^^^^^^

metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename(
columns={"Sample Characteristic[biopsy site]": "condition"}
)

# %%
# Now that we have loaded and filtered our data, we may proceed with the differential
# analysis.


# %%
# Single factor analysis
# --------------------------
#
# In this analysis, we use the ``condition``
# column as our design factor. That is, we compare gene expressions of samples that have
# ``primary tumor`` to those that have ``normal`` condition.
#

dds = DeseqDataSet(
counts=counts_df,
metadata=metadata,
design_factors="condition",
refit_cooks=True,
n_cpus=8,
)

dds.deseq2()

# %%
# Generate summary statistics
# --------------------------
# We are here interested in comparing "primary tumor vs "normal" samples

stat_res_tumor_vs_normal = DeseqStats(
dds, contrast=["condition", "primary tumor", "normal"], n_cpus=8
)

# As we did not really performed a thorough exploration of the data
# and gene counts, we would like to test for a lfc of 1 at least

stat_res_tumor_vs_normal.summary(lfc_null=1, alt_hypothesis="greaterAbs")

# Some visualization

stat_res_tumor_vs_normal.plot_MA(s=20)

# Volcano plot

plt.scatter(
stat_res_tumor_vs_normal.results_df["log2FoldChange"],
-np.log(stat_res_tumor_vs_normal.results_df["padj"]),
)
47 changes: 36 additions & 11 deletions pydeseq2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@

def load_example_data(
modality: Literal["raw_counts", "metadata"] = "raw_counts",
dataset: Literal["synthetic"] = "synthetic",
dataset: Literal["synthetic", "real"] = "synthetic",
url_to_data: Union[str, None] = None,
debug: bool = False,
debug_seed: int = 42,
) -> pd.DataFrame:
"""Load synthetic example data.
"""Load example data.

May load either metadata or rna-seq data. For now, this function may only return the
synthetic data provided as part of this repo, but new datasets might be added in the
future.
May load either metadata or rna-seq data. This function may either return the
synthetic data provided as part of this repo, download an example dataset
from BioStudies, or use a provided direct link to an processed read count dataset.

Parameters
----------
Expand All @@ -45,8 +46,15 @@ def load_example_data(
dataset : str
The dataset for which to return gene expression data.
If "synthetic", will return the synthetic data that is used for CI unit tests.
If "real", will download and return a real dataset from a given url.
(default: ``"synthetic"``).

url_to_data : str
The url for the real dataset.
If not given, it will be set to the url of a public gene expression profiling
study by RNA-seq in colorectal cancer, depending on the desired modality.
(default: None).

debug : bool
If true, subsample 10 samples and 100 genes at random.
(Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``).
Expand All @@ -60,13 +68,15 @@ def load_example_data(
Requested data modality.
"""

assert modality in ["raw_counts", "metadata"], (
"The modality argument must be one of the following: " "raw_counts, metadata"
)
assert modality in [
"raw_counts",
"metadata",
], "The modality argument must be one of the following: raw_counts, metadata"

assert dataset in [
"synthetic"
], "The dataset argument must be one of the following: synthetic."
"synthetic",
"real",
], "The dataset argument must be one of the following: synthetic, real"

# Load data
datasets_path = Path(pydeseq2.__file__).parent.parent / "datasets"
Expand Down Expand Up @@ -100,10 +110,25 @@ def load_example_data(
index_col=0,
)

elif dataset == "real":
default_urls = {
"raw_counts": (
"http://genomedata.org/gen-viz-workshop/intro_to_deseq2/"
"tutorial/E-GEOD-50760-raw-counts.tsv"
),
"metadata": (
"https://www.ebi.ac.uk/gxa/experiments-content/E-GEOD-50760/"
"resources/ExperimentDesignFile.RnaSeq/experiment-design"
),
}
if not url_to_data:
url_to_data = default_urls[modality]
df = pd.read_csv(url_to_data, sep="\t")

if debug:
# TODO: until we provide a larger dataset, this option is useless
# subsample 10 samples and 100 genes
df = df.sample(n=10, axis=0, random_state=debug_seed)
df = df.sample(n=10, axis=1, random_state=debug_seed)
if modality == "raw_counts":
df = df.sample(n=100, axis="index", random_state=debug_seed)

Expand Down
Loading