diff --git a/.gitignore b/.gitignore index fe2ff70e..b61d611b 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py new file mode 100644 index 00000000..d34eba2d --- /dev/null +++ b/examples/plot_real_data_pydeseq2_example.py @@ -0,0 +1,114 @@ +""" +Pydeseq2 with real data +====================================================== + +This experiment aims to run pydeseq2 on real data. + +""" + +import matplotlib.pyplot as plt +import numpy as np + +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats +from pydeseq2.utils import load_example_data + +# %% +# Data loading +# ------------ +# +# Let's first download and load the real data. + +counts_ori = load_example_data( + modality="raw_counts", + dataset="real", + debug=False, +) + +metadata = load_example_data( + modality="metadata", + dataset="real", + debug=False, +) + +print(counts_ori.tail()) + +# %% +# In this example, the counts data contains both EnsemblIDs +# and gene symbols (gene names). +# We also see that some EnsemblID do not map to any gene symbol. +# We decide to stay with EnsemblID rather than gene symbol and +# continue with some processing step. + +# %% +# Data filtering read counts +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +counts_df = counts_ori.drop(columns="Gene Name") +# remove the gene name for now +counts_df = counts_df.set_index("Gene ID").T +# now we have a matrice of shape n_samples x n_genes + +# %% +# Further processing might be needed because some genes +# have very high counts on average. + +# %% +print(metadata) + +# %% +# Data filtering metadata +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename( + columns={"Sample Characteristic[biopsy site]": "condition"} +) + +# %% +# Now that we have loaded and filtered our data, we may proceed with the differential +# analysis. + + +# %% +# Single factor analysis +# -------------------------- +# +# In this analysis, we use the ``condition`` +# column as our design factor. That is, we compare gene expressions of samples that have +# ``primary tumor`` to those that have ``normal`` condition. +# + +dds = DeseqDataSet( + counts=counts_df, + metadata=metadata, + design_factors="condition", + refit_cooks=True, + n_cpus=8, +) + +dds.deseq2() + +# %% +# Generate summary statistics +# -------------------------- +# We are here interested in comparing "primary tumor vs "normal" samples + +stat_res_tumor_vs_normal = DeseqStats( + dds, contrast=["condition", "primary tumor", "normal"], n_cpus=8 +) + +# As we did not really performed a thorough exploration of the data +# and gene counts, we would like to test for a lfc of 1 at least + +stat_res_tumor_vs_normal.summary(lfc_null=1, alt_hypothesis="greaterAbs") + +# Some visualization + +stat_res_tumor_vs_normal.plot_MA(s=20) + +# Volcano plot + +plt.scatter( + stat_res_tumor_vs_normal.results_df["log2FoldChange"], + -np.log(stat_res_tumor_vs_normal.results_df["padj"]), +) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 5b18559e..5280f9ce 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -27,15 +27,16 @@ def load_example_data( modality: Literal["raw_counts", "metadata"] = "raw_counts", - dataset: Literal["synthetic"] = "synthetic", + dataset: Literal["synthetic", "real"] = "synthetic", + url_to_data: Union[str, None] = None, debug: bool = False, debug_seed: int = 42, ) -> pd.DataFrame: - """Load synthetic example data. + """Load example data. - May load either metadata or rna-seq data. For now, this function may only return the - synthetic data provided as part of this repo, but new datasets might be added in the - future. + May load either metadata or rna-seq data. This function may either return the + synthetic data provided as part of this repo, download an example dataset + from BioStudies, or use a provided direct link to an processed read count dataset. Parameters ---------- @@ -45,8 +46,15 @@ def load_example_data( dataset : str The dataset for which to return gene expression data. If "synthetic", will return the synthetic data that is used for CI unit tests. + If "real", will download and return a real dataset from a given url. (default: ``"synthetic"``). + url_to_data : str + The url for the real dataset. + If not given, it will be set to the url of a public gene expression profiling + study by RNA-seq in colorectal cancer, depending on the desired modality. + (default: None). + debug : bool If true, subsample 10 samples and 100 genes at random. (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``). @@ -60,13 +68,15 @@ def load_example_data( Requested data modality. """ - assert modality in ["raw_counts", "metadata"], ( - "The modality argument must be one of the following: " "raw_counts, metadata" - ) + assert modality in [ + "raw_counts", + "metadata", + ], "The modality argument must be one of the following: raw_counts, metadata" assert dataset in [ - "synthetic" - ], "The dataset argument must be one of the following: synthetic." + "synthetic", + "real", + ], "The dataset argument must be one of the following: synthetic, real" # Load data datasets_path = Path(pydeseq2.__file__).parent.parent / "datasets" @@ -100,10 +110,25 @@ def load_example_data( index_col=0, ) + elif dataset == "real": + default_urls = { + "raw_counts": ( + "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/" + "tutorial/E-GEOD-50760-raw-counts.tsv" + ), + "metadata": ( + "https://www.ebi.ac.uk/gxa/experiments-content/E-GEOD-50760/" + "resources/ExperimentDesignFile.RnaSeq/experiment-design" + ), + } + if not url_to_data: + url_to_data = default_urls[modality] + df = pd.read_csv(url_to_data, sep="\t") + if debug: # TODO: until we provide a larger dataset, this option is useless # subsample 10 samples and 100 genes - df = df.sample(n=10, axis=0, random_state=debug_seed) + df = df.sample(n=10, axis=1, random_state=debug_seed) if modality == "raw_counts": df = df.sample(n=100, axis="index", random_state=debug_seed)