owkin · jbschiratti · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
@@ -130,3 +130,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# PyCharm
+.idea
@@ -0,0 +1,114 @@
+"""
+Pydeseq2 with real data
+======================================================
+
+This experiment aims to run pydeseq2 on real data.
+
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from pydeseq2.dds import DeseqDataSet
+from pydeseq2.ds import DeseqStats
+from pydeseq2.utils import load_example_data
+
+# %%
+# Data loading
+# ------------
+#
+# Let's first download and load the real data.
+
+counts_ori = load_example_data(
+    modality="raw_counts",
+    dataset="real",
+    debug=False,
+)
+
+metadata = load_example_data(
+    modality="metadata",
+    dataset="real",
+    debug=False,
+)
+
+print(counts_ori.tail())
+
+# %%
+# In this example, the counts data contains both EnsemblIDs
+# and gene symbols (gene names).
+# We also see that some EnsemblID do not map to any gene symbol.
+# We decide to stay with EnsemblID rather than gene symbol and
+# continue with some processing step.
+
+# %%
+# Data filtering read counts
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+counts_df = counts_ori.drop(columns="Gene Name")
+# remove the gene name for now
+counts_df = counts_df.set_index("Gene ID").T
+# now we have a matrice of shape n_samples x n_genes
+
+# %%
+# Further processing might be needed because some genes
+# have very high counts on average.
+
+# %%
+print(metadata)
+
+# %%
+# Data filtering metadata
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename(
+    columns={"Sample Characteristic[biopsy site]": "condition"}
+)
+
+# %%
+# Now that we have loaded and filtered our data, we may proceed with the differential
+# analysis.
+
+
+# %%
+# Single factor analysis
+# --------------------------
+#
+# In this analysis, we use the ``condition``
+# column as our design factor. That is, we compare gene expressions of samples that have
+# ``primary tumor`` to those that have ``normal`` condition.
+#
+
+dds = DeseqDataSet(
+    counts=counts_df,
+    metadata=metadata,
+    design_factors="condition",
+    refit_cooks=True,
+    n_cpus=8,
+)
+
+dds.deseq2()
+
+# %%
+# Generate summary statistics
+# --------------------------
+# We are here interested in comparing "primary tumor vs "normal" samples
+
+stat_res_tumor_vs_normal = DeseqStats(
+    dds, contrast=["condition", "primary tumor", "normal"], n_cpus=8
+)
+
+# As we did not really performed a thorough exploration of the data
+# and gene counts, we would like to test for a lfc of 1 at least
+
+stat_res_tumor_vs_normal.summary(lfc_null=1, alt_hypothesis="greaterAbs")
+
+# Some visualization
+
+stat_res_tumor_vs_normal.plot_MA(s=20)
+
+# Volcano plot
+
+plt.scatter(
+    stat_res_tumor_vs_normal.results_df["log2FoldChange"],
+    -np.log(stat_res_tumor_vs_normal.results_df["padj"]),
+)
@@ -27,15 +27,16 @@
 
 def load_example_data(
     modality: Literal["raw_counts", "metadata"] = "raw_counts",
-    dataset: Literal["synthetic"] = "synthetic",
+    dataset: Literal["synthetic", "real"] = "synthetic",
+    url_to_data: Union[str, None] = None,
     debug: bool = False,
     debug_seed: int = 42,
 ) -> pd.DataFrame:
-    """Load synthetic example data.
+    """Load example data.
 
-    May load either metadata or rna-seq data. For now, this function may only return the
-    synthetic data provided as part of this repo, but new datasets might be added in the
-    future.
+    May load either metadata or rna-seq data. This function may either return the
+    synthetic data provided as part of this repo, download an example dataset
+    from BioStudies, or use a provided direct link to an processed read count dataset.
 
     Parameters
     ----------
@@ -45,8 +46,15 @@ def load_example_data(
     dataset : str
         The dataset for which to return gene expression data.
         If "synthetic", will return the synthetic data that is used for CI unit tests.
+        If "real", will download and return a real dataset from a given url.
         (default: ``"synthetic"``).
 
+    url_to_data : str
+        The url for the real dataset.
+        If not given, it will be set to the url of a public gene expression profiling
+        study by RNA-seq in colorectal cancer, depending on the desired modality.
+        (default: None).
+
     debug : bool
         If true, subsample 10 samples and 100 genes at random.
         (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``).
@@ -60,13 +68,15 @@ def load_example_data(
         Requested data modality.
     """
 
-    assert modality in ["raw_counts", "metadata"], (
-        "The modality argument must be one of the following: " "raw_counts, metadata"
-    )
+    assert modality in [
+        "raw_counts",
+        "metadata",
+    ], "The modality argument must be one of the following: raw_counts, metadata"
 
     assert dataset in [
-        "synthetic"
-    ], "The dataset argument must be one of the following: synthetic."
+        "synthetic",
+        "real",
+    ], "The dataset argument must be one of the following: synthetic, real"
 
     # Load data
     datasets_path = Path(pydeseq2.__file__).parent.parent / "datasets"
@@ -100,10 +110,25 @@ def load_example_data(
                 index_col=0,
             )
 
+    elif dataset == "real":
+        default_urls = {
+            "raw_counts": (
+                "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/"
+                "tutorial/E-GEOD-50760-raw-counts.tsv"
+            ),
+            "metadata": (
+                "https://www.ebi.ac.uk/gxa/experiments-content/E-GEOD-50760/"
+                "resources/ExperimentDesignFile.RnaSeq/experiment-design"
+            ),
+        }
+        if not url_to_data:
+            url_to_data = default_urls[modality]
+        df = pd.read_csv(url_to_data, sep="\t")
+
     if debug:
         # TODO: until we provide a larger dataset, this option is useless
         # subsample 10 samples and 100 genes
-        df = df.sample(n=10, axis=0, random_state=debug_seed)
+        df = df.sample(n=10, axis=1, random_state=debug_seed)
         if modality == "raw_counts":
             df = df.sample(n=100, axis="index", random_state=debug_seed)
-Original file line number
+Diff line change
@@ Expand Up / @@ -130,3 +130,6 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    # PyCharm
+    .idea