clear all outputs

sanjaynagi · Aug 22, 2023 · 6ea46f5 · 6ea46f5
1 parent 8179124
commit 6ea46f5
Showing 1 changed file with 327 additions and 0 deletions.
diff --git a/workflow/notebooks/differential-expression-meta-analysis b/workflow/notebooks/differential-expression-meta-analysis
@@ -0,0 +1,327 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyNvheDHpizoW314Fm/cuodU",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "ir",
+      "display_name": "R"
+    },
+    "language_info": {
+      "name": "R"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/sanjaynagi/AnoExpress/blob/main/workflow/notebooks/differential-expression-meta-analysis\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "if (!require(\"BiocManager\", quietly = TRUE))\n",
+        "    install.packages(\"BiocManager\")\n",
+        "\n",
+        "BiocManager::install(\"DESeq2\")\n",
+        "\n",
+        "install.packages(c(\"pheatmap\", \"data.table\", \"ggrepel\", \"openxlsx\", \"tidyverse\", \"plotly\", \"RColorBrewer\"))"
+      ],
+      "metadata": {
+        "id": "Y2oEs5J6XPJA"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "library(DESeq2)\n",
+        "library(pheatmap)\n",
+        "library(data.table)\n",
+        "library(ggrepel)\n",
+        "library(openxlsx)\n",
+        "library(glue)\n",
+        "library(RColorBrewer)\n",
+        "library(tidyverse)\n",
+        "library(plotly)"
+      ],
+      "metadata": {
+        "id": "NlVr0JHcw7Cz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## AnoExpress - Differential expression meta-analysis with DESeq2\n",
+        "In this notebook, we perform the differential expression analysis for AnoExpress."
+      ],
+      "metadata": {
+        "id": "pr9o9vgsa-pm"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dir.create(\"results/genediff\", recursive=TRUE)"
+      ],
+      "metadata": {
+        "id": "6ppADEcHkndP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "round_df = function(df, digits) {\n",
+        "  #' This function rounds all the numeric columns of a data.frame\n",
+        "  nums = vapply(df, is.numeric, FUN.VALUE = logical(1))\n",
+        "  df[,nums] = round(df[,nums], digits = digits)\n",
+        "  (df)\n",
+        "}\n",
+        "\n",
+        "diff_exp = function(analysis, names_df){\n",
+        "  # this function runs the diff exp analysis for one AnoExpress analysis\n",
+        "\n",
+        "  results_list = list()\n",
+        "  nsig_list = list()\n",
+        "\n",
+        "  # load metadata\n",
+        "  metadata = fread(\"https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/config/sample_metadata.tsv\") %>% as.data.frame()\n",
+        "  metadata[(metadata$batch == 1) &(metadata$species == 'arabiensis'), 'batch'] = 0\n",
+        "  metadata$batch = factor(metadata$batch)\n",
+        "\n",
+        "  # load analysis\n",
+        "  counts = fread(f(\"https://github.com/sanjaynagi/AnoExpress/raw/main/results/log2counts.{analysis}.tsv\"), sep=\"\\t\") %>%\n",
+        "    as.data.frame() %>%\n",
+        "    column_to_rownames(\"GeneID\") %>%\n",
+        "    2^. %>%\n",
+        "    mutate_if(is.numeric, as.integer)\n",
+        "\n",
+        "  # get boolean indexer for species depending on analysis\n",
+        "  if (analysis == 'gamb_colu'){\n",
+        "    sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\")\n",
+        "  } else if (analysis == 'gamb_colu_arab'){\n",
+        "    sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\")\n",
+        "  } else if (analysis == 'gamb_colu_arab_fun'){\n",
+        "    sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\", \"funestus\")\n",
+        "  } else if (analysis == 'fun'){\n",
+        "    sp_bool = metadata$species == \"funestus\"\n",
+        "  }\n",
+        "\n",
+        "  print(analysis)\n",
+        "  # subset to analysis\n",
+        "  meta = metadata[sp_bool, ]\n",
+        "  print(dim(meta))\n",
+        "  print(dim(counts))\n",
+        "\n",
+        "  # analyse each experiment separately\n",
+        "  for (experiment in unique(meta$batch)){\n",
+        "    if (experiment == 5){\n",
+        "      next\n",
+        "    }\n",
+        "\n",
+        "    # quit if shape of arrays incorrect\n",
+        "    stopifnot(nrow(meta) == length(counts))\n",
+        "\n",
+        "    # subset to batch\n",
+        "    meta2 = meta %>% filter(batch == experiment)\n",
+        "    counts2 = counts[, meta2$sampleID]\n",
+        "\n",
+        "    # quit if order not correct\n",
+        "    stopifnot(all(meta2$sampleID == colnames(counts2)))\n",
+        "\n",
+        "    # get res and sus for each comparison\n",
+        "    resistants = unique(meta2[meta2$resistance == 'resistant',]$condition)\n",
+        "    susceptibles = unique(meta2[meta2$resistance == 'susceptible',]$condition)\n",
+        "    comparisons = crossing(resistants, susceptibles)\n",
+        "    print(experiment)\n",
+        "    print(as.data.frame(comparisons))\n",
+        "\n",
+        "    for (i in 1:nrow(comparisons)){\n",
+        "      res = comparisons[i, 'resistants']\n",
+        "      sus = comparisons[i, 'susceptibles']\n",
+        "      comp = glue(\"{res}_v_{sus}\")\n",
+        "      print(comp)\n",
+        "\n",
+        "      controls = which(meta2$condition %in% sus)\n",
+        "      cases = which(meta2$condition %in% res)\n",
+        "\n",
+        "      idxs = c(controls, cases)\n",
+        "      subcounts = counts2[, idxs]\n",
+        "      subsamples = meta2[idxs,]\n",
+        "\n",
+        "      # make treatment a factor with the 'susceptible' as reference\n",
+        "      subsamples$treatment = as.factor(subsamples$resistance)\n",
+        "      subsamples$treatment = relevel(subsamples$treatment, \"susceptible\")\n",
+        "\n",
+        "      # make DESeq analysis\n",
+        "      print(\"subcounts shape\")\n",
+        "      print(dim(subcounts))\n",
+        "      print(head(subcounts))\n",
+        "      print(\"subsamples shape\")\n",
+        "      print(dim(subsamples))\n",
+        "      print(head(subsamples))\n",
+        "      dds = DESeqDataSetFromMatrix(countData = subcounts,\n",
+        "                                   colData = subsamples,\n",
+        "                                   design = ~ treatment)\n",
+        "\n",
+        "      ###### estimate paramters and normalise\n",
+        "      dds = estimateSizeFactors(dds)\n",
+        "      dds = estimateDispersions(dds)\n",
+        "      dds = estimateDispersions(dds)\n",
+        "      cds = nbinomWaldTest(dds)\n",
+        "      results = results(cds, contrast = c(\"treatment\", \"susceptible\", \"resistant\")) %>% as.data.frame()\n",
+        "      results = results[order(results$padj),] #order by pvalue\n",
+        "      results = results %>% mutate(log2FoldChange=log2FoldChange*-1)\n",
+        "      results = results %>% rownames_to_column(\"GeneID\") %>% dplyr::mutate(\"FC\" = (2^log2FoldChange))\n",
+        "\n",
+        "      ### absolute difference\n",
+        "      #### Get rowsums of counts, grouping by case/control. Then get difference of counts and join with DE results\n",
+        "      readdiff = data.frame(t(rowsum(t(subcounts), group = subsamples$treatment, na.rm = T))) #transpose and get rowsums for each group\n",
+        "      readdiff$absolute_diff = readdiff[,\"resistant\"] - readdiff[,\"susceptible\"] #get difference\n",
+        "      readdiff = data.frame(readdiff) %>% rownames_to_column('GeneID')\n",
+        "      results = unique(left_join(results, readdiff[,c('GeneID','absolute_diff')]))\n",
+        "\n",
+        "      # join DE results with normal gene names\n",
+        "      results = unique(left_join(results, names_df))\n",
+        "      results_list[[comp]] = results\n",
+        "\n",
+        "      fwrite(results, glue(\"results/genediff/{comp}_diffexp.csv\")) #write to csv\n",
+        "      #get number of sig genes\n",
+        "      res1 = results %>% filter(padj < 0.05) %>%\n",
+        "        count(\"direction\" = FC > 1) %>%\n",
+        "        dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.05\",\n",
+        "                                              direction == TRUE ~ \"Upregulated, padj = 0.05\")) %>%\n",
+        "        dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n",
+        "\n",
+        "      res2 = results %>% filter(padj < 0.001) %>%\n",
+        "        count(\"direction\" = FC > 1) %>%\n",
+        "        dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.001\",\n",
+        "                                              direction == TRUE ~ \"Upregulated, padj = 0.001\")) %>%\n",
+        "        dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n",
+        "\n",
+        "      nsig_list[[comp]] = bind_rows(res1, res2)\n",
+        "      cat(\"\\n\", glue(\"{comp} complete!\"), \"\\n\")\n",
+        "      }\n",
+        "  }\n",
+        "  #### write to excel file on diff sheets ####\n",
+        "  sheets = names(results_list)\n",
+        "  wb <- createWorkbook(\"Workbook\")\n",
+        "  for (i in 1:length(sheets)){\n",
+        "    addWorksheet(wb, glue(\"{sheets[[i]]}\"))\n",
+        "    writeData(wb, sheets[i], results_list[[i]], rowNames = FALSE, colNames = TRUE)\n",
+        "  }\n",
+        "  #### save workbook to disk once all worksheets and data have been added ####\n",
+        "  saveWorkbook(wb,file=f(\"results/genediff/{analysis}_genediff.xlsx\"), overwrite = TRUE)\n",
+        "  # Join different comparisons together and write out number of sig genes\n",
+        "  purrr::reduce(nsig_list, inner_join) %>% fwrite(f(\"results/genediff/{analysis}_nsig_genes.tsv\"), sep=\"\\t\", col.names = TRUE)\n",
+        "\n",
+        "  fc_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n",
+        "  pval_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n",
+        "  for (i in 1:length(results_list)){\n",
+        "    name = sheets[i]\n",
+        "    name_var = glue(\"{name}_log2FoldChange\")\n",
+        "    name_pval = glue(\"{name}_padj\")\n",
+        "    df = results_list[[i]] %>%\n",
+        "      select(c(\"GeneID\", \"log2FoldChange\")) %>%\n",
+        "      rename({{ name_var }} := log2FoldChange)\n",
+        "\n",
+        "    pval_df = results_list[[i]] %>%\n",
+        "      select(c(\"GeneID\", \"padj\")) %>%\n",
+        "      rename({{ name_var }} := padj)\n",
+        "\n",
+        "    fc_data = fc_data %>% inner_join(df) %>% distinct()\n",
+        "    pval_data = pval_data %>% inner_join(pval_df) %>% distinct()\n",
+        "  }\n",
+        "\n",
+        "  fc_data = fc_data %>% inner_join(names_df)\n",
+        "  pval_data = pval_data %>% inner_join(names_df)\n",
+        "\n",
+        "  pval_data %>%\n",
+        "    select(-TranscriptID) %>%\n",
+        "    round_df(3) %>%\n",
+        "    distinct() %>%\n",
+        "    fwrite(., file=f(\"results/pvals.{analysis}.tsv\"), sep=\"\\t\")\n",
+        "\n",
+        "  fc_data %>%\n",
+        "    select(-TranscriptID) %>%\n",
+        "    round_df(2) %>%\n",
+        "    distinct() %>%\n",
+        "    fwrite(., file=f(\"results/fcs.{analysis}.tsv\"), sep=\"\\t\")\n",
+        "\n",
+        "  return(list(results_list, nsig_list))\n",
+        "}"
+      ],
+      "metadata": {
+        "id": "QKRvivSKbViu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "#### **Run analyses**"
+      ],
+      "metadata": {
+        "id": "K8rXoikKj1rw"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "bphla7K5WCCX"
+      },
+      "outputs": [],
+      "source": [
+        "f = glue\n",
+        "ag_pest_analyses = c(\"gamb_colu\", \"gamb_colu_arab\", \"gamb_colu_arab_fun\")\n",
+        "\n",
+        "AGAMnames_df = fread(\"https://github.com/sanjaynagi/rna-seq-pop/raw/master/resources/exampleGene2TranscriptMap.tsv\", sep=\"\\t\") %>% distinct()\n",
+        "AFUNnames_df = fread(\"https://github.com/sanjaynagi/AnoExpress/raw/main/resources/AfunGenes2TranscriptMap.tsv\", sep=\"\\t\") %>% distinct()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# gambiae pest analyses\n",
+        "for (analysis in ag_pest_analyses){\n",
+        "  res_list = diff_exp(analysis, names_df = AGAMnames_df)\n",
+        "}\n",
+        "\n",
+        "# funestus only\n",
+        "res = diff_exp(analysis = \"fun\", names_df = AFUNnames_df)"
+      ],
+      "metadata": {
+        "id": "aGlzmQ1ebwiP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "sessionInfo()"
+      ],
+      "metadata": {
+        "id": "PiGr0E5RgA0R"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}