-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8179124
commit 6ea46f5
Showing
1 changed file
with
327 additions
and
0 deletions.
There are no files selected for viewing
327 changes: 327 additions & 0 deletions
327
workflow/notebooks/differential-expression-meta-analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,327 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"provenance": [], | ||
"authorship_tag": "ABX9TyNvheDHpizoW314Fm/cuodU", | ||
"include_colab_link": true | ||
}, | ||
"kernelspec": { | ||
"name": "ir", | ||
"display_name": "R" | ||
}, | ||
"language_info": { | ||
"name": "R" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/sanjaynagi/AnoExpress/blob/main/workflow/notebooks/differential-expression-meta-analysis\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"if (!require(\"BiocManager\", quietly = TRUE))\n", | ||
" install.packages(\"BiocManager\")\n", | ||
"\n", | ||
"BiocManager::install(\"DESeq2\")\n", | ||
"\n", | ||
"install.packages(c(\"pheatmap\", \"data.table\", \"ggrepel\", \"openxlsx\", \"tidyverse\", \"plotly\", \"RColorBrewer\"))" | ||
], | ||
"metadata": { | ||
"id": "Y2oEs5J6XPJA" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"library(DESeq2)\n", | ||
"library(pheatmap)\n", | ||
"library(data.table)\n", | ||
"library(ggrepel)\n", | ||
"library(openxlsx)\n", | ||
"library(glue)\n", | ||
"library(RColorBrewer)\n", | ||
"library(tidyverse)\n", | ||
"library(plotly)" | ||
], | ||
"metadata": { | ||
"id": "NlVr0JHcw7Cz" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"## AnoExpress - Differential expression meta-analysis with DESeq2\n", | ||
"In this notebook, we perform the differential expression analysis for AnoExpress." | ||
], | ||
"metadata": { | ||
"id": "pr9o9vgsa-pm" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"dir.create(\"results/genediff\", recursive=TRUE)" | ||
], | ||
"metadata": { | ||
"id": "6ppADEcHkndP" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"round_df = function(df, digits) {\n", | ||
" #' This function rounds all the numeric columns of a data.frame\n", | ||
" nums = vapply(df, is.numeric, FUN.VALUE = logical(1))\n", | ||
" df[,nums] = round(df[,nums], digits = digits)\n", | ||
" (df)\n", | ||
"}\n", | ||
"\n", | ||
"diff_exp = function(analysis, names_df){\n", | ||
" # this function runs the diff exp analysis for one AnoExpress analysis\n", | ||
"\n", | ||
" results_list = list()\n", | ||
" nsig_list = list()\n", | ||
"\n", | ||
" # load metadata\n", | ||
" metadata = fread(\"https://raw.githubusercontent.com/sanjaynagi/AnoExpress/main/config/sample_metadata.tsv\") %>% as.data.frame()\n", | ||
" metadata[(metadata$batch == 1) &(metadata$species == 'arabiensis'), 'batch'] = 0\n", | ||
" metadata$batch = factor(metadata$batch)\n", | ||
"\n", | ||
" # load analysis\n", | ||
" counts = fread(f(\"https://github.com/sanjaynagi/AnoExpress/raw/main/results/log2counts.{analysis}.tsv\"), sep=\"\\t\") %>%\n", | ||
" as.data.frame() %>%\n", | ||
" column_to_rownames(\"GeneID\") %>%\n", | ||
" 2^. %>%\n", | ||
" mutate_if(is.numeric, as.integer)\n", | ||
"\n", | ||
" # get boolean indexer for species depending on analysis\n", | ||
" if (analysis == 'gamb_colu'){\n", | ||
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\")\n", | ||
" } else if (analysis == 'gamb_colu_arab'){\n", | ||
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\")\n", | ||
" } else if (analysis == 'gamb_colu_arab_fun'){\n", | ||
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\", \"funestus\")\n", | ||
" } else if (analysis == 'fun'){\n", | ||
" sp_bool = metadata$species == \"funestus\"\n", | ||
" }\n", | ||
"\n", | ||
" print(analysis)\n", | ||
" # subset to analysis\n", | ||
" meta = metadata[sp_bool, ]\n", | ||
" print(dim(meta))\n", | ||
" print(dim(counts))\n", | ||
"\n", | ||
" # analyse each experiment separately\n", | ||
" for (experiment in unique(meta$batch)){\n", | ||
" if (experiment == 5){\n", | ||
" next\n", | ||
" }\n", | ||
"\n", | ||
" # quit if shape of arrays incorrect\n", | ||
" stopifnot(nrow(meta) == length(counts))\n", | ||
"\n", | ||
" # subset to batch\n", | ||
" meta2 = meta %>% filter(batch == experiment)\n", | ||
" counts2 = counts[, meta2$sampleID]\n", | ||
"\n", | ||
" # quit if order not correct\n", | ||
" stopifnot(all(meta2$sampleID == colnames(counts2)))\n", | ||
"\n", | ||
" # get res and sus for each comparison\n", | ||
" resistants = unique(meta2[meta2$resistance == 'resistant',]$condition)\n", | ||
" susceptibles = unique(meta2[meta2$resistance == 'susceptible',]$condition)\n", | ||
" comparisons = crossing(resistants, susceptibles)\n", | ||
" print(experiment)\n", | ||
" print(as.data.frame(comparisons))\n", | ||
"\n", | ||
" for (i in 1:nrow(comparisons)){\n", | ||
" res = comparisons[i, 'resistants']\n", | ||
" sus = comparisons[i, 'susceptibles']\n", | ||
" comp = glue(\"{res}_v_{sus}\")\n", | ||
" print(comp)\n", | ||
"\n", | ||
" controls = which(meta2$condition %in% sus)\n", | ||
" cases = which(meta2$condition %in% res)\n", | ||
"\n", | ||
" idxs = c(controls, cases)\n", | ||
" subcounts = counts2[, idxs]\n", | ||
" subsamples = meta2[idxs,]\n", | ||
"\n", | ||
" # make treatment a factor with the 'susceptible' as reference\n", | ||
" subsamples$treatment = as.factor(subsamples$resistance)\n", | ||
" subsamples$treatment = relevel(subsamples$treatment, \"susceptible\")\n", | ||
"\n", | ||
" # make DESeq analysis\n", | ||
" print(\"subcounts shape\")\n", | ||
" print(dim(subcounts))\n", | ||
" print(head(subcounts))\n", | ||
" print(\"subsamples shape\")\n", | ||
" print(dim(subsamples))\n", | ||
" print(head(subsamples))\n", | ||
" dds = DESeqDataSetFromMatrix(countData = subcounts,\n", | ||
" colData = subsamples,\n", | ||
" design = ~ treatment)\n", | ||
"\n", | ||
" ###### estimate paramters and normalise\n", | ||
" dds = estimateSizeFactors(dds)\n", | ||
" dds = estimateDispersions(dds)\n", | ||
" dds = estimateDispersions(dds)\n", | ||
" cds = nbinomWaldTest(dds)\n", | ||
" results = results(cds, contrast = c(\"treatment\", \"susceptible\", \"resistant\")) %>% as.data.frame()\n", | ||
" results = results[order(results$padj),] #order by pvalue\n", | ||
" results = results %>% mutate(log2FoldChange=log2FoldChange*-1)\n", | ||
" results = results %>% rownames_to_column(\"GeneID\") %>% dplyr::mutate(\"FC\" = (2^log2FoldChange))\n", | ||
"\n", | ||
" ### absolute difference\n", | ||
" #### Get rowsums of counts, grouping by case/control. Then get difference of counts and join with DE results\n", | ||
" readdiff = data.frame(t(rowsum(t(subcounts), group = subsamples$treatment, na.rm = T))) #transpose and get rowsums for each group\n", | ||
" readdiff$absolute_diff = readdiff[,\"resistant\"] - readdiff[,\"susceptible\"] #get difference\n", | ||
" readdiff = data.frame(readdiff) %>% rownames_to_column('GeneID')\n", | ||
" results = unique(left_join(results, readdiff[,c('GeneID','absolute_diff')]))\n", | ||
"\n", | ||
" # join DE results with normal gene names\n", | ||
" results = unique(left_join(results, names_df))\n", | ||
" results_list[[comp]] = results\n", | ||
"\n", | ||
" fwrite(results, glue(\"results/genediff/{comp}_diffexp.csv\")) #write to csv\n", | ||
" #get number of sig genes\n", | ||
" res1 = results %>% filter(padj < 0.05) %>%\n", | ||
" count(\"direction\" = FC > 1) %>%\n", | ||
" dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.05\",\n", | ||
" direction == TRUE ~ \"Upregulated, padj = 0.05\")) %>%\n", | ||
" dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n", | ||
"\n", | ||
" res2 = results %>% filter(padj < 0.001) %>%\n", | ||
" count(\"direction\" = FC > 1) %>%\n", | ||
" dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.001\",\n", | ||
" direction == TRUE ~ \"Upregulated, padj = 0.001\")) %>%\n", | ||
" dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n", | ||
"\n", | ||
" nsig_list[[comp]] = bind_rows(res1, res2)\n", | ||
" cat(\"\\n\", glue(\"{comp} complete!\"), \"\\n\")\n", | ||
" }\n", | ||
" }\n", | ||
" #### write to excel file on diff sheets ####\n", | ||
" sheets = names(results_list)\n", | ||
" wb <- createWorkbook(\"Workbook\")\n", | ||
" for (i in 1:length(sheets)){\n", | ||
" addWorksheet(wb, glue(\"{sheets[[i]]}\"))\n", | ||
" writeData(wb, sheets[i], results_list[[i]], rowNames = FALSE, colNames = TRUE)\n", | ||
" }\n", | ||
" #### save workbook to disk once all worksheets and data have been added ####\n", | ||
" saveWorkbook(wb,file=f(\"results/genediff/{analysis}_genediff.xlsx\"), overwrite = TRUE)\n", | ||
" # Join different comparisons together and write out number of sig genes\n", | ||
" purrr::reduce(nsig_list, inner_join) %>% fwrite(f(\"results/genediff/{analysis}_nsig_genes.tsv\"), sep=\"\\t\", col.names = TRUE)\n", | ||
"\n", | ||
" fc_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n", | ||
" pval_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n", | ||
" for (i in 1:length(results_list)){\n", | ||
" name = sheets[i]\n", | ||
" name_var = glue(\"{name}_log2FoldChange\")\n", | ||
" name_pval = glue(\"{name}_padj\")\n", | ||
" df = results_list[[i]] %>%\n", | ||
" select(c(\"GeneID\", \"log2FoldChange\")) %>%\n", | ||
" rename({{ name_var }} := log2FoldChange)\n", | ||
"\n", | ||
" pval_df = results_list[[i]] %>%\n", | ||
" select(c(\"GeneID\", \"padj\")) %>%\n", | ||
" rename({{ name_var }} := padj)\n", | ||
"\n", | ||
" fc_data = fc_data %>% inner_join(df) %>% distinct()\n", | ||
" pval_data = pval_data %>% inner_join(pval_df) %>% distinct()\n", | ||
" }\n", | ||
"\n", | ||
" fc_data = fc_data %>% inner_join(names_df)\n", | ||
" pval_data = pval_data %>% inner_join(names_df)\n", | ||
"\n", | ||
" pval_data %>%\n", | ||
" select(-TranscriptID) %>%\n", | ||
" round_df(3) %>%\n", | ||
" distinct() %>%\n", | ||
" fwrite(., file=f(\"results/pvals.{analysis}.tsv\"), sep=\"\\t\")\n", | ||
"\n", | ||
" fc_data %>%\n", | ||
" select(-TranscriptID) %>%\n", | ||
" round_df(2) %>%\n", | ||
" distinct() %>%\n", | ||
" fwrite(., file=f(\"results/fcs.{analysis}.tsv\"), sep=\"\\t\")\n", | ||
"\n", | ||
" return(list(results_list, nsig_list))\n", | ||
"}" | ||
], | ||
"metadata": { | ||
"id": "QKRvivSKbViu" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"#### **Run analyses**" | ||
], | ||
"metadata": { | ||
"id": "K8rXoikKj1rw" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "bphla7K5WCCX" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"f = glue\n", | ||
"ag_pest_analyses = c(\"gamb_colu\", \"gamb_colu_arab\", \"gamb_colu_arab_fun\")\n", | ||
"\n", | ||
"AGAMnames_df = fread(\"https://github.com/sanjaynagi/rna-seq-pop/raw/master/resources/exampleGene2TranscriptMap.tsv\", sep=\"\\t\") %>% distinct()\n", | ||
"AFUNnames_df = fread(\"https://github.com/sanjaynagi/AnoExpress/raw/main/resources/AfunGenes2TranscriptMap.tsv\", sep=\"\\t\") %>% distinct()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# gambiae pest analyses\n", | ||
"for (analysis in ag_pest_analyses){\n", | ||
" res_list = diff_exp(analysis, names_df = AGAMnames_df)\n", | ||
"}\n", | ||
"\n", | ||
"# funestus only\n", | ||
"res = diff_exp(analysis = \"fun\", names_df = AFUNnames_df)" | ||
], | ||
"metadata": { | ||
"id": "aGlzmQ1ebwiP" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"sessionInfo()" | ||
], | ||
"metadata": { | ||
"id": "PiGr0E5RgA0R" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |