Skip to content


clear all outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
sanjaynagi committed Aug 22, 2023
1 parent 8179124 commit 6ea46f5
Showing 1 changed file with 327 additions and 0 deletions.
327 changes: 327 additions & 0 deletions workflow/notebooks/differential-expression-meta-analysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNvheDHpizoW314Fm/cuodU",
"include_colab_link": true
"kernelspec": {
"name": "ir",
"display_name": "R"
"language_info": {
"name": "R"
"cells": [
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
"source": [
"<a href=\"\" target=\"_parent\"><img src=\"\" alt=\"Open In Colab\"/></a>"
"cell_type": "code",
"source": [
"if (!require(\"BiocManager\", quietly = TRUE))\n",
" install.packages(\"BiocManager\")\n",
"install.packages(c(\"pheatmap\", \"data.table\", \"ggrepel\", \"openxlsx\", \"tidyverse\", \"plotly\", \"RColorBrewer\"))"
"metadata": {
"id": "Y2oEs5J6XPJA"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"metadata": {
"id": "NlVr0JHcw7Cz"
"execution_count": null,
"outputs": []
"cell_type": "markdown",
"source": [
"## AnoExpress - Differential expression meta-analysis with DESeq2\n",
"In this notebook, we perform the differential expression analysis for AnoExpress."
"metadata": {
"id": "pr9o9vgsa-pm"
"cell_type": "code",
"source": [
"dir.create(\"results/genediff\", recursive=TRUE)"
"metadata": {
"id": "6ppADEcHkndP"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"round_df = function(df, digits) {\n",
" #' This function rounds all the numeric columns of a data.frame\n",
" nums = vapply(df, is.numeric, FUN.VALUE = logical(1))\n",
" df[,nums] = round(df[,nums], digits = digits)\n",
" (df)\n",
"diff_exp = function(analysis, names_df){\n",
" # this function runs the diff exp analysis for one AnoExpress analysis\n",
" results_list = list()\n",
" nsig_list = list()\n",
" # load metadata\n",
" metadata = fread(\"\") %>%\n",
" metadata[(metadata$batch == 1) &(metadata$species == 'arabiensis'), 'batch'] = 0\n",
" metadata$batch = factor(metadata$batch)\n",
" # load analysis\n",
" counts = fread(f(\"{analysis}.tsv\"), sep=\"\\t\") %>%\n",
" %>%\n",
" column_to_rownames(\"GeneID\") %>%\n",
" 2^. %>%\n",
" mutate_if(is.numeric, as.integer)\n",
" # get boolean indexer for species depending on analysis\n",
" if (analysis == 'gamb_colu'){\n",
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\")\n",
" } else if (analysis == 'gamb_colu_arab'){\n",
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\")\n",
" } else if (analysis == 'gamb_colu_arab_fun'){\n",
" sp_bool = metadata$species %in% c(\"gambiae\", \"coluzzii\", \"arabiensis\", \"funestus\")\n",
" } else if (analysis == 'fun'){\n",
" sp_bool = metadata$species == \"funestus\"\n",
" }\n",
" print(analysis)\n",
" # subset to analysis\n",
" meta = metadata[sp_bool, ]\n",
" print(dim(meta))\n",
" print(dim(counts))\n",
" # analyse each experiment separately\n",
" for (experiment in unique(meta$batch)){\n",
" if (experiment == 5){\n",
" next\n",
" }\n",
" # quit if shape of arrays incorrect\n",
" stopifnot(nrow(meta) == length(counts))\n",
" # subset to batch\n",
" meta2 = meta %>% filter(batch == experiment)\n",
" counts2 = counts[, meta2$sampleID]\n",
" # quit if order not correct\n",
" stopifnot(all(meta2$sampleID == colnames(counts2)))\n",
" # get res and sus for each comparison\n",
" resistants = unique(meta2[meta2$resistance == 'resistant',]$condition)\n",
" susceptibles = unique(meta2[meta2$resistance == 'susceptible',]$condition)\n",
" comparisons = crossing(resistants, susceptibles)\n",
" print(experiment)\n",
" print(\n",
" for (i in 1:nrow(comparisons)){\n",
" res = comparisons[i, 'resistants']\n",
" sus = comparisons[i, 'susceptibles']\n",
" comp = glue(\"{res}_v_{sus}\")\n",
" print(comp)\n",
" controls = which(meta2$condition %in% sus)\n",
" cases = which(meta2$condition %in% res)\n",
" idxs = c(controls, cases)\n",
" subcounts = counts2[, idxs]\n",
" subsamples = meta2[idxs,]\n",
" # make treatment a factor with the 'susceptible' as reference\n",
" subsamples$treatment = as.factor(subsamples$resistance)\n",
" subsamples$treatment = relevel(subsamples$treatment, \"susceptible\")\n",
" # make DESeq analysis\n",
" print(\"subcounts shape\")\n",
" print(dim(subcounts))\n",
" print(head(subcounts))\n",
" print(\"subsamples shape\")\n",
" print(dim(subsamples))\n",
" print(head(subsamples))\n",
" dds = DESeqDataSetFromMatrix(countData = subcounts,\n",
" colData = subsamples,\n",
" design = ~ treatment)\n",
" ###### estimate parameters and normalise\n",
" dds = estimateSizeFactors(dds)\n",
" dds = estimateDispersions(dds)\n",
" dds = estimateDispersions(dds)\n",
" cds = nbinomWaldTest(dds)\n",
" results = results(cds, contrast = c(\"treatment\", \"susceptible\", \"resistant\")) %>%\n",
" results = results[order(results$padj),] #order by pvalue\n",
" results = results %>% mutate(log2FoldChange=log2FoldChange*-1)\n",
" results = results %>% rownames_to_column(\"GeneID\") %>% dplyr::mutate(\"FC\" = (2^log2FoldChange))\n",
" ### absolute difference\n",
" #### Get rowsums of counts, grouping by case/control. Then get difference of counts and join with DE results\n",
" readdiff = data.frame(t(rowsum(t(subcounts), group = subsamples$treatment, na.rm = T))) #transpose and get rowsums for each group\n",
" readdiff$absolute_diff = readdiff[,\"resistant\"] - readdiff[,\"susceptible\"] #get difference\n",
" readdiff = data.frame(readdiff) %>% rownames_to_column('GeneID')\n",
" results = unique(left_join(results, readdiff[,c('GeneID','absolute_diff')]))\n",
" # join DE results with normal gene names\n",
" results = unique(left_join(results, names_df))\n",
" results_list[[comp]] = results\n",
" fwrite(results, glue(\"results/genediff/{comp}_diffexp.csv\")) #write to csv\n",
" #get number of sig genes\n",
" res1 = results %>% filter(padj < 0.05) %>%\n",
" count(\"direction\" = FC > 1) %>%\n",
" dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.05\",\n",
" direction == TRUE ~ \"Upregulated, padj = 0.05\")) %>%\n",
" dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n",
" res2 = results %>% filter(padj < 0.001) %>%\n",
" count(\"direction\" = FC > 1) %>%\n",
" dplyr::mutate(\"direction\" = case_when(direction == FALSE ~ \"Downregulated, padj = 0.001\",\n",
" direction == TRUE ~ \"Upregulated, padj = 0.001\")) %>%\n",
" dplyr::rename(!!glue(\"{comp}_ngenes\") := \"n\")\n",
" nsig_list[[comp]] = bind_rows(res1, res2)\n",
" cat(\"\\n\", glue(\"{comp} complete!\"), \"\\n\")\n",
" }\n",
" }\n",
" #### write to excel file on diff sheets ####\n",
" sheets = names(results_list)\n",
" wb <- createWorkbook(\"Workbook\")\n",
" for (i in 1:length(sheets)){\n",
" addWorksheet(wb, glue(\"{sheets[[i]]}\"))\n",
" writeData(wb, sheets[i], results_list[[i]], rowNames = FALSE, colNames = TRUE)\n",
" }\n",
" #### save workbook to disk once all worksheets and data have been added ####\n",
" saveWorkbook(wb,file=f(\"results/genediff/{analysis}_genediff.xlsx\"), overwrite = TRUE)\n",
" # Join different comparisons together and write out number of sig genes\n",
" purrr::reduce(nsig_list, inner_join) %>% fwrite(f(\"results/genediff/{analysis}_nsig_genes.tsv\"), sep=\"\\t\", col.names = TRUE)\n",
" fc_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n",
" pval_data = data.frame(\"GeneID\" = results_list[[1]]$GeneID)\n",
" for (i in 1:length(results_list)){\n",
" name = sheets[i]\n",
" name_var = glue(\"{name}_log2FoldChange\")\n",
" name_pval = glue(\"{name}_padj\")\n",
" df = results_list[[i]] %>%\n",
" select(c(\"GeneID\", \"log2FoldChange\")) %>%\n",
" rename({{ name_var }} := log2FoldChange)\n",
" pval_df = results_list[[i]] %>%\n",
" select(c(\"GeneID\", \"padj\")) %>%\n",
" rename({{ name_var }} := padj)\n",
" fc_data = fc_data %>% inner_join(df) %>% distinct()\n",
" pval_data = pval_data %>% inner_join(pval_df) %>% distinct()\n",
" }\n",
" fc_data = fc_data %>% inner_join(names_df)\n",
" pval_data = pval_data %>% inner_join(names_df)\n",
" pval_data %>%\n",
" select(-TranscriptID) %>%\n",
" round_df(3) %>%\n",
" distinct() %>%\n",
" fwrite(., file=f(\"results/pvals.{analysis}.tsv\"), sep=\"\\t\")\n",
" fc_data %>%\n",
" select(-TranscriptID) %>%\n",
" round_df(2) %>%\n",
" distinct() %>%\n",
" fwrite(., file=f(\"results/fcs.{analysis}.tsv\"), sep=\"\\t\")\n",
" return(list(results_list, nsig_list))\n",
"metadata": {
"id": "QKRvivSKbViu"
"execution_count": null,
"outputs": []
"cell_type": "markdown",
"source": [
"#### **Run analyses**"
"metadata": {
"id": "K8rXoikKj1rw"
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bphla7K5WCCX"
"outputs": [],
"source": [
"f = glue\n",
"ag_pest_analyses = c(\"gamb_colu\", \"gamb_colu_arab\", \"gamb_colu_arab_fun\")\n",
"AGAMnames_df = fread(\"\", sep=\"\\t\") %>% distinct()\n",
"AFUNnames_df = fread(\"\", sep=\"\\t\") %>% distinct()"
"cell_type": "code",
"source": [
"# gambiae pest analyses\n",
"for (analysis in ag_pest_analyses){\n",
" res_list = diff_exp(analysis, names_df = AGAMnames_df)\n",
"# funestus only\n",
"res = diff_exp(analysis = \"fun\", names_df = AFUNnames_df)"
"metadata": {
"id": "aGlzmQ1ebwiP"
"execution_count": null,
"outputs": []
"cell_type": "code",
"source": [
"metadata": {
"id": "PiGr0E5RgA0R"
"execution_count": null,
"outputs": []

0 comments on commit 6ea46f5

Please sign in to comment.