diff --git a/DESCRIPTION b/DESCRIPTION
index 97258038..4d159446 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -20,7 +20,6 @@ License: MIT + file LICENSE
 Depends:
     R (>= 4.1.0)
 Remotes:
-    umccr/gpgr,
     umccr/RNAsum.data
 Imports:
     AnnotationDbi,
@@ -31,12 +30,11 @@ Imports:
     DT,
     EDASeq,
     edgeR,
-    EnsDb.Hsapiens.v86,
     ensembldb,
     fs,
+    ggforce,
     ggplot2,
     glue,
-    gpgr,
     htmltools,
     htmlwidgets,
     knitr,
@@ -68,5 +66,5 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE,
     roclets = c("namespace", "rd", "roxytest::testthat_roclet"))
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
index e2819793..a49b932d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -38,6 +38,7 @@ export(pca)
 export(pcgr_expr)
 export(pcgr_tiers_tsv_read)
 export(perc_rank)
+export(ppl_cnv_som_gene_read)
 export(prepare2write)
 export(purple_cnv_summary)
 export(rcircos_cyto_info38)
diff --git a/R/sample_data.R b/R/sample_data.R
index 4fb5563a..a40c8552 100644
--- a/R/sample_data.R
+++ b/R/sample_data.R
@@ -140,7 +140,7 @@ read_wgs_data <- function(p) {
 
   purple_gene_tsv <- .read(
     p = p, subdir = "purple", pat = "purple\\.cnv\\.gene\\.tsv$",
-    nm = "purple_gene_tsv", func = gpgr::purple_cnv_som_gene_read
+    nm = "purple_gene_tsv", func = ppl_cnv_som_gene_read
   )
 
   manta_tsv <- .read(
@@ -324,3 +324,33 @@ immune_summary <- function(tbl_imarkers, tbl_igram = NULL, igram_param = TRUE) {
   res <- unique(c(res1, res2)) |> stats::na.omit()
   res
 }
+
+#' Read PURPLE CNV Gene File
+#'
+#' Reads the `purple.cnv.gene.tsv` file, which summarises copy number
+#' alterations of each gene in the HMF panel
+#' (see https://github.com/hartwigmedical/hmftools/tree/master/purple#gene-copy-number-file).
+#'
+#' @param x Path to `purple.cnv.gene.tsv` file.
+#'
+#' @return The input file as a tibble.
+#'
+#' @export
+ppl_cnv_som_gene_read <- function(x) {
+  nm <- c(
+    "chromosome" = "c", "start" = "i", "end" = "i", "gene" = "c",
+    "minCopyNumber" = "d", "maxCopyNumber" = "d",
+    "unused" = "c", "somaticRegions" = "d", "germlineHomDeletionRegions" = "d",
+    "germlineHetToHomDeletionRegions" = "d",
+    "transcriptId" = "c", "transcriptVersion" = "c", "chromosomeBand" = "c",
+    "minRegions" = "d", "minRegionStart" = "i", "minRegionEnd" = "i",
+    "minRegionStartSupport" = "c", "minRegionEndSupport" = "c",
+    "minRegionMethod" = "c", "minMinorAlleleCopyNumber" = "d"
+  )
+
+  ctypes <- paste(nm, collapse = "")
+  purple_cnv_gene <- readr::read_tsv(x, col_types = ctypes)
+  assertthat::assert_that(ncol(purple_cnv_gene) == length(nm))
+  assertthat::assert_that(all(colnames(purple_cnv_gene) == names(nm)))
+  purple_cnv_gene
+}
diff --git a/deploy/conda/recipe/meta.yaml b/deploy/conda/recipe/meta.yaml
index 524024de..4f42dde6 100644
--- a/deploy/conda/recipe/meta.yaml
+++ b/deploy/conda/recipe/meta.yaml
@@ -21,17 +21,16 @@ requirements:
     - bioconductor-annotationfilter
     - r-assertthat
     - r-conflicted
-    - r-dt
     - r-dplyr
+    - r-dt
     - bioconductor-edaseq ==2.28.0
-    - bioconductor-ensdb.hsapiens.v86
     - bioconductor-edger ==3.36.0
     - bioconductor-ensembldb
     - r-fs
     - r-ggforce
     - r-ggplot2
     - r-glue
-    - umccr::r-gpgr
+    - r-here
    - r-htmltools
     - r-htmlwidgets
     - r-knitr
@@ -61,17 +60,16 @@ requirements:
     - bioconductor-annotationfilter
     - r-assertthat
     - r-conflicted
-    - r-dt
     - r-dplyr
+    - r-dt
     - bioconductor-edaseq ==2.28.0
-    - bioconductor-ensdb.hsapiens.v86
     - bioconductor-edger ==3.36.0
     - bioconductor-ensembldb
     - r-fs
     - r-ggforce
     - r-ggplot2
     - r-glue
-    - umccr::r-gpgr
+    - r-here
     - r-htmltools
     - r-htmlwidgets
     - r-knitr
diff --git a/inst/rmd/.gitignore b/inst/rmd/.gitignore
index 9b39fad3..654b996c 100644
--- a/inst/rmd/.gitignore
+++ b/inst/rmd/.gitignore
@@ -1,3 +1,3 @@
-rnasum.html
 rnasum_files/
 rnasum.md
+/html/
diff --git a/inst/rmd/WTS_WGS_samples_mapping.Rmd b/inst/rmd/WTS_WGS_samples_mapping.Rmd
deleted file mode 100644
index 122a17ce..00000000
--- a/inst/rmd/WTS_WGS_samples_mapping.Rmd
+++ /dev/null
@@ -1,240 +0,0 @@
----
-title: "UMCCR WTS-WGS Results Mapping"
-author: "UMCCR"
-date: "`r Sys.Date()`"
-output:
-  html_document:
-    theme: readable
-    toc: false
-    toc_float: false
-    code_folding: hide
-params:
-  project: "Avner"
-  secondary:
-  sample_names: "CCR170028R_MHP002_T,CCR170058R_MHP002_O,CCR170012R_MHP013_T,CCR170093R_MHP013_O,CCR170013R_MHP006_T,CCR170123R_MHP006_O,CCR170090R_MHP031_T,CCR170119R_MHP032_O,CCR170031R_MHP010_T,CCR170057_RNA_AH17T001P001,CCR170058_RNA_WPT002,CCR170063_RNA_WH17T001P002,CCR170070_RNA_VPT_A001,CCR170093_RNA_WPT_013,CCR170119_RNA_VPT_M032,CCR170122_RNA_VPT_M030,CCR170123_RNA_VPT_M006B,CCR180010_RNA_WH18T002P010,CCR180021_RNA_VPX_AH001_P2,CCR180044_RNA_VPT_W003_p13,CCR180054_RNA_VPT_WH010,CCR180059_RNA_VPT_M030,CCR170131B_RNA-MM17T002P008,CCR170012_MH17T001P013,CCR170030_17MH008S2,CCR180051_VPY_MH002,CCR180065_VPT_MM008,CCR180072_VPT_WH16,CCR180038_SV18T002P006_RNA,CCR180084_VPT-MH013_RNA,CCR180081_MH18T002P053_RNA,CCR180109_VPT-EH09_RNA,CCR170057_AH17T001P001_MN_RNA,CCR170070_VPT_A001_MN_RNA,CCR180029_MH18T002P038_MN_RNA,CCR170063_RNA_WH17T001P002,CCR170085_RNA_WH17F002P005,CCR180081_RNA_MH18T002P053,CCR180088_RNA_NH18T002P003,CCR180108_RNA_MH18F002P057,CCR180116_RNA_WH18T002P022,CCR180122_RNA_VPT_WH002,CCR180124_RNA_VPT_MH057,CCR180128_RNA_VPT_NH3_PEA,CCR180129_RNA_VPT_NH4,CCR180130_RNA_VPT_WH5,CCR180131_RNA_VPT_WH22,CCR180137_RNA_VPT_WH23,CCR180104_RNA_NH18F001P004,CCR180027_RNA_MH18T002P043,CCR180029_RNA_MH18T002P038,CCR180051_RNA_VPY-MH002,CCR170115b_MH17T002P033_RNA,CCR180140_NH18F001P005_RNA,CCR170085_WH17F002P005_RNA,CCR180130_VPT-WH5_RNA,CCR180108_MH18F002P057_RNA,CCR180124_VPT-MH057_RNA,CCR180116_WH18T002P022_RNA,CCR180131_VPT-WH22_RNA,CCR180141_VPT-NH5_RNA,CCR180171_AH18T002P008_RNA,CCR180183_VPT-AH008_RNA,CCR170109_AH17T002P005_RNA,CCR180004_SV17T002P003_RNA,CCR180064_SV18T002P007_RNA,CCR180085_VPT-SV006_RNA,CCR180014_AH18T002P006_RNA,CCR180029_MH18T002P038_RNA,CCR180055_AH18T002P007_RNA,CCR180095_WH18T002P019_RNA,CCR180101_MM18T002P003_RNA,CCR180038_SV18T002P006_RNA,CCR180023_MH18F001P040,CCR180121_WH18F001P023,CCR180148_MH18F001P062,CCR180109_VPT-EH09,CCR180159_VPT-WH017A,CCR180075_WH18T002P017,CCR180172_VPT-MH062,CCR180128_VPT-NH3PEA,CCR180088_NH18T002P003,CCR180137_VPT-WH23,CCR180161_VPT-WH029,CCR180086_VPT-MH051,CCR180160_VPT-WH031,CCR180129_VPT-NH4,CCR180195_AH18T002P009,CCR180092_SV18T002P009,CCR190007_VPT-CH003,CCR170125_VPT-M033,CCR190005_VPT-MH074,CCR170045_MH17T001P020,CCR180219_AH18T002P011,CCR180229_MM18T002P004,CCR180149_VPT-WH025-E,CCR190006_VPT-MH065,CCR180083_MH18F001P054,CCR170034_MH17F001P017,CCR180200_SV18T002P011_rna,CCR170032_MH17T001P012_rna,CCR170079_MH17T002P027_rna,CCR170095_VPT-M027_rna,CCR170046_MH17T001P021_rna,CCR170078_MH17F002P028_rna,CCR170080_MH17T002P030_rna,CCR180136_WH18F001P025_rna,CCR180222_CH18T002P003_rna,CCR170036_MH17F001P018_rna,CCR180215_MH18T002P074_rna,CCR190028_VPT-MH075A_RNA,CCR190022_MH19T002P075_RNA"
-  debug: FALSE
----
-
-```{r custom, echo=FALSE, message=FALSE, warning=FALSE}
-library(ggplot2)
-library(knitr)
-library(tidyr)
-library(rmarkdown)
-library(dplyr)
-library(readr)
-library(forcats)
-library(stringr)
-library(janitor)
-library(googledrive)
-library(here)
-library(skimr)
-library(purrr)
-library(data.table)
-```
-
-```{r define_functions, comment=NA, message=FALSE, warning=FALSE}
-##### Define functions
-##### Find length of overlap between two strings (from https://stackoverflow.com/questions/48701107/find-length-of-overlap-in-strings)
-str_overlap <- function(str1, str2, ignore.case = FALSE, verbose = FALSE) {
-
-  if(ignore.case) {
-    str1 <- tolower(str1)
-    str2 <- tolower(str2)
-  }
-
-  if(nchar(str1) < nchar(str2)) {
-    x <- str2
-    str2 <- str1
-    str1 <- x
-  }
-
-  x <- strsplit(str2, "")[[1L]]
-  n <- length(x)
-  s <- sequence(seq_len(n))
-  s <- split(s, cumsum(s == 1L))
-  s <- rep(list(s), n)
-
-  for(i in seq_along(s)) {
-    s[[i]] <- lapply(s[[i]], function(x) {
-      x <- x + (i-1L)
-      x[x <= n]
-    })
-    s[[i]] <- unique(s[[i]])
-  }
-
-  s <- unlist(s, recursive = FALSE)
-  s <- unique(s[order(-lengths(s))])
-
-  i <- 1L
-  len_s <- length(s)
-  while(i < len_s) {
-    lcs <- paste(x[s[[i]]], collapse = "")
-    if(verbose) cat("now checking:", lcs, "\n")
-    check <- grepl(lcs, str1, fixed = TRUE)
-    if(check) {
-      #cat("the (first) longest common substring is:", lcs, "of length", nchar(lcs), "\n")
-      break
-    } else {
-      i <- i + 1L
-    }
-  }
-  return( list(lcs, nchar(lcs)) )
-}
-```
-
-## Introduction
-
-Code snippets to generate result summaries for WTS-WGS runs for RNAseq report generation, based on a [Google Spreadsheet](https://docs.google.com/spreadsheets/d/1DwvyfVrgr5TIcYtGVXZeIWWRbld-nuX-4o4z2kZHNWs/edit#gid=0), aka the dreadful LIMS stand-in.
-
-### 1. Setting up project information
-
-This currently requires specifying the project name and the WTS samples of interest. It will pull out all files of that type and project which do not have results associated with them yet. This should be generalized at some point to support sample extraction by Illumina RunID or by patient ID. The exact names can be copied from the `project` and `sample_name` (assigned to WTS samples of interest) columns of the [Google-LIMS sheet](https://docs.google.com/spreadsheets/u/1/d/1aaTvXrZSdA1ekiLEpW60OeNq2V7D_oEMBzTgC-uDJAM/edit#gid=0). The `googledrive` framework requires authentication with oAuth. This can be done interactive, but storing a token simplifies the process; see the [googlesheet authentication vignette](https://rawgit.com/jennybc/googlesheets/master/vignettes/managing-auth-tokens.html) for details.
-
-An alternative use is to set the "secondary" analysis flag to match the samples that need to be processed, regardless of processing status. The `PROJECT` name will be used to name config and sample files, but any sample with the matching `SECONDARY` entry in the `secondary analysis` column of the spreadsheet will be added. This is useful when re-processing samples for research projects. Long term, the idea is that we get rid of this filtering step completely and just generate sync lists and templates for all samples that still need to be processed, then mark the processing stage in Google-LIMS to avoid duplication.
-
-```{r project}
-##### Collect samples selection criteria
-PROJECT <- params$project
-
-if ( is.null(params$secondary) ) {
-
-  SECONDARY <- ""
-
-} else {
-  SECONDARY <- params$secondary
-}
-
-SAMPLES <- unlist(strsplit(params$sample_names, split=',', fixed=TRUE))
-
-# Extract info about individual samples
-SAMPLES <- unlist(lapply(SAMPLES, str_replace_all, pattern = '-', replacement ='_'))
-```
-
-### 2. Import data from Google spreadsheet
-
-This step generates a data frame from the Google spreadsheet, unifying variable names and simplifying subject IDs for bcbio along the way. It replaces empty `results` cells (which indicate samples that still need processing) with a `-` to distinguish from true NAs and gets rid of whitespace in subject identifiers (although ideally there should not be any empty cells in the sheet in the first place). Otherwise standard data cleanup with `janitor`. We are also creating a timestamped backup of the Google Doc each time it is accessed, just in case.
-
-```{r importData, comment = NA, message=FALSE, warning=FALSE}
-# Register UMCCR spreadsheet. Use cached authentication
-#gs_auth(token = "./googlesheets_token_umccr.rds")
-
-# Create a backup copy each time we access the sample info
-tm <- as.POSIXlt(Sys.time(), "UTC", "%Y-%m-%dT%H:%M:%S")
-timestamp <- strftime(tm , "%Y-%m-%dT%H%M")
-filename <- paste(timestamp, 'backup.csv', sep = '_')
-backup_dir <- file.path(here::here('output', 'backup'))
-dir.create(backup_dir, recursive = TRUE) # mkdir -p (warns if it already exists):
-
-# Google Drive implementation
-lims_key <- drive_find('^Google LIMS$', team_drive = 'LIMS')$id
-drive_download(as_id(lims_key), path = file.path(backup_dir, filename), overwrite = TRUE)
-
-# Import downloaded spreadsheet (sheet #1) as a tibble
-samples_gs <- read_csv(file.path(backup_dir, filename))
-
-# Tweak for analysis
-samples <- samples_gs %>%
-  clean_names() %>%
-  mutate(secondary_analysis = ifelse(is.na(secondary_analysis), '-', secondary_analysis)) %>%
-  remove_empty(c('rows', 'cols')) %>%
-  mutate(subject_id = gsub(' ', '.', subject_id)) %>%
-  mutate(results = ifelse(is.na(results), '-', results)) %>%
-  dplyr::select(-matches('number_fastqs')) # Drop FASTQ count introduced with new version for now
-
-# Add unique id to each row
-samples <- samples %>%
-  mutate(row_id = rownames(samples))
-```
-
-### 3. Find results ready for reporting
-
-Find all samples that belong to the provided project and that have been processed:
-
-```{r subsetSamples}
-# Keep rows matching project and type requested; extract path to FASTQ
-# and generate new file name for these
-bcbio <- samples %>%
-  dplyr::filter(((project == PROJECT & results != '-') |
-                   (secondary_analysis == SECONDARY))) %>%
-  dplyr::select(fastq, results, run, project, sample_id, sample_name, subject_id, type, phenotype, row_id) %>%
-  dplyr::mutate(targetname = paste(sample_name, sep = '_')) %>%
-  dplyr::mutate(targetname = str_replace_all(targetname, '-', '_'))
-```
-
-### 4. Remove topup and normal samples
-
-Samples with the exact same name (but from different runs) are expected to be top-ups and would have been merged with original samples during upstream processing. Also, WGS has normal samples, which are not required for the mapping.
-
-```{r removeTopups}
-# Remove top-up samples (samples with the exact same description)
-# before calculating WTS-WGS result assignments.
-# This assumes topups are consistently flagged with a `_topup`
-# suffix
-template <- bcbio[!(stringr::str_detect(bcbio$sample_id, pattern = '_topup$')), ]
-template <- template[!(template$type=='WGS' & template$phenotype=='normal'), ]
-```
-
-### 5. Generating templates for WTS-WGS results mapping {.tabset}
-
-Generate a file with pointers to the sample FASTQs and their WGS-WTS result path (if both available).
-
-First, for each queried WTS sample get the `subject_id`, list WGS (column `type`) samples with the corresponding `subject_id` and, in case there are multiple WGS samples listed for given `subject_id`, use pattern matching to get the WGS sample that best matches with the queried WTS `sample_name`. To do this, for each *WGS `sample_name`* search for the the *overlap* with *WTS `sample_name`* and then report the *WGS `sample_name`* with the greatest *overlap* with the corresponding *WTS `sample_name`*.
-
-```{r wts_wgs_templates, comment = NA, message=FALSE, warning=FALSE}
-# Create a list with relevant info each euqried sample,
-SAMPLES_match <- vector("list", length(SAMPLES))
-names(SAMPLES_match) <- SAMPLES
-
-# Loop though each queried samples, get the "subject_id", list WGS ("type") samples with the corresponding "subject_id" and in case there are multiple WGS samples use pattern matching to get the WGS sample that best matches with the queried WTS "sample_name"
-for ( sample_WTS in SAMPLES ) {
-
-  # Get the "subject_id"
-  WTS_subset <- template[ template$targetname == sample_WTS & template$type == "WTS" , ]
-  subject.id <- WTS_subset$subject_id
-
-  WGS_subset <- template %>%
-    dplyr::filter(((type == 'WGS' & subject_id == subject.id)))
-
-  # Select the WGS sample that best matches with the queried WTS "sample_name"
-  if ( nrow(WGS_subset) > 1 ) {
-
-    if ( params$debug ) { cat("\n\nWTS sample: ", sample_WTS) }
-
-    sample_WTGS.overlaps <- NULL
-
-    # For each WGS "sample_name" (actually "targetname") search for the the overlap with WTS sample (after removing "_" to minimise chance that these will contribute to the matching patterns")
-    for ( sample_WGS in WGS_subset$targetname ) {
-
-      sample_WTGS.overlap <- str_overlap(str_replace_all(sample_WTS, '_', ''), str_replace_all(sample_WGS, '_', ''))
-      sample_WTGS.overlaps <- c( sample_WTGS.overlaps, sample_WTGS.overlap[[2]])
-
-      if ( params$debug ) { cat("\nWGS: ", sample_WGS, "\tWTGS overlap: ", sample_WTGS.overlap[[1]], "\tOverlap length: ", sample_WTGS.overlap[[2]]) }
-    }
-
-    if ( params$debug ) { cat("\nWTS - WGS match: ", sample_WTS, " - ", WGS_subset[ which.max(sample_WTGS.overlaps), ]$sample_name) }
-
-    # Report WGS sample with greatest name overlap and combine info with WTS sample info
-    SAMPLES_match[[ sample_WTS ]] <- cbind(WTS_subset, WGS_subset[ which.max(sample_WTGS.overlaps), ])
-
-  } else if ( nrow(WGS_subset) == 1 ) {
-
-    SAMPLES_match[[ sample_WTS ]] <- cbind(WTS_subset, WGS_subset)
-  }
-}
-
-# Combined info for all samples
-SAMPLES_match.table <- rbindlist(SAMPLES_match, use.names=TRUE, fill=FALSE)
-names(SAMPLES_match.table) <- paste( names(SAMPLES_match[[ sample_WTS ]]), c(rep("WTS", ncol(WTS_subset)), rep("WGS", ncol(WGS_subset))), sep = "." )
-```
-
-#### Summary table
-
-```{r wts_wgs_templates_table_summary, comment = NA, message=FALSE, warning=FALSE}
-DT::datatable( data = SAMPLES_match.table[ , c("project.WTS", "subject_id.WTS", "sample_name.WTS", "sample_name.WGS", "phenotype.WTS", "phenotype.WGS", "results.WTS", "results.WGS", "fastq.WTS", "fastq.WGS") ], filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = 200, scroller = TRUE), width = 900, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE)
-```
-
-#### Full table
-
-```{r wts_wgs_templates_table_full, comment = NA, message=FALSE, warning=FALSE}
-DT::datatable( data = SAMPLES_match.table, filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = 200, scroller = TRUE), width = 900, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE)
-```
diff --git a/man/ppl_cnv_som_gene_read.Rd b/man/ppl_cnv_som_gene_read.Rd
new file mode 100644
index 00000000..fae18aaa
--- /dev/null
+++ b/man/ppl_cnv_som_gene_read.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sample_data.R
+\name{ppl_cnv_som_gene_read}
+\alias{ppl_cnv_som_gene_read}
+\title{Read PURPLE CNV Gene File}
+\usage{
+ppl_cnv_som_gene_read(x)
+}
+\arguments{
+\item{x}{Path to \code{purple.cnv.gene.tsv} file.}
+}
+\value{
+The input file as a tibble.
+}
+\description{
+Reads the \code{purple.cnv.gene.tsv} file, which summarises copy number
+alterations of each gene in the HMF panel
+(see https://github.com/hartwigmedical/hmftools/tree/master/purple#gene-copy-number-file).
+}
diff --git a/tests/testthat/test-roxytest-testexamples-sample_data.R b/tests/testthat/test-roxytest-testexamples-sample_data.R
index 952b6d2a..97f733de 100644
--- a/tests/testthat/test-roxytest-testexamples-sample_data.R
+++ b/tests/testthat/test-roxytest-testexamples-sample_data.R
@@ -18,7 +18,7 @@ test_that("Function read_sample_data() @ L27", {
 })
 
 
-test_that("Function read_wgs_data() @ L103", {
+test_that("Function read_wgs_data() @ L107", {
   p <- list(
     umccrise = system.file("rawdata/test_data/umccrised/test_sample_WGS", package = "RNAsum"),
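
Usage sketch (illustrative, not part of the patch above): a minimal example of calling the newly exported reader. The input path is hypothetical; any PURPLE `purple.cnv.gene.tsv` output should satisfy the column specification hard-coded in the function.

# A minimal sketch, assuming the patched RNAsum package is installed;
# the TSV path below is hypothetical and only stands in for real PURPLE output.
library(RNAsum)
cnv_gene <- ppl_cnv_som_gene_read("path/to/sample.purple.cnv.gene.tsv")
# Returns a tibble with the 20 expected columns; peek at a few of them:
cnv_gene[, c("chromosome", "start", "end", "gene", "minCopyNumber", "maxCopyNumber")]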