(This is a draft)
lrRNAseqBenchmark
is as R
package that facilitates the validation of
genetic variants called from long-read RNA-seq (lrRNA-seq) data by one
or more variant callers, by comparing their VCF files with a
ground-truth VCF file (commonly generated from high-coverage short-read
data).
lrRNAseqBenchmark
constructs a big data.frame (master table) that
contains information about the variants called by the methods to be
validated and the variants in the ground truth. From this master table,
many plots can be made by different functions available in the package
(ggplot2
-base packages used internally), which allow the visial
analyses including variant proximity to splice junctions, variants in
homopolymers, and variants from allele specific expressed (ASE) genes.
To install this package just type in your R
devtools::install_github("vladimirsouza/lrRNAseqBenchmark@main")
For a full list and description of all functions available in this package, click here.
For less computational efforts, we use data only from chromosome 22.
Generate a master table to compare and validate the variant calls from Iso-Seq data when using variant caller DeepVariant alone and DeepVariant combined with SplitNCigarReads and flagCorrection. The ground-truth VCF was generated by GATK pipeline on high-coverage short-read DNA data. We treat these two pipelines as methods.
- Reference_fasta, FASTA file with the reference sequence of chromosomes 21 and 22.
- ORIGINAL_BAM, BAM file with Iso-Seq reads aligned to the reference.
- SNCR+fC_BAM,
BAM file after manipulations with
SplitNCigarReads
andflagCorrection
inORIGINAL_BAM
. - DeepVariant_VCF,
VCF files with variants called from
ORIGINAL_BAM
. - SNCR+fC+DeepVariant_VCF,
VCF files with variants called from
SNCR+fC+DeepVariant_VCF
. - Ground-truth_VCF, VCF files with ground-truth variants called from high-coverage short-read DNA data.
### methods to validate
# name of the methods to validate
METHOD_NAMES <- c("dv", "sncr_fc_dv")
# name of the dataset used with the methods to validate
METHOD_DATASET_NAME <- "isoSeq_chr21_22"
# VCF files
METHOD_VCF_FILES <- c(
"/home/vbarbo/master_table_example/data/deepvariant_chr21_22.vcf",
"/home/vbarbo/master_table_example/data/sncr_fc_deepvariant_chr21_22.vcf"
)
# BAM of the data
METHOD_BAM_FILE <- "/home/vbarbo/master_table_example/data/isoseq_chr21_22.bam"
### ground-truth
# name
TRUTH_NAME <- "truth"
# name of the dataset used to generate the ground-truth
TRUTH_DATASET_NAME <- "shortRead_chr21_22"
# VCF file
TRUTH_VCF_FILE <- "/home/vbarbo/master_table_example/data/ground_truth_chr21_22.vcf"
# BAM file
TRUTH_BAM_FILE <- "/home/vbarbo/master_table_example/data/short_read_chr21_22.bam"
### variables
# the maximum distance (number of bases) of a variant from a splice junction to be
# consider near the splice junction
MAX_DIST_FROM_SPLICE_SITE <- 20
# number of threads to use
THREADS <- 10
GENOME_REF_FILE <- "/home/vbarbo/master_table_example/data/chr21_22.fa"
library(lrRNAseqBenchmark)
library(GenomicAlignments)
library(dplyr)
library(sarlacc)
library(snakecase)
library(ggplot2)
library(readr)
Get variant positions and which method could call them.
dat <- initiate_master_table(
METHOD_VCF_FILES[1],
METHOD_VCF_FILES[2],
TRUTH_VCF_FILE,
method_names=c(METHOD_NAMES, TRUTH_NAME)
)
dat <- add_info_tag_from_vcf(dat, "qd_truth", "QD", TRUTH_VCF_FILE)
dat <- add_qual_from_vcf(dat, "dv", METHOD_VCF_FILES[1])
dat <- add_qual_from_vcf(dat, "sncrFcDv", METHOD_VCF_FILES[2])
method_bam <- readGAlignments(METHOD_BAM_FILE)
splice_sites <- get_splice_sites_info(method_bam, THREADS)
dat <- add_splice_site_info_to_master_table(
dat,
splice_sites,
MAX_DIST_FROM_SPLICE_SITE
)
Add iso-seq read coverage.
method_coverage <- coverage(METHOD_BAM_FILE)
dat <- add_read_coverage_from_bam_to_master_table(
dat,
method_coverage,
METHOD_DATASET_NAME
)
Add short-read coverage (ground truth).
### this takes a long time to run
truth_coverage <- coverage(TRUTH_BAM_FILE)
dat <- add_read_coverage_from_bam_to_master_table(
dat,
truth_coverage,
TRUTH_DATASET_NAME
)
dat <- add_number_of_n_cigar_reads_to_master_table(
dat,
method_bam,
METHOD_DATASET_NAME
)
### method 1
dat <- add_method_vs_truth_comparison_to_master_table(
dat,
METHOD_NAMES[1],
TRUTH_NAME
)
### method 2
dat <- add_method_vs_truth_comparison_to_master_table(
dat,
METHOD_NAMES[2],
TRUTH_NAME
)
### ground truth
dat <- add_variant_type_to_master_table(dat, TRUTH_VCF_FILE, TRUTH_NAME)
### methods
dat <- add_variant_type_to_master_table(dat, METHOD_VCF_FILES[1],
METHOD_NAMES[1])
dat <- add_variant_type_to_master_table(dat, METHOD_VCF_FILES[2],
METHOD_NAMES[2])
- 1 = is indel
- 0 = is not indel
- -1 = non defined (heterozygous alternative) (truth’s gt has priority over the method’s)
- NA = variant not calls by the method or the truth
### truth
dat <- is_indel_method(dat, TRUTH_NAME)
### dv
dat <- is_indel_method(dat, TRUTH_NAME, "dv")
### dv_s
dat <- is_indel_method(dat, TRUTH_NAME, "sncr_fc_dv")
### TRUTH_NAME
dat <- add_variant_density_of_a_method(dat, 201, TRUTH_NAME)
### all methods
dat <- add_variant_density_of_a_method(dat, 201, METHOD_NAMES)
dat <- add_format_tag_from_vcf(dat, TRUTH_NAME, "GT", TRUTH_VCF_FILE)
dat <- add_format_tag_from_vcf(dat, "dv", "GT", METHOD_VCF_FILES[1])
dat <- add_format_tag_from_vcf(dat, "sncr_fc_dv", "GT", METHOD_VCF_FILES[2])
sel_cols <- paste0( "gt_", c(TRUTH_NAME, METHOD_NAMES) )
k <- is.na( dat[,sel_cols] )
dat[,sel_cols] [k] <- "0/0"
# get all homopolymers of the reference genome
genome_ref <- readDNAStringSet(GENOME_REF_FILE)
names(genome_ref) <- sub("(^chr[0-9]+|X|Y).*", "\\1", names(genome_ref))
homopolymers <- homopolymerFinder(genome_ref)
dat <- add_homopolymer_length_when_indels(dat, homopolymers)
head(dat)
Sites to keep:
* short-read variant density: x <= 3 (ignore positions)
* ignore intronic regions (isoSeq_coverage > 0) (ignore
positions)
* DeepVariant QUAL: x >= 15 (remove method calling)
* short-read coverare : 20 reads <= x <= 95% quantil (ignore
positions)
# no filtering
dat1 <- dat
dim(dat1)
shortread_cover_quantiles <- quantile(dat1$shortRead_chr21_22_coverage, probs=.95)
# if a region contains high variant density, probably this is because of a bad alignmnet.
# we filter out variants in 201bp regions with more than 3 variants in the ground truth
k <- dat1$variantDensity_truth <= 3
dat1 <- dat1[k,]
dim(dat1)
# ignore intronic regions (iso-seq coverage equal to zero)
dat1 <- filter(dat1, isoSeq_chr21_22_coverage>0)
dim(dat1)
# ignore regions based on short-read coverage quantiles: keep 20 reads <= x <= 95% percentile
dat1 <- filter(dat1, shortRead_chr21_22_coverage >= 20 &
shortRead_chr21_22_coverage <= shortread_cover_quantiles)
dim(dat1)
# Filter out DeepVariant calls by QUAL -- but keep the position
# dv
k <- which( dat1$qual_dv < 15 )
dat1$in_dv[k] <- 0
dat1 <- add_method_vs_truth_comparison_to_master_table(
dat1,
METHOD_NAMES[1],
TRUTH_NAME,
replace_column=TRUE
)
# sncr_fc_dv
k <- which( dat1$qual_sncrFcDv < 15 )
dat1$in_sncr_fc_dv[k] <- 0
dat1 <- add_method_vs_truth_comparison_to_master_table(
dat1,
METHOD_NAMES[2],
TRUTH_NAME,
replace_column=TRUE
)
In this section we use functions available in package
lrRNAseqBenchmark
to validate the used methods/pipelines.
Calculate precison measures according to different minimum Iso-Seq read coverage threshold.
# define different minimum coverage threshold
min_coverage <- c(3, 15, 40, 100)
output_method_names <- c("DeepVariant", "SNCR+fC+DeepVariant")
experiment_names <- "Experiment 1"
### calculate precision
dat_plot <- calculate_precision_recall_for_multi_master_tables(
dat1,
experiment_names = experiment_names,
method_names = METHOD_NAMES,
output_method_names = output_method_names,
data_names = METHOD_DATASET_NAME,
truth_names = TRUTH_NAME,
coverage_thresholds = min_coverage,
what = "snps_indels"
)
### plot the results
ggplot(dat_plot, aes(recall, precision, colour=method)) +
facet_grid(experiment~variant) +
geom_point(aes(size=`coverage >= n`), alpha=.5) +
geom_path(size=1.2, alpha=.5) +
theme(legend.position="bottom", legend.box="vertical") +
coord_fixed(ratio=1, xlim=c(0.24, 0.98), ylim=c(0.69, 0.96)) +
xlab("Recall") +
ylab("Precision") +
labs(colour="Method", size="Minimum Iso-Seq read coverage") +
scale_x_continuous(breaks=seq(.2, 1, .1), labels=seq(.2, 1, .1)) +
guides(colour = guide_legend(order=1)) +
NULL
Here, we consider a variant near to a splice junction when it’s not further than 20 bases.
variant_type <- "snp"
min_coverage <- 20
sj_proximity_snps <- splice_junction_analysis_table(dat1,
experiment_names = experiment_names,
truth_names = TRUTH_NAME,
method_dataset_name = METHOD_DATASET_NAME,
method_names = METHOD_NAMES,
output_method_names = output_method_names,
variant_type = variant_type,
min_isoseq_coverage = min_coverage)
ggplot( sj_proximity_snps$acc_sj, aes(x=is_near, y=Score, group=Measures, colour=Measures) ) +
facet_grid(experiment~Method) +
theme(strip.text = element_text(size = 14)) +
geom_point(size=4, alpha=.5) +
geom_line(size=1.2, alpha=.5) +
xlab("SNP candidates are near a splice junction") +
theme(text = element_text(size=18)) +
geom_text( data=sj_proximity_snps$n_text, mapping= aes(x=x, y=y, label=label), size=5 ) +
scale_y_continuous(breaks=seq(0, 1, .25)) +
guides(colour = guide_legend(override.aes = list(size = 1, shape = 11))) +
theme(legend.position="bottom") +
NULL
### for higher variant quality, we also filter out variants in high-variant-density regions according the calls of the methods to be compared.
dat2 <- filter(dat1, variantDensity_dv_sncrFcDv <= 3)
dim(dat2)
### and filter out variants near splice junctions
dat2 <- filter(dat2, is_near_ss==0)
dim(dat2)
### for each pipeline, make a table that stores indels and the homopolymer length where they are inside.
### if a indel is outside homopolymers, the returned homopolymer length is equal to 1.
### keep only variants that are:
### * in Iso-Seq coverage >= 20 reads;
### * classified as heterozygous alternative;
### * indels.
# indels called by DeepVaraint (alone)
dat_hom_dv <- method_homopolymer_indels(input_table = dat2,
first_method_name = TRUTH_NAME,
second_method_name = METHOD_NAMES[1],
vcf_first = TRUTH_VCF_FILE,
vcf_second = METHOD_VCF_FILES[1],
method_dataset_name = METHOD_DATASET_NAME,
homopolymers = homopolymers,
ref_fasta_seqs = genome_ref,
min_isoseq_coverage = 20,
genotyped_alt = "find")
# indels called by SNCR+fC+DeepVaraint
dat_hom_sncrFcDv <- method_homopolymer_indels(input_table = dat2,
first_method_name = TRUTH_NAME,
second_method_name = METHOD_NAMES[2],
vcf_first = TRUTH_VCF_FILE,
vcf_second = METHOD_VCF_FILES[2],
method_dataset_name = METHOD_DATASET_NAME,
homopolymers = homopolymers,
ref_fasta_seqs = genome_ref,
min_isoseq_coverage = 20,
genotyped_alt = "find")
### get the data to make the plot
hom_length_intervals <- c(1, 2, 8)
interval_names <- c("non-hp", "2-5", ">=8")
dat_homopolymer <-list(class_counts=NULL, dat_text=NULL)
# DeepVaraint (alone), deletions
k <- make_homopolymer_table_to_plot(
input_hom_table = dat_hom_dv,
variant_type = "deletion",
method_name = METHOD_NAMES[1],
truth_name = TRUTH_NAME,
hom_length_intervals = hom_length_intervals,
interval_names = interval_names,
to_calculate = "pre_rec_f1",
output_method_name = output_method_names[1]
)
dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE)
# DeepVaraint (alone), insertions
k <- make_homopolymer_table_to_plot(
input_hom_table = dat_hom_dv,
variant_type = "insertion",
method_name = METHOD_NAMES[1],
truth_name = TRUTH_NAME,
hom_length_intervals = hom_length_intervals,
interval_names = interval_names,
to_calculate = "pre_rec_f1",
output_method_name = output_method_names[1]
)
dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE)
# DeepVaraint (alone), deletions
k <- make_homopolymer_table_to_plot(
input_hom_table = dat_hom_sncrFcDv,
variant_type = "deletion",
method_name = METHOD_NAMES[2],
truth_name = TRUTH_NAME,
hom_length_intervals = hom_length_intervals,
interval_names = interval_names,
to_calculate = "pre_rec_f1",
output_method_name = output_method_names[2]
)
dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE)
# DeepVaraint (alone), insertions
k <- make_homopolymer_table_to_plot(
input_hom_table = dat_hom_sncrFcDv,
variant_type = "insertion",
method_name = METHOD_NAMES[2],
truth_name = TRUTH_NAME,
hom_length_intervals = hom_length_intervals,
interval_names = interval_names,
to_calculate = "pre_rec_f1",
output_method_name = output_method_names[2]
)
dat_homopolymer <- mapply(rbind, dat_homopolymer, k, SIMPLIFY=FALSE)
ggplot(dat_homopolymer$class_counts,
aes(x=homopolymer_length_intervals, y=percent,
group=Measures, colour=Measures)) +
facet_grid(variant_type~method) +
theme(strip.text = element_text(size = 14)) +
geom_point(size=4, alpha=.5) +
geom_line(size=1.2, alpha=.5) +
# xlab("Variant within a homopolymer of length l") +
# xlab("Homopolymer length") +
# ylab("Score") +
theme(text = element_text(size=20)) +
geom_text( data=dat_homopolymer$dat_text, mapping= aes(x=x, y=y, label=label), size=5 ) +
guides(colour = guide_legend(override.aes = list(size = 1, shape = 11))) +
theme(legend.position="bottom") +
NULL
Compare the performance of the methods on variant calling from Allele Specific Expressed (ASE) and non-ASE genes
(at the moment, there is not a ready-to-use function for this analysis)
To identify ASE genes, first we use GATK’s ASEReadCounter function to get a table with read counts per allele. For the analysis presented in this README, this table can be downloaded here.
Once got the table of counts, the chi-squared test is used to identify ASE sites.
### inputs
ALLELE_COUNTS_FILE <- "/home/vbarbo/master_table_example/ase_table_count/ase_read_count_tables/ase_read_count_all.table"
### load allele counts
allele_counts <- read_tsv(ALLELE_COUNTS_FILE)
### add column of p-values for ase identification
ase_chiSquare <- mapply(function(ref, alt){
suppressWarnings(
tryCatch(
chisq.test( c(ref, alt) ) $p.value,
error=function(e){NA}
)
)
}, allele_counts$refCount, allele_counts$altCount)
ase_chiSquare_adj <- p.adjust(ase_chiSquare, method="BH")
allele_counts <- mutate(allele_counts, ase_chiSquare_adj)
### add ase q-value to the master table
allele_counts <- rename(allele_counts, "chrm"="contig", "pos"="position")
allele_p <- subset(allele_counts,
select=c("chrm", "pos", "refCount", "altCount",
"totalCount", "otherBases", "ase_chiSquare_adj"))
dat3 <- merge(dat1, allele_p, by=c("chrm", "pos"), all.x=TRUE, sort=FALSE)
### keep only variants that are SNPs and heterozygous (according to the
### ground truth), and have a p-value for ASE identification associated to
### them
dat3 <- filter(dat3, variantType_truth=="snp" & gt_truth=="0/1")
dat3 <- filter(dat3, !is.na(ase_chiSquare_adj))
### keep only variants with reasonable read coverage in the ground truth and
### in the rna short read data
dat3 <- filter(dat3, isoSeq_chr21_22_coverage>=10)
dat3 <- filter(dat3, totalCount>=40)
### chi-square test to test whether the proportions of FNs between ASE and
### non-ASE variants are different
# for DeepVariant (alone)
k <- table(dat3$dv_classification, dat3$ase_chiSquare_adj<.05)
k <- k[rownames(k) %in% c("FN", "TP"),]
chisq.test(k)$p.value
# for SNCR+fC+DeepVariant
k <- table(dat3$sncr_fc_dv_classification, dat3$ase_chiSquare_adj<.05)
k <- k[rownames(k) %in% c("FN", "TP"),]
chisq.test(k)$p.value
Make a plot to compare the performance of the methods from ASE- and non-ASE sites.
### create the tables from which ggplot2 makes the plots
calcFreq <- function(d){
d$Ratio <- 0
d$Ratio[1] <- d$Freq[1] / sum(d$Freq[1:2])
d$Ratio[2] <- d$Freq[2] / sum(d$Freq[1:2])
d$Ratio[3] <- d$Freq[3] / sum(d$Freq[3:4])
d$Ratio[4] <- d$Freq[4] / sum(d$Freq[3:4])
d
}
# DeepVariant (alone)
dat_ase <- table(dat3$dv_classification, dat3$ase_chiSquare_adj<.05)
dat_ase <- dat_ase[c("FN", "TP"),]
dat_ase <- as.data.frame(dat_ase)
names(dat_ase) <- c("Classification", "ASE_Identification", "Frequency")
dat_ase$ASE_Identification <- ifelse( as.logical(dat_ase$ASE_Identification),
"ASE", "Non-ASE" ) %>% factor
dat_ase_dv <- calcFreq(dat_ase)
# SNCR+flagCorrecion+DeepVariant
dat_ase <- table(dat3$sncr_fc_dv_classification, dat3$ase_chiSquare_adj<.05)
dat_ase <- dat_ase[c("FN", "TP"),]
dat_ase <- as.data.frame(dat_ase)
names(dat_ase) <- c("Classification", "ASE_Identification", "Frequency")
dat_ase$ASE_Identification <- ifelse( as.logical(dat_ase$ASE_Identification),
"ASE", "Non-ASE") %>% factor
dat_ase_dvSFc <- calcFreq(dat_ase)
### put all tables together in a single table
dat_ase_dv$method <- "DeepVariant"
dat_ase_dvSFc$method <- "SNCR+fC+DeepVariant"
dat_ase <- rbind(dat_ase_dv, dat_ase_dvSFc)
k <- c("DeepVariant", "SNCR+fC+DeepVariant")
dat_ase$method <- factor(dat_ase$method, levels=k, ordered=TRUE)
### draw the plot
ggplot(dat_ase, aes(x=ASE_Identification, y=Ratio, fill=Classification)) +
facet_grid(.~method) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=round(Ratio,2)), vjust=-0.1, color="gray40",
position = position_dodge(0.9), size=5)+
theme(text = element_text(size = 20)) +
ylim(0,1) +
xlab("Allele specific expression identification") +
NULL