Amplicon Sequence Variant (ASV) processing pipeline
Tools: MultiQC DADA2 phyloseq
library(dada2) library(ggplot2) library(phyloseq)
path <- 'Path/to/files' list.files(path) # verify all of the files of interest are present.
F_reads <- sort(list.files(path, pattern='_R1_001.fastq')) R_reads <- sort(list.files(path, pattern='_R2_001.fastq'))
sample.names <- sapply(strsplit(F_reads, '_'), [
, 1)
F_reads <- file.path(path, F_reads) R_reads <- file.path(path, R_reads)
filtered_path <- file.path(path, 'filtered')
F_filtered <- file.path(filtered_path, paste0(sample.names, '_F_filtered.fastq.gz')) R_filtered <- file.path(filtered_path, paste0(sample.names, '_R_filtered.fastq.gz'))
output <- filterAndTrim(F_reads, F_filtered, R_reads, R_filtered, truncLen=c(290,290), # truncLen should be set based on fastQC results maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=TRUE) head(output)
errorRate_F <- learnErrors(F_filtered, multithread=TRUE) errorRate_R <- learnErrors(R_filtered, multithread=TRUE)
plotErrors(errorRate_F, nominalQ=TRUE) plotErrors(errorRate_R, nominalQ=TRUE)
derep_F <- derepFastq(F_filtered, verbose=TRUE) derep_R <- derepFastq(R_filtered, verbose=TRUE) names(derep_F) <- sample.names names(derep_R) <- sample.names
F_inference <- dada(derep_F, err=errorRate_F, multithread=TRUE) R_inference <- dada(derep_R, err=errorRate_R, multithread=TRUE)
merged <- mergePairs(F_inference, derep_F, R_inference, derep_F, verbose=TRUE)
seq_table <- makeSequenceTable(merged) table(nchar(getSequences(seq_table))) # Provides count of sequence variants of various lengths