From 994436e252eed6543b51450ae1a0911a6d6f2bdf Mon Sep 17 00:00:00 2001 From: capoony Date: Sat, 15 Oct 2022 15:47:38 +0200 Subject: [PATCH] update --- AutDeNovo.sh | 35 ++++++++- FullPipeline/racon.sh | 166 ++++++++++++++++++++++++++++++++++++++++++ README.md | 20 ++++- 3 files changed, 218 insertions(+), 3 deletions(-) create mode 100644 FullPipeline/racon.sh diff --git a/AutDeNovo.sh b/AutDeNovo.sh index 197b9c7..fe1e8f9 100755 --- a/AutDeNovo.sh +++ b/AutDeNovo.sh @@ -50,6 +50,9 @@ do Trimmer=*) Trimmer="${i#*=}" ;; + Racon=*) + racon="${i#*=}" + ;; *) # unknown option ;; @@ -107,6 +110,7 @@ if [ -z "$threads" ]; then threads="10"; fi if [ -z "$RAM" ]; then RAM="20"; fi if [ -z "$RAMAssembly" ]; then RAMAssembly="20"; fi if [ -z "$SmudgePlot" ]; then SmudgePlot="no"; fi +if [ -z "$racon" ]; then racon="no"; fi ## Test which data are available if [[ !(-z "$fwd") && -z "$ont" && -z "$pb" ]]; then data="ILL"; @@ -352,8 +356,37 @@ $RAMAssembly \ printf "########################\n\n" \ | tee -a ${out}/shell/pipeline.sh + +if [[ $racon != "no" ]] +then + + ############################################### + ########### (5) Polishing with Racon ############### + + ## polish the assembled contigs with Racon (runs FullPipeline/racon.sh) + + printf "## Starting polishing with Racon\n# " \ + | tee -a ${out}/shell/pipeline.sh + date \ + | tee -a ${out}/shell/pipeline.sh + + sh FullPipeline/racon.sh \ + $out \ + $name \ + $data \ + $PWD \ + $threads \ + $RAMAssembly \ + $racon \ + $decont \ + | tee -a ${out}/shell/pipeline.sh + printf "########################\n\n" \ + | tee -a ${out}/shell/pipeline.sh + +fi + ############################################### -########### (5) Assembly QC ############### +########### (6) Assembly QC ############### ## (A) QUAST analysis diff --git a/FullPipeline/racon.sh b/FullPipeline/racon.sh new file mode 100644 index 0000000..ade9933 --- /dev/null +++ b/FullPipeline/racon.sh @@ -0,0 +1,166 @@ +## Polish contigs with Racon; usage: racon.sh <out> <name> <data> <pwd> <threads> <RAM> <racon> <decont> + +out=$1 +name=$2 +data=$3 +pwd=$4 +threads=$5 +RAM=$6 
+racon=$7 +decont=$8 + +printf "sh FullPipeline/racon.sh $1 $2 $3 $4 $5 $6 $7 $8\n# " + +if [[ $data == *'ILL'* ]] +then + if [[ $decont == 'no' ]] + then + IllInp1=${name}_1_val_1 + IllInp2=${name}_2_val_2 + else + IllInp1=kraken_illumina_${name}_1 + IllInp2=kraken_illumina_${name}_2 + fi +fi + +if [[ $data == *'ONT'* ]] +then + if [[ $decont == 'no' ]] + then + OntInp=${name}_ont + else + OntInp=raken_ont_${name} # NOTE(review): "raken" looks like a typo for "kraken" (cf. kraken_illumina_ above) - confirm against the decontaminated ONT file name + fi +fi + +if [[ $data == *'PB'* ]] +then + if [[ $decont == 'no' ]] + then + PbInp=${name}_pb + else + PbInp=raken_${name}_pb # NOTE(review): same "raken" vs "kraken" question as for ONT - confirm against the decontaminated PacBio file name + fi +fi + +############################# + +mkdir -p ${out}/results/Racon + + + +echo """ + + #!/bin/sh + + ## name of Job + #PBS -N Racon_${name} + + ## Redirect output stream to this file. + #PBS -o ${out}/log/Racon_${name}_log.txt + + ## Stream Standard Output AND Standard Error to outputfile (see above) + #PBS -j oe + + ## Select ${threads} cores and ${RAM}gb of RAM + #PBS -l select=1:ncpus=${threads}:mem=${RAM}g + + ######## load dependencies ####### + + source /opt/anaconda3/etc/profile.d/conda.sh + module load NGSmapper/minimap2-2.17 + conda activate racon_1.5.0 + conda activate medaka-1.4.4 + + ######## run analyses ####### + + ## Go to pwd + cd ${pwd} + + ## concatenate Illumina data (if needed) + + if [[ $data == 'ILL' ]] + then + gunzip -c ${out}/data/Illumina/${IllInp1}.fq.gz \ + | sed 's/ 1:.*/\/1/g' \ + | gzip > ${out}/results/Racon/Ill.fq.gz + + gunzip -c ${out}/data/Illumina/${IllInp2}.fq.gz \ + | sed 's/ 2:.*/\/2/g' \ + | gzip >> ${out}/results/Racon/Ill.fq.gz + fi + + ## make copy of unpolished contigs + + cp ${out}/output/${name}_${data}.fa ${out}/output/${name}_${data}_unpolished.fa + pigz ${out}/output/${name}_${data}_unpolished.fa + + ## do Racon polishing + for (( i=1; i<=${racon}; i++ )) + + do + + if [[ $data == 'ILL' ]] + then + + minimap2 \ + -x sr \ + -t ${threads} \ + ${out}/output/${name}_${data}.fa \ + ${out}/results/Racon/Ill.fq.gz \ + > ${out}/results/Racon/temp_reads_to_draft.paf + + racon \ 
+ -t ${threads} \ + ${out}/results/Racon/Ill.fq.gz \ + ${out}/results/Racon/temp_reads_to_draft.paf \ + ${out}/output/${name}_${data}.fa \ + > ${out}/results/Racon/temp_draft_new.fa + + elif [[ $data == *'ONT'* ]] + then + + minimap2 \ + -x map-ont \ + -t ${threads} \ + ${out}/output/${name}_${data}.fa \ + ${out}/data/ONT/${OntInp}.fq.gz \ + > ${out}/results/Racon/temp_reads_to_draft.paf + + racon \ + -t ${threads} \ + ${out}/data/ONT/${OntInp}.fq.gz \ + ${out}/results/Racon/temp_reads_to_draft.paf \ + ${out}/output/${name}_${data}.fa \ + > ${out}/results/Racon/temp_draft_new.fa + + else + + minimap2 \ + -x map-pb \ + -t ${threads} \ + ${out}/output/${name}_${data}.fa \ + ${out}/data/PB/${PbInp}.fq.gz \ + > ${out}/results/Racon/temp_reads_to_draft.paf + + racon \ + -t ${threads} \ + ${out}/data/PB/${PbInp}.fq.gz \ + ${out}/results/Racon/temp_reads_to_draft.paf \ + ${out}/output/${name}_${data}.fa \ + > ${out}/results/Racon/temp_draft_new.fa + + fi + + mv ${out}/results/Racon/temp_draft_new.fa ${out}/output/${name}_${data}.fa + + done + + if [[ $data == 'ILL' ]] + then + rm -f ${out}/results/Racon/Ill.fq.gz + fi + +""" > ${out}/shell/qsub_racon_${name}.sh + +qsub -W block=true ${out}/shell/qsub_racon_${name}.sh diff --git a/README.md b/README.md index 90d616c..e4608b0 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,10 @@ The purpose of this repository is to provide a simple yet state-of-the-art de-no - **Trimmer**: String (default: "Trimgalore") Choose any of the four options: Atria, FastP, Trimgalore or UrQt. Note, the program will quit if the name is wrongly written. By default, TimGalore is used. +**(8) Polishing** you can optionally choose to polish the raw contigs with Racon: + +- **Racon**: Number (default: "no") The number chosen defines the number of polishing iterations. For example, if you choose 3, Racon polishing will be repeated three times. If multiple datatypes are provided, the following types will preferably be used for polishing: ONT > PB > ILL. 
+ ## Command The pipeline is a simple shell script that executes a series of sub-shell scripts that serially send jobs to OpenPBS. A typcial commandline looks like this: @@ -80,7 +84,8 @@ cd AutDeNovo decont=no \ SmudgePlot=no \ BuscoDB=vertebrata_odb10 \ - Trimmer=Atria + Trimmer=TrimGalore \ + Racon=4 ``` ## Pipeline @@ -103,7 +108,11 @@ After that, the pipeline uses [Jellyfish](https://github.com/gmarcais/Jellyfish) In case Illumina high-quality sequencing data are available, the de-novo assembly is based on [SPAdes](https://github.com/ablab/spades) with standard parameters using trimmed and decontaminated reads. In case a combination of Illumina high-quality sequencing data and long-read data (ONT and/or PacBio) is available, the pipleine will nevertheless employ SPAdes for de-novo assembly based on Illumina reads. The long-read sequencing data will in this case be used for scaffolding. See [here](https://cab.spbu.ru/files/release3.15.4/manual.html) for more details on how SPAdes works. Conversely, the pipeline employs the [Flye](https://github.com/fenderglass/Flye) assembler with standard parameters for either ONT or PacBio long-read sequencing data alone or for a combination of both. If ONT and PacBio reads are processed jointly, the pipleine follows the [best practice recommendations](https://github.com/fenderglass/Flye/blob/flye/docs/FAQ.md#can-i-use-both-pacbio-and-ont-reads-for-assembly) and uses the ONT data for the initial assembly and the PacBio data for polishing. -### (5) Assembly QC +### (5) Contig polishing with [Racon](https://github.com/lbcb-sci/racon) + +As an optional final step, the raw contigs can be polished based on the raw reads using [Racon](https://github.com/lbcb-sci/racon). This step can be iterated multiple times by setting the parameter `Racon=<n>`, where `<n>` is the number of iterations. For example, if you choose 3, Racon polishing will be repeated three times. 
If multiple datatypes are provided for assembly, only one read type will be used for polishing in the following preference order: ONT > PB > ILL. + +### (6) Assembly QC After the assembly is finished, the quality of the assembled genome will be assessed with a combination of different analysis tools as described below: @@ -144,6 +153,13 @@ Moreover, the full pipeline including all commands will be written to a file nam ## ChangeLog +### v.2.2 (15/10/2022) + +Minor update with several improvements + +- [x] Optionally perform polishing of raw contigs with Racon +- [x] Bug fixes + ### v.2.1 (10/08/2022) Minor update with several improvements