From f19e20505ff76a8c836896eae99af0a294533b74 Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Sun, 4 Oct 2015 15:52:18 -0500 Subject: [PATCH] v0.1.2 support just forward MiSeq reads --- ...1-1_TCCGGAGA-CCTATCCT_L001_R1_001.fastq.gz | Bin 13950 -> 13950 bytes ...1-1_TCCGGAGA-CCTATCCT_L001_R2_001.fastq.gz | Bin 16203 -> 16203 bytes ...1-2_TCCGGAGA-GGCTCTGA_L001_R1_001.fastq.gz | Bin 14208 -> 14208 bytes ...1-2_TCCGGAGA-GGCTCTGA_L001_R2_001.fastq.gz | Bin 15887 -> 15887 bytes ...ike_CGCTCATT-GGCTCTGA_L001_R1_001.fastq.gz | Bin 58331 -> 58331 bytes ...ike_CGCTCATT-GGCTCTGA_L001_R2_001.fastq.gz | Bin 66344 -> 66344 bytes ufits-process_illumina_folder.py | 88 ++++++++++-------- ufits.py | 7 +- 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/test_data/illumina_test_data/301-1_TCCGGAGA-CCTATCCT_L001_R1_001.fastq.gz b/test_data/illumina_test_data/301-1_TCCGGAGA-CCTATCCT_L001_R1_001.fastq.gz index 5abf84dfadc1dc16e54e7099293daf775dd73838..a945c2815d741d8da2de4a875041be37e2072602 100644 GIT binary patch delta 16 XcmeyD^Dl>8zMF$1WP;#E_Hr`-J3$5T delta 16 XcmeyD^Dl>8zMF&NUo7uN_Hr`-KA{F< diff --git a/test_data/illumina_test_data/301-1_TCCGGAGA-CCTATCCT_L001_R2_001.fastq.gz b/test_data/illumina_test_data/301-1_TCCGGAGA-CCTATCCT_L001_R2_001.fastq.gz index 85c3a41b390d897d746f7c83196cadd89f424285..e843bcaa9453b3d108d6f0c7041e764860934506 100644 GIT binary patch delta 16 XcmX?Ice;*UzMF$1WP;#Eb{Bg9HsS?@ delta 16 XcmX?Ice;*UzMF&NUo7uNb{Bg9Izk2a diff --git a/test_data/illumina_test_data/301-2_TCCGGAGA-GGCTCTGA_L001_R1_001.fastq.gz b/test_data/illumina_test_data/301-2_TCCGGAGA-GGCTCTGA_L001_R1_001.fastq.gz index 51c934396e43d585e6aae252d726d69e0defbad3..69fe007d23b707e4e24658c6560456eb9bffaee8 100644 GIT binary patch delta 16 XcmZq3Z^&nt@8;kLnIO24z0w>2EY<}5 delta 16 XcmZq3Z^&nt@8;mJkK^6QUTF>hD+2_w diff --git a/test_data/illumina_test_data/301-2_TCCGGAGA-GGCTCTGA_L001_R2_001.fastq.gz b/test_data/illumina_test_data/301-2_TCCGGAGA-GGCTCTGA_L001_R2_001.fastq.gz index 019fb5ecb86e32096b56f27a10d4d6b0f0c2d020..9907e105767a9baff1082c49fc263f1b4fbb6809 100644 GIT binary patch delta 16 XcmeCL>91jz@8;kLnIO24ox=_QFDV4e delta 16 XcmeCL>91jz@8;mJkK^6Q&S3`tEmj18 diff --git a/test_data/illumina_test_data/spike_CGCTCATT-GGCTCTGA_L001_R1_001.fastq.gz b/test_data/illumina_test_data/spike_CGCTCATT-GGCTCTGA_L001_R1_001.fastq.gz index 59c56c3e500ac086a3de23040c378a84faae7f3d..bb944c174f49459043863e6086db0a31ced3362e 100644 GIT binary patch delta 18 acmcb8ocZ=~W_I~*4vvrsf*aYdJO%(q!3S^v delta 18 acmcb8ocZ=~W_I~*4i2UY-i_>69s>YG-v7-JEI~q07J_K761SM diff --git a/ufits-process_illumina_folder.py b/ufits-process_illumina_folder.py index 248b727..0895762 100755 --- a/ufits-process_illumina_folder.py +++ b/ufits-process_illumina_folder.py @@ -22,8 +22,9 @@ class col: epilog="""Written by Jon Palmer (2015) nextgenusfs@gmail.com""", formatter_class=MyFormatter) -parser.add_argument('-i','--input', dest='input', required=True, help='Folder of Illumina PE Data') +parser.add_argument('-i','--input', dest='input', required=True, help='Folder of Illumina Data') parser.add_argument('-o','--out', dest="out", default='ufits-data', help='Name for output folder') +parser.add_argument('--reads', dest="reads", default='paired', choices=['paired', 'forward'], help='PE or forward reads') parser.add_argument('-f','--fwd_primer', dest="F_primer", default='GTGARTCATCGAATCTTTG', help='Forward Primer (fITS7)') parser.add_argument('-r','--rev_primer', dest="R_primer", default='TCCTCCGCTTATTGATATGC', help='Reverse Primer (ITS4)') parser.add_argument('-n','--name_prefix', dest="prefix", default='R_', help='Prefix for renaming reads') @@ -106,7 +107,8 @@ def setupLogging(LOGNAME): ReadFile = InFile.read() OutFile = open(OutName, 'w') OutFile.write(ReadFile) - OutFile.close() + OutFile.close() + os.remove(os.path.join(args.input, file)) #remove .gz file #now get the FASTQ files and proceed filenames = [] @@ -126,7 +128,7 @@ def setupLogging(LOGNAME): uniq_names = [] fastq_for = [] fastq_rev = [] -map = os.path.join(args.out, 'ufits-filenames.txt') +map = 'ufits-filenames.txt' map_file = open(map, 'w') map_file.write("Name\t[i5]\t[i7]\tLane\tSet_num\n") for item in sorted(filenames): @@ -151,50 +153,54 @@ def setupLogging(LOGNAME): log.debug("Non-standard names detected, skipping mapping file") map_file.close() + #loop through each set for i in range(len(fastq_for)): name = fastq_for[i].split("_")[0] for_reads = os.path.join(args.input, fastq_for[i]) rev_reads = os.path.join(args.input, fastq_rev[i]) log.info("Working on reads from sample %s" % name) - #get read length - fp = open(for_reads) - for i, line in enumerate(fp): - if i == 1: - read_length = len(line) - read_length = myround(read_length) - elif i > 2: - break - fp.close() - - #now trim the last bp off of the Illumina data (there for phasing, i.e. 250 bp reads are 251 bp) - pretrim_R1 = os.path.join(args.out, 'pretrim_R1.fq') - pretrim_R2 = os.path.join(args.out, 'pretrim_R2.fq') - log.info("Merging Overlaping Pairs using USEARCH8") - log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, for_reads, str(read_length), pretrim_R1)) - log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, rev_reads, str(read_length), pretrim_R2)) - subprocess.call([usearch, '-fastq_filter', for_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R1], stdout = FNULL, stderr = FNULL) - subprocess.call([usearch, '-fastq_filter', rev_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R2], stdout = FNULL, stderr = FNULL) - - #next run USEARCH8 mergepe - merge_out = os.path.join(args.out, 'merged.fq') - skip_for = os.path.join(args.out, 'notmerged.R1.fq') - log.debug("%s -fastq_mergepairs %s -reverse %s -fastqout %s -fastqout_notmerged_fwd %s -fastq_truncqual 5 -fastq_allowmergestagger -minhsp 12" % (usearch, pretrim_R1, pretrim_R2, merge_out, skip_for)) - subprocess.call([usearch, '-fastq_mergepairs', for_reads, '-reverse', rev_reads, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-fastq_truncqual', '5','-fastq_allowmergestagger','-minhsp', '12'], stdout = FNULL, stderr = FNULL) - - #now concatenate files for downstream pre-process_illumina.py script - outname = name + '.fq' - final_out = os.path.join(args.out, outname) - cat_file = open(final_out, 'wb') - shutil.copyfileobj(open(merge_out,'rU'), cat_file) - shutil.copyfileobj(open(skip_for,'rU'), cat_file) - cat_file.close() - - #clean and close up intermediate files - os.remove(merge_out) - os.remove(pretrim_R1) - os.remove(pretrim_R2) - os.remove(skip_for) + if args.reads == 'paired': + #get read length + fp = open(for_reads) + for i, line in enumerate(fp): + if i == 1: + read_length = len(line) + read_length = myround(read_length) + elif i > 2: + break + fp.close() + + #now trim the last bp off of the Illumina data (there for phasing, i.e. 250 bp reads are 251 bp) + pretrim_R1 = os.path.join(args.out, 'pretrim_R1.fq') + pretrim_R2 = os.path.join(args.out, 'pretrim_R2.fq') + log.info("Merging Overlaping Pairs using USEARCH8") + log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, for_reads, str(read_length), pretrim_R1)) + log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, rev_reads, str(read_length), pretrim_R2)) + subprocess.call([usearch, '-fastq_filter', for_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R1], stdout = FNULL, stderr = FNULL) + subprocess.call([usearch, '-fastq_filter', rev_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R2], stdout = FNULL, stderr = FNULL) + + #next run USEARCH8 mergepe + merge_out = os.path.join(args.out, 'merged.fq') + skip_for = os.path.join(args.out, 'notmerged.R1.fq') + log.debug("%s -fastq_mergepairs %s -reverse %s -fastqout %s -fastqout_notmerged_fwd %s -fastq_truncqual 5 -fastq_allowmergestagger -minhsp 12" % (usearch, pretrim_R1, pretrim_R2, merge_out, skip_for)) + subprocess.call([usearch, '-fastq_mergepairs', for_reads, '-reverse', rev_reads, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-fastq_truncqual', '5','-fastq_allowmergestagger','-minhsp', '12'], stdout = FNULL, stderr = FNULL) + + #now concatenate files for downstream pre-process_illumina.py script + outname = name + '.fq' + final_out = os.path.join(args.out, outname) + cat_file = open(final_out, 'wb') + shutil.copyfileobj(open(merge_out,'rU'), cat_file) + shutil.copyfileobj(open(skip_for,'rU'), cat_file) + cat_file.close() + + #clean and close up intermediate files + os.remove(merge_out) + os.remove(pretrim_R1) + os.remove(pretrim_R2) + os.remove(skip_for) + elif args.reads == 'forward': + final_out = for_reads log.info("Strip primers, trim/pad to %s bp" % args.trim_len) diff --git a/ufits.py b/ufits.py index e7c5401..4e53b04 100755 --- a/ufits.py +++ b/ufits.py @@ -5,14 +5,14 @@ import sys, os, subprocess, inspect script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -version = '0.1.1' +version = '0.1.2' default_help = """ Usage: ufits version: %s -Command: ion pre-process Ion Torrent data (find barcodes, trim/pad) - illumina pre-process folder of de-multiplexed Illumina data (gunzip, merge PE, trim/pad, concatenate) +Command: ion pre-process Ion Torrent data (find barcodes, remove primers, trim/pad) + illumina pre-process folder of de-multiplexed Illumina data (gunzip, merge PE, remove primers, trim/pad) cluster cluster OTUs (using UPARSE algorithm) filter OTU table filtering heatmap Create heatmap from OTU table @@ -56,6 +56,7 @@ Arguments: -i, --fastq Input FASTQ file (Required) -o, --out Output folder name. Default: ufits-data + --reads Paired-end or forward reads. Default: paired [paired, forward] -f, --fwd_primer Forward primer sequence. Default: GTGARTCATCGAATCTTTG (fITS7) -r, --rev_primer Reverse primer sequence Default: TCCTCCGCTTATTGATATGC (ITS4) -n, --name_prefix Prefix for re-naming reads. Default: R_