Skip to content

Commit

Permalink
v0.1.2: add support for forward-only MiSeq reads
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Oct 4, 2015
1 parent ab0baca commit f19e205
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 44 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
88 changes: 47 additions & 41 deletions ufits-process_illumina_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ class col:
epilog="""Written by Jon Palmer (2015) [email protected]""",
formatter_class=MyFormatter)

parser.add_argument('-i','--input', dest='input', required=True, help='Folder of Illumina PE Data')
parser.add_argument('-i','--input', dest='input', required=True, help='Folder of Illumina Data')
parser.add_argument('-o','--out', dest="out", default='ufits-data', help='Name for output folder')
parser.add_argument('--reads', dest="reads", default='paired', choices=['paired', 'forward'], help='PE or forward reads')
parser.add_argument('-f','--fwd_primer', dest="F_primer", default='GTGARTCATCGAATCTTTG', help='Forward Primer (fITS7)')
parser.add_argument('-r','--rev_primer', dest="R_primer", default='TCCTCCGCTTATTGATATGC', help='Reverse Primer (ITS4)')
parser.add_argument('-n','--name_prefix', dest="prefix", default='R_', help='Prefix for renaming reads')
Expand Down Expand Up @@ -106,7 +107,8 @@ def setupLogging(LOGNAME):
ReadFile = InFile.read()
OutFile = open(OutName, 'w')
OutFile.write(ReadFile)
OutFile.close()
OutFile.close()
os.remove(os.path.join(args.input, file)) #remove .gz file

#now get the FASTQ files and proceed
filenames = []
Expand All @@ -126,7 +128,7 @@ def setupLogging(LOGNAME):
uniq_names = []
fastq_for = []
fastq_rev = []
map = os.path.join(args.out, 'ufits-filenames.txt')
map = 'ufits-filenames.txt'
map_file = open(map, 'w')
map_file.write("Name\t[i5]\t[i7]\tLane\tSet_num\n")
for item in sorted(filenames):
Expand All @@ -151,50 +153,54 @@ def setupLogging(LOGNAME):
log.debug("Non-standard names detected, skipping mapping file")
map_file.close()


#loop through each set
for i in range(len(fastq_for)):
name = fastq_for[i].split("_")[0]
for_reads = os.path.join(args.input, fastq_for[i])
rev_reads = os.path.join(args.input, fastq_rev[i])
log.info("Working on reads from sample %s" % name)
#get read length
fp = open(for_reads)
for i, line in enumerate(fp):
if i == 1:
read_length = len(line)
read_length = myround(read_length)
elif i > 2:
break
fp.close()

#now trim the last bp off of the Illumina data (there for phasing, i.e. 250 bp reads are 251 bp)
pretrim_R1 = os.path.join(args.out, 'pretrim_R1.fq')
pretrim_R2 = os.path.join(args.out, 'pretrim_R2.fq')
log.info("Merging Overlaping Pairs using USEARCH8")
log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, for_reads, str(read_length), pretrim_R1))
log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, rev_reads, str(read_length), pretrim_R2))
subprocess.call([usearch, '-fastq_filter', for_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R1], stdout = FNULL, stderr = FNULL)
subprocess.call([usearch, '-fastq_filter', rev_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R2], stdout = FNULL, stderr = FNULL)

#next run USEARCH8 mergepe
merge_out = os.path.join(args.out, 'merged.fq')
skip_for = os.path.join(args.out, 'notmerged.R1.fq')
log.debug("%s -fastq_mergepairs %s -reverse %s -fastqout %s -fastqout_notmerged_fwd %s -fastq_truncqual 5 -fastq_allowmergestagger -minhsp 12" % (usearch, pretrim_R1, pretrim_R2, merge_out, skip_for))
subprocess.call([usearch, '-fastq_mergepairs', for_reads, '-reverse', rev_reads, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-fastq_truncqual', '5','-fastq_allowmergestagger','-minhsp', '12'], stdout = FNULL, stderr = FNULL)

#now concatenate files for downstream pre-process_illumina.py script
outname = name + '.fq'
final_out = os.path.join(args.out, outname)
cat_file = open(final_out, 'wb')
shutil.copyfileobj(open(merge_out,'rU'), cat_file)
shutil.copyfileobj(open(skip_for,'rU'), cat_file)
cat_file.close()

#clean and close up intermediate files
os.remove(merge_out)
os.remove(pretrim_R1)
os.remove(pretrim_R2)
os.remove(skip_for)
if args.reads == 'paired':
#get read length
fp = open(for_reads)
for i, line in enumerate(fp):
if i == 1:
read_length = len(line)
read_length = myround(read_length)
elif i > 2:
break
fp.close()

#now trim the last bp off of the Illumina data (there for phasing, i.e. 250 bp reads are 251 bp)
pretrim_R1 = os.path.join(args.out, 'pretrim_R1.fq')
pretrim_R2 = os.path.join(args.out, 'pretrim_R2.fq')
log.info("Merging Overlaping Pairs using USEARCH8")
log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, for_reads, str(read_length), pretrim_R1))
log.debug("%s -fastq_filter %s -fastq_trunclen %s -fastqout %s" % (usearch, rev_reads, str(read_length), pretrim_R2))
subprocess.call([usearch, '-fastq_filter', for_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R1], stdout = FNULL, stderr = FNULL)
subprocess.call([usearch, '-fastq_filter', rev_reads, '-fastq_trunclen', str(read_length), '-fastqout', pretrim_R2], stdout = FNULL, stderr = FNULL)

#next run USEARCH8 mergepe
merge_out = os.path.join(args.out, 'merged.fq')
skip_for = os.path.join(args.out, 'notmerged.R1.fq')
log.debug("%s -fastq_mergepairs %s -reverse %s -fastqout %s -fastqout_notmerged_fwd %s -fastq_truncqual 5 -fastq_allowmergestagger -minhsp 12" % (usearch, pretrim_R1, pretrim_R2, merge_out, skip_for))
subprocess.call([usearch, '-fastq_mergepairs', for_reads, '-reverse', rev_reads, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-fastq_truncqual', '5','-fastq_allowmergestagger','-minhsp', '12'], stdout = FNULL, stderr = FNULL)

#now concatenate files for downstream pre-process_illumina.py script
outname = name + '.fq'
final_out = os.path.join(args.out, outname)
cat_file = open(final_out, 'wb')
shutil.copyfileobj(open(merge_out,'rU'), cat_file)
shutil.copyfileobj(open(skip_for,'rU'), cat_file)
cat_file.close()

#clean and close up intermediate files
os.remove(merge_out)
os.remove(pretrim_R1)
os.remove(pretrim_R2)
os.remove(skip_for)
elif args.reads == 'forward':
final_out = for_reads

log.info("Strip primers, trim/pad to %s bp" % args.trim_len)

Expand Down
7 changes: 4 additions & 3 deletions ufits.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import sys, os, subprocess, inspect
script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))

version = '0.1.1'
version = '0.1.2'

default_help = """
Usage: ufits <command> <arguments>
version: %s
Command: ion pre-process Ion Torrent data (find barcodes, trim/pad)
illumina pre-process folder of de-multiplexed Illumina data (gunzip, merge PE, trim/pad, concatenate)
Command: ion pre-process Ion Torrent data (find barcodes, remove primers, trim/pad)
illumina pre-process folder of de-multiplexed Illumina data (gunzip, merge PE, remove primers, trim/pad)
cluster cluster OTUs (using UPARSE algorithm)
filter OTU table filtering
heatmap Create heatmap from OTU table
Expand Down Expand Up @@ -56,6 +56,7 @@
Arguments: -i, --fastq Input FASTQ file (Required)
-o, --out Output folder name. Default: ufits-data
--reads Paired-end or forward reads. Default: paired [paired, forward]
-f, --fwd_primer Forward primer sequence. Default: GTGARTCATCGAATCTTTG (fITS7)
-r, --rev_primer Reverse primer sequence Default: TCCTCCGCTTATTGATATGC (ITS4)
-n, --name_prefix Prefix for re-naming reads. Default: R_
Expand Down

0 comments on commit f19e205

Please sign in to comment.