diff --git a/Dockerfile-base b/Dockerfile-base index 2f6b60e..211ac09 100644 --- a/Dockerfile-base +++ b/Dockerfile-base @@ -24,7 +24,9 @@ RUN wget https://github.com/torognes/vsearch/archive/v2.4.3.tar.gz && \ make install && \ cd .. -RUN git clone git://github.com/nextgenusfs/amptk.git && \ +RUN wget https://github.com/nextgenusfs/amptk/archive/0.10.2.tar.gz && \ + tar xzf 0.10.2.tar.gz && \ + mv amptk-0.10.2 amptk && \ cd amptk && \ make && \ cd .. diff --git a/amptk.py b/amptk.py index 502fcc4..060dcd9 100755 --- a/amptk.py +++ b/amptk.py @@ -59,7 +59,7 @@ def download(url, name): sys.stdout.write(status) f.close() -version = '0.10.1' +version = '0.10.2' default_help = """ Usage: amptk @@ -762,6 +762,7 @@ def download(url, name): -m, --mapping_file QIIME-like mapping file -t, --taxonomy Taxonomy calculated elsewhere. 2 Column file. --method Taxonomy method. Default: hybrid [utax, sintax, usearch, hybrid, rdp, blast] + --add2db Add FASTA files to DB on the fly. --fasta_db Alternative database of fasta sequenes to use for global alignment. --utax_db UTAX formatted database. Default: ITS2.udb [See configured DB's below] --utax_cutoff UTAX confidence value threshold. Default: 0.8 [0 to 0.9] diff --git a/bin/amptk-fastq2sra.py b/bin/amptk-fastq2sra.py index 8b26663..c9870f6 100755 --- a/bin/amptk-fastq2sra.py +++ b/bin/amptk-fastq2sra.py @@ -41,13 +41,6 @@ class col: parser.add_argument('-a','--append', help='Append a name to all sample names for a run, i.e. --append run1 would yield Sample_run1') args=parser.parse_args() -def FindBarcode(Seq, BarcodeDict): - for BarcodeLabel in BarcodeDict.keys(): - Barcode = BarcodeDict[BarcodeLabel] - if Seq.startswith(Barcode): - return Barcode, BarcodeLabel - return "", "" - log_name = args.out + '.amptk-sra.log' if os.path.isfile(log_name): os.remove(log_name) @@ -238,12 +231,12 @@ def FindBarcode(Seq, BarcodeDict): #look for forward primer if args.require_primer != 'off': #means we only want ones with forward primer and or reverse, but don't remove them #now search for forward primer - foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if foralign["editDistance"] < 0: continue if args.require_primer == 'both': #now search for reverse primer - revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] < 0: #reverse primer was not found continue #check size diff --git a/bin/amptk-process_illumina_folder.py b/bin/amptk-process_illumina_folder.py index 97e94ed..b940f69 100755 --- a/bin/amptk-process_illumina_folder.py +++ b/bin/amptk-process_illumina_folder.py @@ -63,7 +63,7 @@ def processRead(input): for title, seq, qual in FastqGeneralIterator(open(input)): Total += 1 #first thing is look for forward primer, if found trim it off - foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) #if require primer is on make finding primer in amplicon required if amplicon is larger than read length #if less than read length, can't enforce primer because could have been trimmed via staggered trim in fastq_mergepairs if args.primer == 'on' and len(seq) > ReadLen: @@ -83,7 +83,7 @@ def processRead(input): Seq = seq Qual = qual #now look for reverse primer - revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] >= 0: RevPrimerFound += 1 RevCutPos = revalign["locations"][0][0] diff --git a/bin/amptk-process_illumina_raw.py b/bin/amptk-process_illumina_raw.py index 0b32304..f5f4052 100755 --- a/bin/amptk-process_illumina_raw.py +++ b/bin/amptk-process_illumina_raw.py @@ -68,7 +68,7 @@ def processRead(input): NoBC += 1 continue #first thing is look for forward primer, if found trim it off - foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) #if require primer is on make finding primer in amplicon required if amplicon is larger than read length #if less than read length, can't enforce primer because could have been trimmed via staggered trim in fastq_mergepairs if args.primer == 'on' and len(seq) > ReadLen: @@ -88,7 +88,7 @@ def processRead(input): Seq = seq Qual = qual #now look for reverse primer - revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] >= 0: RevPrimerFound += 1 RevCutPos = revalign["locations"][0][0] diff --git a/bin/amptk-process_ion.py b/bin/amptk-process_ion.py index f9c98f3..b84c9e3 100755 --- a/bin/amptk-process_ion.py +++ b/bin/amptk-process_ion.py @@ -76,13 +76,13 @@ def processRead(input): Seq = seq[BarcodeLength:] Qual = qual[BarcodeLength:] #now search for forward primer - foralign = edlib.align(FwdPrimer, Seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(FwdPrimer, Seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if foralign["editDistance"] < 0: NoPrimer += 1 continue ForTrim = foralign["locations"][0][1]+1 #now search for reverse primer - revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] >= 0: #reverse primer was found RevPrimerFound += 1 #location to trim sequences @@ -169,7 +169,7 @@ def processRead(input): #check if mapping file passed, use this if present, otherwise use command line arguments if args.mapping_file: if not os.path.isfile(args.mapping_file): - amptklib.error("Mapping file is not valid: %s" % args.mapping_file) + amptklib.log.error("Mapping file is not valid: %s" % args.mapping_file) sys.exit(1) mapdata = amptklib.parseMappingFile(args.mapping_file, barcode_file) #forward primer in first item in tuple, reverse in second diff --git a/lib/amptklib.py b/lib/amptklib.py index bc00094..f86a8ca 100644 --- a/lib/amptklib.py +++ b/lib/amptklib.py @@ -10,11 +10,132 @@ primer_db = {'fITS7': 'GTGARTCATCGAATCTTTG', 'ITS4': 'TCCTCCGCTTATTGATATGC', 'ITS1-F': 'CTTGGTCATTTAGAGGAAGTAA', 'ITS2': 'GCTGCGTTCTTCATCGATGC', 'ITS3': 'GCATCGATGAAGAACGCAGC', 'ITS4-B': 'CAGGAGACTTGTACACGGTCCAG', 'ITS1': 'TCCGTAGGTGAACCTGCGG', 'LR0R': 'ACCCGCTGAACTTAAGC', 'LR2R': 'AAGAACTTTGAAAAGAG', 'JH-LS-369rc': 'CTTCCCTTTCAACAATTTCAC', '16S_V3': 'CCTACGGGNGGCWGCAG', '16S_V4': 'GACTACHVGGGTATCTAATCC', 'ITS3_KYO2': 'GATGAAGAACGYAGYRAA', 'COI-F': 'GGTCAACAAATCATAAAGATATTGG', 'COI-R': 'GGWACTAATCAATTTCCAAATCC', '515FB': 'GTGYCAGCMGCCGCGGTAA', '806RB': 'GGACTACNVGGGTWTCTAAT'} + +degenNuc = [("R", "A"), ("R", "G"), + ("M", "A"), ("M", "C"), + ("W", "A"), ("W", "T"), + ("S", "C"), ("S", "G"), + ("Y", "C"), ("Y", "T"), + ("K", "G"), ("K", "T"), + ("V", "A"), ("V", "C"), ("V", "G"), + ("H", "A"), ("H", "C"), ("H", "T"), + ("D", "A"), ("D", "G"), ("D", "T"), + ("B", "C"), ("B", "G"), ("B", "T"), + ("N", "G"), ("N", "A"), ("N", "T"), ("N", "C"), + ("X", "G"), ("X", "A"), ("X", "T"), ("X", "C")] + class colr: GRN = '\033[92m' END = '\033[0m' WARN = '\033[93m' +#functions for system checks, etc +def get_version(): + version = subprocess.Popen(['amptk', 'version'], stdout=subprocess.PIPE).communicate()[0].rstrip() + return version + +def get_usearch_version(usearch): + try: + version = subprocess.Popen([usearch, '-version'], stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate() + except OSError: + log.error("%s not found in your PATH, exiting." % usearch) + sys.exit(1) + vers = version[0].replace('usearch v', '') + vers2 = vers.split('_')[0] + return vers2 + +def get_vsearch_version(): + version = subprocess.Popen(['vsearch', '--version'], stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate() + for v in version: + if v.startswith('vsearch'): + vers = v.replace('vsearch v', '') + vers2 = vers.split('_')[0] + return vers2 + +def versiontuple(v): + return tuple(map(int, (v.split(".")))) + +def gvc(input, check): + if versiontuple(input) >= versiontuple(check): + return True + else: + return False + +def versionDependencyChecks(usearch): + #to run amptk need usearch > 9.0.2132 and vsearch > 2.2.0 + amptk_version = get_version() + usearch_version = get_usearch_version(usearch) + vsearch_version = get_vsearch_version() + usearch_pass = '9.0.2132' + vsearch_pass = '2.2.0' + if not gvc(usearch_version, usearch_pass): + log.error("USEARCH v%s detected, needs to be atleast v%s" % (usearch_version, usearch_pass)) + sys.exit(1) + if not gvc(vsearch_version, vsearch_pass): + log.error("VSEARCH v%s detected, needs to be atleast v%s" % (vsearch_version, vsearch_pass)) + sys.exit(1) + log.info("%s, USEARCH v%s, VSEARCH v%s" % (amptk_version, usearch_version, vsearch_version)) + +def checkRversion(): + #need to have R version > 3.2 + cmd = ['Rscript', '--vanilla', os.path.join(parentdir, 'util', 'check_version.R')] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + versions = stdout.replace(' \n', '') + Rvers = versions.split(',')[0] + dada2 = versions.split(',')[1] + return (Rvers, dada2) + +def checkfastqsize(input): + filesize = os.path.getsize(input) + return filesize + +def getCPUS(): + cores = multiprocessing.cpu_count() + return cores + +def MemoryCheck(): + import psutil + mem = psutil.virtual_memory() + RAM = int(mem.total) + return round(RAM / 1024000000) + +def systemOS(): + if sys.platform == 'darwin': + system_os = 'MacOSX '+ platform.mac_ver()[0] + elif sys.platform == 'linux': + linux_version = platform.linux_distribution() + system_os = linux_version[0]+ ' '+linux_version[1] + else: + system_os = sys.platform + return system_os + +def SystemInfo(): + system_os = systemOS() + python_vers = str(sys.version_info[0])+'.'+str(sys.version_info[1])+'.'+str(sys.version_info[2]) + log.info("OS: %s, %i cores, ~ %i GB RAM. Python: %s" % (system_os, multiprocessing.cpu_count(), MemoryCheck(), python_vers)) + #print python modules to logfile + mod_versions() + +def mod_versions(): + import pkg_resources + modules = ['numpy', 'pandas', 'matplotlib', 'psutil', 'natsort', 'biopython', 'edlib', 'biom-format'] + results = [] + for x in modules: + try: + vers = pkg_resources.get_distribution(x).version + hit = "%s v%s" % (x, vers) + except pkg_resources.DistributionNotFound: + hit = "%s NOT installed!" % x + results.append(hit) + if x == 'edlib': + vers = vers.replace('.post2', '') + if not gvc(vers, '1.2.0'): + log.error("Edlib v%s detected, at least v1.2.0 required for degenerate nucleotide search, please upgrade. e.g. pip install -U edlib" % vers) + sys.exit(1) + log.debug("Python Modules: %s" % ', '.join(results)) + + class gzopen(object): """Generic opener that decompresses gzipped files if needed. Encapsulates an open file or a GzipFile. @@ -101,20 +222,6 @@ def Fzip_inplace(input): except NameError: subprocess.call(cmd) - -def mod_versions(): - import pkg_resources - modules = ['numpy', 'pandas', 'matplotlib', 'psutil', 'natsort', 'biopython', 'edlib', 'biom-format'] - results = [] - for x in modules: - try: - vers = pkg_resources.get_distribution(x).version - hit = "%s v%s" % (x, vers) - except pkg_resources.DistributionNotFound: - hit = "%s NOT installed!" % x - results.append(hit) - log.debug("Python Modules: %s" % ', '.join(results)) - def fastalen2dict(input): Lengths = {} with open(input, 'rU') as infile: @@ -330,8 +437,13 @@ def AlignRevBarcode(Seq, BarcodeDict, mismatch): if besthit[2] <= int(mismatch): return besthit[0], besthit[1] return "", "" - - + +def findFwdPrimer(primer, sequence, mismatch): + return edlib.align(primer, sequence, mode="HW", k=mismatch, additionalEqualities=degenNuc) + +def findRevPrimer(primer, sequence, mismatch, degen): + return edlib.align(primer, sequence, mode="HW", task="locations", k=mismatch, additionalEqualities=degenNuc) + def MergeReads(R1, R2, tmpdir, outname, read_length, minlen, usearch, rescue, method, index, mismatch): removelist = [] if mismatch == 0: @@ -418,30 +530,6 @@ def dictFlip(input): print "duplicate ID found: %s" % i return outDict -def fuzzymatch(seq1, seq2, num_errors): - from Bio import pairwise2 - seq1_a, seq2_a, score, start, end = pairwise2.align.localms(seq1, seq2, 5.0, -4.0, -9.0, -0.5, one_alignment_only=True, gap_char='-')[0] - if start < 2: #align needs to start in first 2 bp - if end < len(seq1)+2: - match_region = seq2_a[start:end] - seq_region = seq1_a[start:end] - matches = sum((1 if s == match_region[i] else 0) for i, s in enumerate(seq_region)) - # too many errors -- no trimming - if (len(seq1) - matches) <= int(num_errors): - return (score, start, end) - -def exactmatch(seq1, seq2): - from Bio import pairwise2 - seq1_a, seq2_a, score, start, end = pairwise2.align.localms(seq1, seq2, 5.0, -4.0, -9.0, -0.5, one_alignment_only=True, gap_char='-')[0] - match_region = seq2_a[start:end] - seq_region = seq1_a[start:end] - matches = sum((1 if s == match_region[i] else 0) for i, s in enumerate(seq_region)) - # too many errors -- no trimming - if (len(seq1) - matches) <= 0: - return (score, start, end) - else: - return False - def classifier2dict(input, pcutoff): ClassyDict = {} with open(input, 'rU') as infile: @@ -625,62 +713,6 @@ def mapping2dict(input): sys.exit(1) return MapDict -def get_version(): - version = subprocess.Popen(['amptk', 'version'], stdout=subprocess.PIPE).communicate()[0].rstrip() - return version - -def get_usearch_version(usearch): - try: - version = subprocess.Popen([usearch, '-version'], stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate() - except OSError: - log.error("%s not found in your PATH, exiting." % usearch) - sys.exit(1) - vers = version[0].replace('usearch v', '') - vers2 = vers.split('_')[0] - return vers2 - -def get_vsearch_version(): - version = subprocess.Popen(['vsearch', '--version'], stderr=subprocess.PIPE, stdout=subprocess.PIPE).communicate() - for v in version: - if v.startswith('vsearch'): - vers = v.replace('vsearch v', '') - vers2 = vers.split('_')[0] - return vers2 - -def versiontuple(v): - return tuple(map(int, (v.split(".")))) - -def gvc(input, check): - if versiontuple(input) >= versiontuple(check): - return True - else: - return False - -def versionDependencyChecks(usearch): - #to run amptk need usearch > 9.0.2132 and vsearch > 2.2.0 - amptk_version = get_version() - usearch_version = get_usearch_version(usearch) - vsearch_version = get_vsearch_version() - usearch_pass = '9.0.2132' - vsearch_pass = '2.2.0' - if not gvc(usearch_version, usearch_pass): - log.error("USEARCH v%s detected, needs to be atleast v%s" % (usearch_version, usearch_pass)) - sys.exit(1) - if not gvc(vsearch_version, vsearch_pass): - log.error("VSEARCH v%s detected, needs to be atleast v%s" % (vsearch_version, vsearch_pass)) - sys.exit(1) - log.info("%s, USEARCH v%s, VSEARCH v%s" % (amptk_version, usearch_version, vsearch_version)) - -def checkRversion(): - #need to have R version > 3.2 - cmd = ['Rscript', '--vanilla', os.path.join(parentdir, 'util', 'check_version.R')] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = proc.communicate() - versions = stdout.replace(' \n', '') - Rvers = versions.split(',')[0] - dada2 = versions.split(',')[1] - return (Rvers, dada2) - def runMultiProgress(function, inputList, cpus): #setup pool p = multiprocessing.Pool(cpus) @@ -700,29 +732,6 @@ def runMultiProgress(function, inputList, cpus): p.close() p.join() -def MemoryCheck(): - import psutil - mem = psutil.virtual_memory() - RAM = int(mem.total) - return round(RAM / 1024000000) - -def systemOS(): - if sys.platform == 'darwin': - system_os = 'MacOSX '+ platform.mac_ver()[0] - elif sys.platform == 'linux': - linux_version = platform.linux_distribution() - system_os = linux_version[0]+ ' '+linux_version[1] - else: - system_os = sys.platform - return system_os - -def SystemInfo(): - system_os = systemOS() - python_vers = str(sys.version_info[0])+'.'+str(sys.version_info[1])+'.'+str(sys.version_info[2]) - log.info("OS: %s, %i cores, ~ %i GB RAM. Python: %s" % (system_os, multiprocessing.cpu_count(), MemoryCheck(), python_vers)) - #print python modules to logfile - mod_versions() - def batch_iterator(iterator, batch_size): entry = True #Make sure we loop once @@ -907,13 +916,6 @@ def guess_csv_dialect(header): dialect = csv.Sniffer().sniff(header, delimiters=possible_delims) return dialect -def checkfastqsize(input): - filesize = os.path.getsize(input) - return filesize - -def getCPUS(): - cores = multiprocessing.cpu_count() - return cores def fasta2list(input): seqlist = [] @@ -953,10 +955,11 @@ def parseMappingFile(input, output): continue cols = line.split('\t') outfile.write('>%s\n%s\n' % (cols[0], cols[1])) - match = exactmatch(cols[1], cols[2]) - if match: + match = edlib.align(cols[1], cols[2], mode="HW", k=0) + if match["editDistance"] == 0: + Trim = match["locations"][0][1]+1 if fwdprimer == '': - fwdprimer = cols[2][int(match[2]):] + fwdprimer = cols[2][Trim:] revprimer = cols[3] else: print "%s: barcode sequence not in LinkerPrimerSequence" % cols[0] @@ -974,10 +977,11 @@ def parseMappingFileIllumina(input): cols = line.split('\t') if not cols[0] in samples: samples.append(cols[0]) - match = exactmatch(cols[1], cols[2]) - if match: + match = edlib.align(cols[1], cols[2], mode="HW", k=0) + if match["editDistance"] == 0: + Trim = match["locations"][0][1]+1 if fwdprimer == '': - fwdprimer = cols[2][int(match[2]):] + fwdprimer = cols[2][Trim:] revprimer = cols[3] else: fwdprimer = cols[2] diff --git a/util/amptk-get_barcode_counts.py b/util/amptk-get_barcode_counts.py index 6765e60..5aac5e6 100755 --- a/util/amptk-get_barcode_counts.py +++ b/util/amptk-get_barcode_counts.py @@ -76,9 +76,9 @@ def countBarcodes(file): #now let's count the barcodes found and count the number of times they are found. - barcode_counts = "%20s: %s" % ('Sample', 'Count') + barcode_counts = "%10s: %s" % ('Sample', 'Count') for k,v in natsorted(BarcodeCount.items(), key=lambda (k,v): v, reverse=True): - barcode_counts += "\n%20s: %s" % (k, str(BarcodeCount[k])) + barcode_counts += "\n%10s: %s" % (k, str(BarcodeCount[k])) print("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts)) def getSeqLength(file): diff --git a/util/amptk-keep_samples.py b/util/amptk-keep_samples.py index 4c49b7a..ea97386 100755 --- a/util/amptk-keep_samples.py +++ b/util/amptk-keep_samples.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import sys, argparse, os, inspect, itertools +import sys, argparse, os, inspect, itertools, multiprocessing from Bio.SeqIO.QualityIO import FastqGeneralIterator currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parentdir = os.path.dirname(currentdir) @@ -50,10 +50,17 @@ def filter_sample(file, output): if args.format == 'fasta': out.write(">%s\n%s\n" % (title, seq)) +#check if input compressed, incompress if it is +if args.input.endswith('.gz'): + SeqIn = args.input.replace('.gz', '') + amptklib.Funzip(args.input, SeqIn, multiprocessing.cpu_count()) +else: + SeqIn = args.input + keepers = [] if args.threshold: print("Keeping samples with more than %i reads" % args.threshold) - BC_counts = countBarcodes(args.input) + BC_counts = countBarcodes(SeqIn) for k,v in BC_counts.items(): if int(v) >= args.threshold: if not k in keepers: @@ -78,9 +85,19 @@ def filter_sample(file, output): #now run filtering keep_count = 0 total_count = 0 -filter_sample(args.input, args.out) + +#rename to base +if args.out.endswith('.gz'): + outfile = args.out.replace('.gz', '') +else: + outfile = args.out +#run filtering +filter_sample(SeqIn, outfile) +#compress and clean if args.out.endswith('.gz'): #compress in place - amptklib.Fzip_inplace(args.out) + amptklib.Fzip_inplace(outfile) +if args.input.endswith('.gz'): + amptklib.removefile(SeqIn) print("Kept %i reads out of %i total reads" % (keep_count, total_count)) diff --git a/util/amptk-remove_samples.py b/util/amptk-remove_samples.py index 6e5fa1b..69e465b 100755 --- a/util/amptk-remove_samples.py +++ b/util/amptk-remove_samples.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import sys, argparse, os, inspect, itertools +import sys, argparse, os, inspect, itertools, multiprocessing from Bio.SeqIO.QualityIO import FastqGeneralIterator currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parentdir = os.path.dirname(currentdir) @@ -40,7 +40,7 @@ def countBarcodes(file): def filter_sample(file, output): global keep_count, total_count with open(output, 'w') as out: - for title, seq, qual in FastqGeneralIterator(amptklib.gzopen(file)): + for title, seq, qual in FastqGeneralIterator(open(file)): total_count += 1 sample = title.split('=',1)[1].split(';')[0] if not sample in keep_list: @@ -50,10 +50,17 @@ def filter_sample(file, output): elif args.format == 'fasta': out.write(">%s\n%s\n" % (title, seq)) +#check if input compressed, incompress if it is +if args.input.endswith('.gz'): + SeqIn = args.input.replace('.gz', '') + amptklib.Funzip(args.input, SeqIn, multiprocessing.cpu_count()) +else: + SeqIn = args.input + remove = [] if args.threshold: print("Removing samples with less than %i reads" % args.threshold) - BC_counts = countBarcodes(args.input) + BC_counts = countBarcodes(SeqIn) for k,v in BC_counts.items(): if int(v) <= args.threshold: if not k in remove: @@ -76,10 +83,21 @@ def filter_sample(file, output): #now run filtering keep_count = 0 total_count = 0 -filter_sample(args.input, args.out) + +#rename to base +if args.out.endswith('.gz'): + outfile = args.out.replace('.gz', '') +else: + outfile = args.out +#run filtering +filter_sample(SeqIn, outfile) +#compress and clean if args.out.endswith('.gz'): #compress in place - amptklib.Fzip_inplace(args.out) - + amptklib.Fzip_inplace(outfile) +if args.input.endswith('.gz'): + amptklib.removefile(SeqIn) + + print("Removed %i samples" % count) print("Kept %i reads out of %i total reads" % (keep_count, total_count)) diff --git a/util/amptk-split_samples_fastq.py b/util/amptk-split_samples_fastq.py index 2a2bbbe..f7d483c 100755 --- a/util/amptk-split_samples_fastq.py +++ b/util/amptk-split_samples_fastq.py @@ -9,6 +9,7 @@ import lib.revcomp_lib as revcomp_lib import lib.amptklib as amptklib from Bio.SeqIO.QualityIO import FastqGeneralIterator +import edlib class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): def __init__(self,prog): diff --git a/util/amptk-strip_primer.py b/util/amptk-strip_primer.py index 3a0c83c..903102f 100755 --- a/util/amptk-strip_primer.py +++ b/util/amptk-strip_primer.py @@ -36,13 +36,13 @@ def primerStrip(file): with open(goodseq, 'w') as good: with open(badseq, 'w') as bad: for title, seq, qual in FastqGeneralIterator(open(file)): - foralign = edlib.align(args.fwd_primer, seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(args.fwd_primer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if foralign["editDistance"] >= 0: ForCutPos = foralign["locations"][0][1]+1 Seq = seq[ForCutPos:] Qual = qual[ForCutPos:] #align reverse - revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(RevPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] >= 0: RevCutPos = revalign["locations"][0][0] Seq = Seq[:RevCutPos] @@ -58,13 +58,13 @@ def revprimerStrip(file): with open(goodseq, 'w') as good: with open(badseq, 'w') as bad: for title, seq, qual in FastqGeneralIterator(open(file)): - foralign = edlib.align(args.rev_primer, seq, mode="HW", k=args.primer_mismatch) + foralign = edlib.align(args.rev_primer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if foralign["editDistance"] >= 0: ForCutPos = foralign["locations"][0][1]+1 Seq = seq[ForCutPos:] Qual = qual[ForCutPos:] #align reverse - revalign = edlib.align(RevForPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch) + revalign = edlib.align(RevForPrimer, Seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc) if revalign["editDistance"] >= 0: RevCutPos = revalign["locations"][0][0] Seq = Seq[:RevCutPos]