From 28d817d9c2df27bf0f5b7cfc6c5793480986f2d0 Mon Sep 17 00:00:00 2001
From: Jon Palmer <nextgenusfs@gmail.com>
Date: Sat, 7 Jan 2023 20:25:22 -0800
Subject: [PATCH] fix unoise3 for derep step #96; add sortstep and add minsize
 #99

---
 amptk/__version__.py |   4 +-
 amptk/unoise3.py     | 493 +++++++++++++++++++++++++++++++------------
 2 files changed, 361 insertions(+), 136 deletions(-)

diff --git a/amptk/__version__.py b/amptk/__version__.py
index ecfbcc9..c092d1f 100644
--- a/amptk/__version__.py
+++ b/amptk/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 5, 4)
+VERSION = (1, 5, 5)
 
-__version__ = '.'.join(map(str, VERSION))
+__version__ = ".".join(map(str, VERSION))
diff --git a/amptk/unoise3.py b/amptk/unoise3.py
index c729bec..020a2fd 100755
--- a/amptk/unoise3.py
+++ b/amptk/unoise3.py
@@ -1,251 +1,468 @@
 #!/usr/bin/env python
 
-from __future__ import (absolute_import, division,
-                        print_function, unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 import sys
 import os
 import argparse
 import shutil
 from natsort import natsorted
 from amptk import amptklib
+from distutils.version import LooseVersion
+
 
 class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
-    def __init__(self,prog):
-        super(MyFormatter,self).__init__(prog,max_help_position=50)
+    def __init__(self, prog):
+        super(MyFormatter, self).__init__(prog, max_help_position=50)
+
 
 class colr(object):
-    GRN = '\033[92m'
-    END = '\033[0m'
-    WARN = '\033[93m'
+    GRN = "\033[92m"
+    END = "\033[0m"
+    WARN = "\033[93m"
+
 
 def checkfastqsize(input):
     filesize = os.path.getsize(input)
     return filesize
 
+
 def main(args):
-    parser=argparse.ArgumentParser(prog='amptk-unoise3.py', usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
-        description='''Script runs UNOISE3 algorithm.
-        Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''',
+    parser = argparse.ArgumentParser(
+        prog="amptk-unoise3.py",
+        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
+        description="""Script runs UNOISE3 algorithm.
+        Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch""",
         epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""",
-        formatter_class=MyFormatter)
-
-    parser.add_argument('-i','--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)')
-    parser.add_argument('-o','--out', help='Base output name')
-    parser.add_argument('--method', choices=['vsearch', 'usearch'], default='vsearch', help='Program to use')
-    parser.add_argument('-e','--maxee', default='1.0', help='Quality trim EE value')
-    parser.add_argument('-m','--minsize', default='8', help='Min size to keep for denoising')
-    parser.add_argument('-u','--usearch', dest="usearch", default='usearch10', help='USEARCH10 EXE')
-    parser.add_argument('-p','--pct_otu', default='97', help="Biological OTU Clustering Percent")
-    parser.add_argument('--uchime_ref', help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
-    parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs')
-    parser.add_argument('--debug', action='store_true', help='Remove Intermediate Files')
-    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
-    args=parser.parse_args(args)
+        formatter_class=MyFormatter,
+    )
+
+    parser.add_argument(
+        "-i", "--fastq", dest="FASTQ", required=True, help="FASTQ file (Required)"
+    )
+    parser.add_argument("-o", "--out", help="Base output name")
+    parser.add_argument(
+        "--method",
+        choices=["vsearch", "usearch"],
+        default="vsearch",
+        help="Program to use",
+    )
+    parser.add_argument("-e", "--maxee", default="1.0", help="Quality trim EE value")
+    parser.add_argument(
+        "-m", "--minsize", default="8", help="Min size to keep for denoising"
+    )
+    parser.add_argument(
+        "-u", "--usearch", dest="usearch", default="usearch10", help="USEARCH10 EXE"
+    )
+    parser.add_argument(
+        "-p", "--pct_otu", default="97", help="Biological OTU Clustering Percent"
+    )
+    parser.add_argument("--uchime_ref", help="Run UCHIME2 REF [ITS,16S,LSU,COI,custom]")
+    parser.add_argument(
+        "--map_filtered",
+        action="store_true",
+        help="map quality filtered reads back to OTUs",
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Remove Intermediate Files"
+    )
+    parser.add_argument("--cpus", type=int, help="Number of CPUs. Default: auto")
+    args = parser.parse_args(args)
 
     parentdir = os.path.join(os.path.dirname(amptklib.__file__))
 
-    #get basename if not args.out passed
+    # get basename if not args.out passed
     if args.out:
         base = args.out
     else:
-        if 'demux' in args.FASTQ:
-            base = os.path.basename(args.FASTQ).split('.demux')[0]
+        if "demux" in args.FASTQ:
+            base = os.path.basename(args.FASTQ).split(".demux")[0]
         else:
-            base = os.path.basename(args.FASTQ).split('.f')[0]
+            base = os.path.basename(args.FASTQ).split(".f")[0]
 
-    #remove logfile if exists
-    log_name = base + '.amptk-unoise3.log'
+    # remove logfile if exists
+    log_name = base + ".amptk-unoise3.log"
     if os.path.isfile(log_name):
         os.remove(log_name)
 
     amptklib.setupLogging(log_name)
-    FNULL = open(os.devnull, 'w')
-    cmd_args = " ".join(sys.argv)+'\n'
+    FNULL = open(os.devnull, "w")
+    cmd_args = " ".join(sys.argv) + "\n"
     amptklib.log.debug(cmd_args)
     print("-------------------------------------------------------")
 
-    #initialize script, log system info and usearch version
+    # initialize script, log system info and usearch version
     amptklib.SystemInfo()
-    #Do a version check
+    # Do a version check
     usearch = args.usearch
     amptklib.versionDependencyChecks(usearch, method=args.method)
-    if args.method == 'usearch':
+    if args.method == "usearch":
         amptklib.checkusearch10(usearch)
 
-    #get number of cpus
+    # get number of cpus
     if args.cpus:
         cpus = args.cpus
     else:
         cpus = amptklib.getCPUS()
 
-    #make tmp folder
-    tmp = base + '_tmp'
+    # make tmp folder
+    tmp = base + "_tmp"
     if not os.path.exists(tmp):
         os.makedirs(tmp)
 
-    #Count FASTQ records
+    # Count FASTQ records
     amptklib.log.info("Loading FASTQ Records")
-    #convert to FASTA for mapping
-    orig_fasta = os.path.join(tmp, base+'.orig.fa')
-    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
+    # convert to FASTA for mapping
+    orig_fasta = os.path.join(tmp, base + ".orig.fa")
+    cmd = [
+        "vsearch",
+        "--fastq_filter",
+        args.FASTQ,
+        "--fastaout",
+        orig_fasta,
+        "--fastq_qmax",
+        "55",
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
     orig_total = amptklib.countfasta(orig_fasta)
     size = amptklib.checkfastqsize(args.FASTQ)
     readablesize = amptklib.convertSize(size)
-    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
+    amptklib.log.info("{0:,}".format(orig_total) + " reads (" + readablesize + ")")
 
-    #Expected Errors filtering step
-    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
-    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
+    # Expected Errors filtering step
+    filter_out = os.path.join(tmp, base + ".EE" + args.maxee + ".filter.fq")
+    filter_fasta = os.path.join(tmp, base + ".EE" + args.maxee + ".filter.fa")
     amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
-    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
+    cmd = [
+        "vsearch",
+        "--fastq_filter",
+        args.FASTQ,
+        "--fastq_maxee",
+        str(args.maxee),
+        "--fastqout",
+        filter_out,
+        "--fastaout",
+        filter_fasta,
+        "--fastq_qmax",
+        "55",
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
     total = amptklib.countfastq(filter_out)
-    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
+    amptklib.log.info("{0:,}".format(total) + " reads passed")
 
-    #now run full length dereplication
-    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
+    # now run full length dereplication
+    derep_out = os.path.join(tmp, base + ".EE" + args.maxee + ".derep.fa")
     amptklib.log.info("De-replication (remove duplicate reads)")
-    cmd = ['vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_', '--sizeout', '--output', derep_out, '--threads', str(cpus)]
+    if LooseVersion(amptklib.get_vsearch_version()) >= LooseVersion("2.20.0"):
+        cmd = [
+            "vsearch",
+            "--fastx_uniques",
+            filter_out,
+            "--relabel",
+            "Read_",
+            "--sizeout",
+            "--fastaout",
+            derep_out,
+        ]
+    else:
+        cmd = [
+            "vsearch",
+            "--derep_fulllength",
+            filter_out,
+            "--relabel",
+            "Read_",
+            "--sizeout",
+            "--output",
+            derep_out,
+            "--threads",
+            str(cpus),
+        ]
     amptklib.runSubprocess(cmd, amptklib.log)
     total = amptklib.countfasta(derep_out)
-    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
+    amptklib.log.info("{0:,}".format(total) + " reads passed")
 
-    #now run de-noiser UNOISE3
-    unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.fa')
-    if args.method == 'usearch':
+    # now run de-noiser UNOISE3
+    unoise_out = os.path.join(tmp, base + ".EE" + args.maxee + ".unoise.fa")
+    if args.method == "usearch":
         amptklib.log.info("Denoising reads with UNOISE3")
-        cmd = [usearch, '-unoise3', derep_out, '-zotus', unoise_out, '-minsize', args.minsize, '-sizeout', '-threads', str(cpus)]
+        cmd = [
+            usearch,
+            "-unoise3",
+            derep_out,
+            "-zotus",
+            unoise_out,
+            "-minsize",
+            args.minsize,
+            "-sizeout",
+            "-threads",
+            str(cpus),
+        ]
     else:
         amptklib.log.info("Denoising reads with VSEARCH --cluster_unoise")
-        cmd = ['vsearch', '--cluster_unoise', derep_out, '--minsize', args.minsize, '--sizeout', '--threads', str(cpus), '--centroids', unoise_out]
+        # need to sort here first
+        sorted_out = os.path.join(tmp, base + ".EE" + args.maxee + ".sorted.fa")
+        cmd = ["vsearch", "--sortbysize", derep_out, "--output", sorted_out]
+        # vsearch --sortbysize zotus_chim.fa --output zotus_sorted.fa
+        amptklib.runSubprocess(cmd, amptklib.log)
+        # now run unoise3
+        cmd = [
+            "vsearch",
+            "--cluster_unoise",
+            sorted_out,
+            "--minsize",
+            args.minsize,
+            "--sizein",
+            "--sizeout",
+            "--threads",
+            str(cpus),
+            "--centroids",
+            unoise_out,
+        ]
     amptklib.runSubprocess(cmd, amptklib.log)
-    #total = amptklib.countfasta(unoise_out)
-    #amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')
+    # total = amptklib.countfasta(unoise_out)
+    # amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')
 
-    #chimera filter UNOISE3 results
-    denovo_chimera = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.nonchimeras.fa')
+    # chimera filter UNOISE3 results
+    denovo_chimera = os.path.join(
+        tmp, base + ".EE" + args.maxee + ".unoise.nonchimeras.fa"
+    )
     amptklib.log.info("De novo Chimera filtering with VSEARCH --uchime3_denovo")
-    cmd = ['vsearch', '--uchime3_denovo', unoise_out, '--nonchimeras', denovo_chimera, '--sizein']
+    cmd = [
+        "vsearch",
+        "--uchime3_denovo",
+        unoise_out,
+        "--nonchimeras",
+        denovo_chimera,
+        "--sizein",
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
     total = amptklib.countfasta(denovo_chimera)
-    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')
+    amptklib.log.info("{0:,}".format(total) + " denoised sequences")
 
-    #strip N's
-    #amptklib.log.info("Cleaning up padding from OTUs")
-    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.fa')
+    # strip N's
+    # amptklib.log.info("Cleaning up padding from OTUs")
+    otu_clean = os.path.join(tmp, base + ".EE" + args.maxee + ".clean.fa")
     amptklib.fasta_strip_padding(denovo_chimera, otu_clean, stripsize=True)
 
-    #run optional uchime_ref
+    # run optional uchime_ref
     if not args.uchime_ref:
         uchime_out = otu_clean
     else:
-        uchime_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
-        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
-        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']: #test if it is one that is setup, otherwise default to full path
-            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref+'.udb')
+        uchime_out = os.path.join(tmp, base + ".EE" + args.maxee + ".uchime.otus.fa")
+        # R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
+        if args.uchime_ref in [
+            "ITS",
+            "16S",
+            "LSU",
+            "COI",
+        ]:  # test if it is one that is setup, otherwise default to full path
+            uchime_db = os.path.join(parentdir, "DB", args.uchime_ref + ".udb")
             if not os.path.isfile(uchime_db):
-                amptklib.log.error("Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering")
+                amptklib.log.error(
+                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
+                )
                 uchime_out = otu_clean
-            #since uchime cannot work with udb database, need to extract fasta sequences, do this if
-            if not amptklib.checkfile(os.path.join(parentdir, 'DB',args.uchime_ref+'.extracted.fa')):
-                uchime_db = os.path.join(parentdir, 'DB',args.uchime_ref+'.extracted.fa')
-                cmd = ['vsearch', '--udb2fasta', os.path.join(parentdir, 'DB', args.uchime_ref+'.udb'), '--output', uchime_db]
+            # since uchime cannot work with udb database, need to extract fasta sequences, do this if
+            if not amptklib.checkfile(
+                os.path.join(parentdir, "DB", args.uchime_ref + ".extracted.fa")
+            ):
+                uchime_db = os.path.join(
+                    parentdir, "DB", args.uchime_ref + ".extracted.fa"
+                )
+                cmd = [
+                    "vsearch",
+                    "--udb2fasta",
+                    os.path.join(parentdir, "DB", args.uchime_ref + ".udb"),
+                    "--output",
+                    uchime_db,
+                ]
                 amptklib.runSubprocess(cmd, amptklib.log)
             else:
-                uchime_db = os.path.join(parentdir, 'DB',args.uchime_ref+'.extracted.fa')
+                uchime_db = os.path.join(
+                    parentdir, "DB", args.uchime_ref + ".extracted.fa"
+                )
         else:
             uchime_db = os.path.abspath(args.uchime_ref)
-        #now run chimera filtering if all checks out
+        # now run chimera filtering if all checks out
         if not os.path.isfile(uchime_out):
             amptklib.log.info("Chimera Filtering (VSEARCH)")
-            cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean, '--db', uchime_db, '--nonchimeras', uchime_out, '--threads', str(cpus)]
+            cmd = [
+                "vsearch",
+                "--mindiv",
+                "1.0",
+                "--uchime_ref",
+                otu_clean,
+                "--db",
+                uchime_db,
+                "--nonchimeras",
+                uchime_out,
+                "--threads",
+                str(cpus),
+            ]
             amptklib.runSubprocess(cmd, amptklib.log)
             total = amptklib.countfasta(uchime_out)
-            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')
-
-    #inferred sequences
-    iSeqs = os.path.join(tmp, base+'.renamed.asvs.fa')
-    amptklib.fastarename(uchime_out, 'ASV', iSeqs)
-
-    #Filter out ASVs in wrong orientation
-    amptklib.log.info('Validating ASV orientation')
-    passingOTUs = base+'.ASVs.fa'
-    numKept, numDropped = amptklib.validateorientation(tmp, derep_out, iSeqs, passingOTUs)
-    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(numKept, numDropped))
-
-    #build OTU table with iSeqs
-    uc_iSeq_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
-    iSeq_otu_table = base + '.otu_table.txt'
-    #setup reads to map
+            amptklib.log.info("{0:,}".format(total) + " OTUs passed")
+
+    # inferred sequences
+    iSeqs = os.path.join(tmp, base + ".renamed.asvs.fa")
+    amptklib.fastarename(uchime_out, "ASV", iSeqs)
+
+    # Filter out ASVs in wrong orientation
+    amptklib.log.info("Validating ASV orientation")
+    passingOTUs = base + ".ASVs.fa"
+    numKept, numDropped = amptklib.validateorientation(
+        tmp, derep_out, iSeqs, passingOTUs
+    )
+    amptklib.log.info("{:,} ASVs validated ({:,} dropped)".format(numKept, numDropped))
+
+    # build OTU table with iSeqs
+    uc_iSeq_out = os.path.join(tmp, base + ".EE" + args.maxee + ".mapping.uc")
+    iSeq_otu_table = base + ".otu_table.txt"
+    # setup reads to map
     if args.map_filtered:
         reads = filter_fasta
     else:
         reads = orig_fasta
     amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
-    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout', iSeq_otu_table, '--threads', str(cpus)]
+    cmd = [
+        "vsearch",
+        "--usearch_global",
+        reads,
+        "--strand",
+        "plus",
+        "--id",
+        "0.97",
+        "--db",
+        passingOTUs,
+        "--uc",
+        uc_iSeq_out,
+        "--otutabout",
+        iSeq_otu_table,
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
 
-    #count reads mapped
+    # count reads mapped
     total = amptklib.line_count2(uc_iSeq_out)
-    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs '+ '({0:.0f}%)'.format(total/float(orig_total)* 100))
-
-    #now cluster to biological OTUs with UCLUST
-    radius = float(args.pct_otu) / 100.
-    amptklib.log.info("Clustering denoised sequences into biological OTUs at %s%%" % args.pct_otu)
-    uclust_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uclust.fa')
-    cmd = ['vsearch', '--cluster_smallmem', passingOTUs, '--centroids', uclust_out, '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none', '--usersort', '--threads', str(cpus)]
+    amptklib.log.info(
+        "{0:,}".format(total)
+        + " reads mapped to ASVs "
+        + "({0:.0f}%)".format(total / float(orig_total) * 100)
+    )
+
+    # now cluster to biological OTUs with UCLUST
+    radius = float(args.pct_otu) / 100.0
+    amptklib.log.info(
+        "Clustering denoised sequences into biological OTUs at %s%%" % args.pct_otu
+    )
+    uclust_out = os.path.join(tmp, base + ".EE" + args.maxee + ".uclust.fa")
+    cmd = [
+        "vsearch",
+        "--cluster_smallmem",
+        passingOTUs,
+        "--centroids",
+        uclust_out,
+        "--id",
+        str(radius),
+        "--strand",
+        "plus",
+        "--relabel",
+        "OTU",
+        "--qmask",
+        "none",
+        "--usersort",
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
     total = amptklib.countfasta(uclust_out)
-    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')
-
-    #determine where denoised sequences clustered
-    ClusterComp = base+'.ASVs2clusters.txt'
-    iSeqmap = base+'.unoise_map.uc'
-    cmd = [usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id', str(radius), '-uc', iSeqmap, '-strand', 'plus', '-threads', str(cpus)]
+    amptklib.log.info("{0:,}".format(total) + " OTUs generated")
+
+    # determine where denoised sequences clustered
+    ClusterComp = base + ".ASVs2clusters.txt"
+    iSeqmap = base + ".unoise_map.uc"
+    cmd = [
+        "vsearch",
+        "--usearch_global",
+        passingOTUs,
+        "--db",
+        uclust_out,
+        "--id",
+        str(radius),
+        "--uc",
+        iSeqmap,
+        "--strand",
+        "plus",
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
     iSeqMapped = {}
-    with open(iSeqmap, 'r') as mapping:
+    with open(iSeqmap, "r") as mapping:
         for line in mapping:
-            line = line.replace('\n', '')
-            cols = line.split('\t')
+            line = line.replace("\n", "")
+            cols = line.split("\t")
             OTU = cols[9]
             Hit = cols[8]
             if not OTU in iSeqMapped:
                 iSeqMapped[OTU] = [Hit]
             else:
                 iSeqMapped[OTU].append(Hit)
-    with open(ClusterComp, 'w') as clusters:
-        clusters.write('OTU\tASVs\n')
-        for k,v in natsorted(list(iSeqMapped.items())):
-            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
-
-    #now map reads back to OTUs and build OTU table
-    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.cluster.mapping.uc')
-    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.cluster.otu_table.txt')
-    #setup reads to map
+    with open(ClusterComp, "w") as clusters:
+        clusters.write("OTU\tASVs\n")
+        for k, v in natsorted(list(iSeqMapped.items())):
+            clusters.write("%s\t%s\n" % (k, ", ".join(v)))
+
+    # now map reads back to OTUs and build OTU table
+    uc_out = os.path.join(tmp, base + ".EE" + args.maxee + ".cluster.mapping.uc")
+    otu_table = os.path.join(tmp, base + ".EE" + args.maxee + ".cluster.otu_table.txt")
+    # setup reads to map
     if args.map_filtered:
         reads = filter_fasta
     else:
         reads = orig_fasta
     amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
-    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus)]
+    cmd = [
+        "vsearch",
+        "--usearch_global",
+        reads,
+        "--strand",
+        "plus",
+        "--id",
+        "0.97",
+        "--db",
+        uclust_out,
+        "--uc",
+        uc_out,
+        "--otutabout",
+        otu_table,
+        "--threads",
+        str(cpus),
+    ]
     amptklib.runSubprocess(cmd, amptklib.log)
 
-    #count reads mapped
+    # count reads mapped
     total = amptklib.line_count2(uc_out)
-    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs '+ '({0:.0f}%)'.format(total/float(orig_total)* 100))
+    amptklib.log.info(
+        "{0:,}".format(total)
+        + " reads mapped to OTUs "
+        + "({0:.0f}%)".format(total / float(orig_total) * 100)
+    )
 
-    #Move files around, delete tmp if argument passed.
+    # Move files around, delete tmp if argument passed.
     currentdir = os.getcwd()
-    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
+    final_otu = os.path.join(currentdir, base + ".cluster.otus.fa")
     shutil.copyfile(uclust_out, final_otu)
-    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
+    final_otu_table = os.path.join(currentdir, base + ".cluster.otu_table.txt")
     shutil.copyfile(otu_table, final_otu_table)
     if not args.debug:
         shutil.rmtree(tmp)
 
-    #Print location of files to STDOUT
+    # Print location of files to STDOUT
     print("-------------------------------------------------------")
     print("UNOISE3 Script has Finished Successfully")
     print("-------------------------------------------------------")
@@ -258,12 +475,20 @@ def main(args):
     print("ASVs to OTUs: %s" % ClusterComp)
     print("-------------------------------------------------------")
 
-    otu_print = final_otu.split('/')[-1]
-    tab_print = final_otu_table.split('/')[-1]
-    if 'darwin' in sys.platform:
-        print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
+    otu_print = final_otu.split("/")[-1]
+    tab_print = final_otu_table.split("/")[-1]
+    if "darwin" in sys.platform:
+        print(
+            colr.WARN
+            + "\nExample of next cmd:"
+            + colr.END
+            + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)
+        )
     else:
-        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
+        print(
+            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
+            % (tab_print, otu_print)
+        )
 
 
 if __name__ == "__main__":