Skip to content

Commit

Permalink
add LULU wrapper, update stats with indicator species analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer committed Mar 4, 2018
1 parent 75e3625 commit 8cc9b37
Show file tree
Hide file tree
Showing 6 changed files with 451 additions and 6 deletions.
36 changes: 35 additions & 1 deletion amptk.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def download(url, name):
sys.stdout.write(status)
f.close()

version = '1.0.4'
version = '1.1.0'

default_help = """
Usage: amptk <command> <arguments>
Expand All @@ -82,6 +82,7 @@ def download(url, name):
cluster_ref closed/open reference based clustering (EXPERIMENTAL)
Utilities: filter OTU table filtering
lulu LULU amplicon curation of OTU table
taxonomy Assign taxonomy to OTUs
show show number or reads per barcode from de-multiplexed data
select select reads (samples) from de-multiplexed data
Expand Down Expand Up @@ -500,6 +501,37 @@ def download(url, name):
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print help
sys.exit(1)

elif sys.argv[1] == 'lulu' or sys.argv[1] == 'LULU':
help = """
Usage: amptk %s <arguments>
version: %s
Description: Script is a wrapper for the LULU OTU table post-clustering curation of amplicon
data. The script calculates pairwise identity between the OTUs and then filters
the OTU table based on whether closely related OTUs that share the same/similar
distributions in the data are "daughters" of the "parent" OTU. Requires R and the
LULU R package. doi:10.1038/s41467-017-01312-x
Arguments: -i, --otu_table Input OTU table (Required)
-f, --fasta Input OTUs in FASTA format (Required)
-o, --out Output base name. Default: input basename
--min_ratio_type Minimum ratio threshold. Default: min [min,avg]
--min_ratio Minimum ratio. Default: 1
--min_match Minimum match pident (%%). Default: 84
--min_relative_cooccurence Minimum relative co-occurance (%%): Default: 95
--debug Keep intermediate files.
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(script_path, 'util', 'amptk-lulu.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print help
sys.exit(1)
Expand Down Expand Up @@ -1002,6 +1034,8 @@ def download(url, name):
-t, --tree Phylogeny of OTUs (from amptk taxonomy) (Required)
-d, --distance Distance metric. Default: raupcrick [raupcrick,jaccard,bray,unifrac,wunifrac]
-o, --out Output base name. Default: amptk_stats
--indicator_species Run indicator species analysis
--ignore_otus Drop OTUs from table before running stats
""" % (sys.argv[1], version)

arguments = sys.argv[2:]
Expand Down
17 changes: 14 additions & 3 deletions lib/amptklib.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ def checkRversion():
Rvers = versions.split(',')[0]
dada2 = versions.split(',')[1]
phyloseq = versions.split(',')[2]
return (Rvers, dada2, phyloseq)
lulu = versions.split(',')[3]
return (Rvers, dada2, phyloseq, lulu)

def checkfastqsize(input):
filesize = os.path.getsize(input)
Expand Down Expand Up @@ -338,6 +339,16 @@ def runSubprocess3(cmd, logfile, folder, output):
if stderr:
if stderr[0] != None:
logfile.debug(stderr)

def runSubprocess4(cmd, logfile, logfile2):
#function where cmd is issued in logfile, and log captured in logfile 2
logfile.debug(' '.join(cmd))
with open(logfile2, 'w') as out:
proc = subprocess.Popen(cmd, stdout=out, stderr=out)
stderr = proc.communicate()
if stderr:
if stderr[0] != None:
logfile.debug(stderr)

def getSize(filename):
st = os.stat(filename)
Expand Down Expand Up @@ -932,8 +943,8 @@ def setupLogging(LOGNAME):
if 'win32' in sys.platform:
stdoutformat = logging.Formatter('%(asctime)s: %(message)s', datefmt='[%I:%M:%S %p]')
else:
stdoutformat = logging.Formatter(colr.GRN+'%(asctime)s'+colr.END+': %(message)s', datefmt='[%I:%M:%S %p]')
fileformat = logging.Formatter('%(asctime)s: %(message)s')
stdoutformat = logging.Formatter(colr.GRN+'%(asctime)s'+colr.END+': %(message)s', datefmt='[%b %d %I:%M %p]')
fileformat = logging.Formatter('%(asctime)s: %(message)s', datefmt='[%x %H:%M:%S]')
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
sth = logging.StreamHandler()
Expand Down
111 changes: 111 additions & 0 deletions util/amptk-lulu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python
from __future__ import division
import sys, os, argparse, logging, shutil, subprocess, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
import lib.amptklib as amptklib
from Bio import SeqIO

class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
def __init__(self,prog):
super(MyFormatter,self).__init__(prog,max_help_position=50)

class colr:
GRN = '\033[92m'
END = '\033[0m'
WARN = '\033[93m'

parser=argparse.ArgumentParser(prog='amptk-lulu.py',
description='''Script runs OTU table post processing LULU to identify low abundance error OTUs''',
epilog="""Written by Jon Palmer (2018) [email protected]""",
formatter_class=MyFormatter)

parser.add_argument('-i','--otu_table', required=True, help='Input OTU table')
parser.add_argument('-f','--fasta', required=True, help='Input OTUs (multi-fasta)')
parser.add_argument('-o','--out', help='Output folder basename')
parser.add_argument('--min_ratio_type', default='min', choices=['min', 'avg'], help="LULU minimum ratio threshold")
parser.add_argument('--min_ratio', default=1, type=int, help="LULU minimum ratio")
parser.add_argument('--min_match', default=84, type=int, help="LULU minimum match percent identity")
parser.add_argument('--min_relative_cooccurence', default=95, type=int, help="LULU minimum relative cooccurance")
parser.add_argument('--debug', action='store_true', help='Remove Intermediate Files')
args=parser.parse_args()

#get location of R script
luluScript = os.path.join(parentdir, 'util', 'runLULU.R')

if not args.out:
#get base name of files
base = args.otu_table.split(".otu_table")[0]
else:
base = args.out

#remove logfile if exists
log_name = base + '.amptk-lulu.log'
if os.path.isfile(log_name):
amptklib.removefile(log_name)

amptklib.setupLogging(log_name)
FNULL = open(os.devnull, 'w')
cmd_args = " ".join(sys.argv)+'\n'
amptklib.log.debug(cmd_args)
print "-------------------------------------------------------"
#initialize script, log system info and usearch version
amptklib.SystemInfo()
amptklib.versionDependencyChecks('usearch9')
#check dependencies
programs = ['Rscript', 'vsearch']
amptklib.CheckDependencies(programs)
Rversions = amptklib.checkRversion()
if Rversions[3] == '0.0.0':
amptklib.log.info("R v%s installed, LULU not installed will try to install automatically at runtime")
else:
amptklib.log.info("R v%s; LULU v%s" % (Rversions[0], Rversions[3]))

#this is a simple wrapper for an R script so easier to run from amptk menu
tmpdir = 'lulu_'+str(os.getpid())
if not os.path.isdir(tmpdir):
os.makedirs(tmpdir)

#generate the match list using the minimum match pident
match_file = os.path.join(tmpdir, 'match_file.txt')
amptklib.log.info("Loading {:,} OTUs".format(amptklib.countfasta(args.fasta)))
amptklib.log.info("Generating pairwise percent identity between OTUs using VSEARCH at {:}% identity".format(args.min_match))
cmd = ['vsearch', '--usearch_global', os.path.abspath(args.fasta), '--db', os.path.abspath(args.fasta), '--self', '--id', str(args.min_match / 100), '--iddef', '1', '--userout', match_file, '--userfields', 'query+target+id', '--maxaccepts', '0', '--query_cov', '.9', '--maxhits', '10']
amptklib.runSubprocess(cmd, amptklib.log)

#now run LULU in R
LULU_log = os.path.join(tmpdir, 'LULU-R.log')
lulu_otu_table = base + '.lulu.otu_table.txt'
dropList = os.path.join(tmpdir, 'droplist.txt')
MapData = base + '.lulu.otu-map.txt'
amptklib.log.info("Running LULU algorithm")
cmd = ['Rscript', '--vanilla', luluScript, os.path.abspath(args.otu_table), os.path.abspath(match_file), args.min_ratio_type, str(args.min_ratio), str(args.min_match), str(args.min_relative_cooccurence / 100), lulu_otu_table, dropList, MapData]
amptklib.runSubprocess4(cmd, amptklib.log, LULU_log)

#get updated OTUs
remove = []
with open(dropList, 'rU') as dropped:
for line in dropped:
remove.append(line.rstrip())
lulu_otus = base + '.lulu.otus.fa'
with open(lulu_otus, 'w') as output:
with open(args.fasta, 'rU') as infasta:
for record in SeqIO.parse(infasta, 'fasta'):
if not record.id in remove:
output.write('>%s\n%s\n' % (record.id, record.seq))
amptklib.log.info("LULU has merged {:,} OTUs, output data contains {:,} OTUs".format(len(remove), amptklib.countfasta(lulu_otus)))
amptklib.log.info("LULU OTU table post processing finished\n\
----------------------------------\n\
OTU table: {:}\n\
OTU FASTA: {:}\n\
LULU map: {:}\n\
----------------------------------".format(lulu_otu_table,lulu_otus, MapData))
if 'win32' in sys.platform:
print "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (lulu_otus, lulu_otu_table)
else:
print colr.WARN + "\nExample of next cmd:" + colr.END + " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (lulu_otus, lulu_otu_table)
if not args.debug:
if os.path.isdir(tmpdir):
shutil.rmtree(tmpdir)
print "-------------------------------------------------------"
8 changes: 6 additions & 2 deletions util/check_version.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@ if (!is.installed("phyloseq")){
} else {
phyloseqversion <- packageVersion("phyloseq")
}
if (!is.installed("lulu")){
luluversion <- '0.0.0'
} else {
luluversion <- packageVersion("lulu")
}
parts <- strsplit(Rversion, ' ')
Rvers <- parts[[1]][3]
output <- paste(Rvers,dadaversion, phyloseqversion, sep=',')
output <- paste(Rvers, dadaversion, phyloseqversion, luluversion, sep=',')
cat(output,"\n")

Loading

0 comments on commit 8cc9b37

Please sign in to comment.