diff --git a/src/cpp/b2w.cpp b/src/cpp/b2w.cpp index c416184..c41eed6 100644 --- a/src/cpp/b2w.cpp +++ b/src/cpp/b2w.cpp @@ -51,7 +51,7 @@ adapted from samtools/calDep.c // data for fetch_func and pileup_func typedef struct { - int win, inc, min_overlap, max; + int win, inc, min_overlap, max, cov_thrd; bool skip_indel; } paramstruct_t; @@ -102,15 +102,17 @@ int main(int argc, char* argv[]) 201/3, // incr = size / shifts (int) std::round(201. * 0.85), // min_overlap = size * win_min_ext 10000 / 201, // max_c = max_coverage / size + 0, // coverage threshold false // (historically default behaviour of shorah is to never skip insertions) }; char help_string[] = "\nUsage: b2w [options] region\n\nOptions:\n\t-w: window length " "(INT)\n\t-i: increment (INT)\n\t-m: minimum overlap (INT)\n\t-x: max reads starting at a " - "position (INT)\n\t-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n"; + "position (INT)\n\t-c: coverage threshold. Omit windows with low coverage (INT)\n\t" + "-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n"; - while ((c = getopt(argc, argv, "w:i:m:x:dh")) != EOF) { + while ((c = getopt(argc, argv, "w:i:m:x:c:dh")) != EOF) { switch (c) { case 'w': param.win = atof(optarg); @@ -124,6 +126,9 @@ int main(int argc, char* argv[]) case 'x': param.max = atof(optarg); break; + case 'c': + param.cov_thrd = atof(optarg); + break; case 'd': // brings fil's weird behaviour into b2w param.skip_indel = true; @@ -297,7 +302,7 @@ int main(int argc, char* argv[]) } outFile.close(); - if (cov > 0) { // ignore empty windows + if (cov > param.cov_thrd) { // ignore windows with low-coverage covOut << filename << "\t" // write out coverage information << header->target_name[tid] << "\t" << win_b + 1 << "\t" << win_e + 1 << "\t" << cov << "\n"; diff --git a/src/cpp/dpm_sampler.cpp b/src/cpp/dpm_sampler.cpp index 603f26a..2f9dfd6 100644 --- a/src/cpp/dpm_sampler.cpp +++ b/src/cpp/dpm_sampler.cpp @@ -558,7 +558,7 @@ void read_data(char* filename, std::ofstream& out_file) fclose(data); if (n == 1) { - out_file << "Nothing to do, have only one read... (n == 1)\n"; + out_file << "Nothing to do, have only one read... (n == 1)" << std::endl; exit(0); } diff --git a/src/shorah/amplicon.py b/src/shorah/amplicon.py index 094aae2..2d6d6a3 100644 --- a/src/shorah/amplicon.py +++ b/src/shorah/amplicon.py @@ -59,7 +59,8 @@ diri_exe = shutil.which('diri_sampler') b2w_exe = shutil.which('b2w') if not (diri_exe and b2w_exe): - logging.error('Executables b2w and diri_sampler not found, compile first.') + logging.error( + 'Executables b2w and diri_sampler not found, compile first.') sys.exit('Executables b2w and diri_sampler not found, compile first.') win_min_ext = 0.95 @@ -79,7 +80,8 @@ def run_child(exe_name, arg_string): retcode = subprocess.call(exe_name + arg_string, shell=True) if retcode > 0: logging.error(exe_name + arg_string) - logging.error("Child %s terminated by signal %s", exe_name, retcode) + logging.error("Child %s terminated by signal %s", + exe_name, retcode) else: logging.debug("Child %s returned %i", exe_name, retcode) except OSError as ee: @@ -93,7 +95,8 @@ def run_diagnostics(window_file, reads): """ import warnings - stem = re.match(r'^(?P.*).reads', window_file).group('stem') # greedy re match to handle situation where '.reads' appears in the ID + # greedy re match to handle situation where '.reads' appears in the ID + stem = re.match(r'^(?P.*).reads', window_file).group('stem') smp_file = stem + '.smp' dbg_file = stem + '.dbg' del stem @@ -196,6 +199,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords): plt.savefig('entropy.pdf') del fig + def highest_entropy(bam_file, fasta_file, ent_sel='relative'): """Parse reads to have their length distribution and compute the trimmed mean read length""" @@ -313,7 +317,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'): return high_ent_start, high_ent_stop -#def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000, +# def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000, # alpha=0.5, s=0.01, region='', diversity=False): def main(args): """ @@ -327,17 +331,20 @@ def main(args): alpha = args.a seed = args.seed sigma = args.sigma + cov_thrd = args.cov_thrd diversity = args.diversity min_overlap = args.min_overlap - ignore_indels = args.ignore_indels; + ignore_indels = args.ignore_indels logging.info(' '.join(sys.argv)) # info on reference and region if given, or discover high entropy one ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0] ref_name = ref_seq.id if region: - reg_bound = re.search(r':(?P\d+)-(?P\d+)$', region) # handles situation where ':' or '-' appears in the ID - reg_start, reg_stop = int(reg_bound.group('start')), int(reg_bound.group('stop')) + # handles situation where ':' or '-' appears in the ID + reg_bound = re.search(r':(?P\d+)-(?P\d+)$', region) + reg_start, reg_stop = int(reg_bound.group( + 'start')), int(reg_bound.group('stop')) ref_length = reg_stop - reg_start + 1 del reg_bound elif region == '' and diversity: @@ -354,16 +361,16 @@ def main(args): # output the reads, aligned to the amplicon d = ' -d' if ignore_indels else '' - b2w_args = ' -i 0 -w %d -m %d -x %d%s %s %s %s' % \ - (ref_length, int(min_overlap * ref_length), - max_coverage, d, in_bam, in_fasta, region) + b2w_args = ' -i 0 -w %d -m %d -x %d -c %i%s %s %s %s' % (ref_length, int( + min_overlap * ref_length), max_coverage, cov_thrd, d, in_bam, in_fasta, region) ret_b2w = run_child(shlex.quote(b2w_exe), b2w_args) logging.debug('b2w returned %d', ret_b2w) # run diri_sampler on the aligned reads win_file = 'w-%s-%u-%u.reads.fas' % (ref_name, reg_start, reg_stop) # TODO clean ref_name of special caracters - that would be an alternative to processing everything with regex down the line - # BUG the solution currently used by ShoRAH can still fail when path '/' (or on windows '\\' and ':') characters are present in the ref_seq.id + # BUG the solution currently used by ShoRAH can still fail when path '/' + # (or on windows '\\' and ':') characters are present in the ref_seq.id h = list(open('coverage.txt'))[0] n_reads = int(h.split()[-1]) assert os.path.exists(win_file), 'window file %s not found' % win_file @@ -377,4 +384,5 @@ def main(args): run_diagnostics(win_file, n_reads) # run snv.py to parse single nucleotide variants - shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=sigma, increment=1) + args.increment = 1 + shorah_snv.main(args) diff --git a/src/shorah/cli.py b/src/shorah/cli.py index 459e7f1..0abd840 100644 --- a/src/shorah/cli.py +++ b/src/shorah/cli.py @@ -59,81 +59,114 @@ sys.modules["shorah"] = mod #from common import hcv_map, hiv_map, org_dict, wobbles #from stats import (genome_coverage, start_stop_coverage) -#else: +# else: #from .common import hcv_map, hiv_map, org_dict, wobbles #from .stats import (genome_coverage, start_stop_coverage) # each subcommand takes one of these functions as default + def shotgun_run(args): """Default function for command line parser.""" from shorah import shotgun shotgun.main(args) + def amplicon_run(args): """Default function for command line parser.""" from shorah import amplicon amplicon.main(args) + +def snv_run(args): + from shorah import shorah_snv + shorah_snv.main(args) + + def main(): """Parse command line, run default functions.""" import argparse # parse command line # create the top-level parser - parser = argparse.ArgumentParser(usage='%(prog)s [options]', - epilog="Run `shorah subcommand -h` for more help") - parser.add_argument('-v', '--version', action='version', version=__version__) + version_parser = argparse.ArgumentParser(add_help=False) + + version_parser.add_argument( + '-v', '--version', action='version', version=__version__) + + parent_parser = argparse.ArgumentParser(add_help=False) + + required = parent_parser.add_argument_group('required arguments') - parser.add_argument("-b", "--bam", metavar='BAM', required=True, type=str, - dest="b", help="sorted bam format alignment file") + required.add_argument("-b", "--bam", metavar='BAM', required=True, + type=str, dest="b", help="sorted bam format alignment file") - parser.add_argument("-f", "--fasta", metavar='REF', required=True, type=str, - dest="f", help="reference genome in fasta format") + required.add_argument("-f", "--fasta", metavar='REF', required=True, + type=str, dest="f", help="reference genome in fasta format") - parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float, - dest="a", default=0.1, help="alpha in dpm sampling") + parent_parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, + type=float, dest="a", default=0.1, help="alpha in dpm sampling") - parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, - dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'") + parent_parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, + dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'") - parser.add_argument("-R", "--seed", metavar='INT', required=False, type=int, - dest="seed", default=None, help="set seed for reproducible results") + parent_parser.add_argument("-R", "--seed", metavar='INT', required=False, + type=int, dest="seed", default=None, help="set seed for reproducible results") - parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int, - default=10000, dest="max_coverage", help="approximate max coverage allowed") + parent_parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int, + default=10000, dest="max_coverage", help="approximate max coverage allowed") - parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, type=float, - dest="sigma", help="sigma value to use when calling SNVs") + parent_parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, + type=float, dest="sigma", help="sigma value to use when calling SNVs") - parser.add_argument("-I", "--ignore_indels", action="store_true", default=False, - dest="ignore_indels", help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)") + parent_parser.add_argument("-I", "--ignore_indels", action="store_true", default=False, dest="ignore_indels", + help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)") - subparsers = parser.add_subparsers(help='available sub-commands') + coverage_parser = argparse.ArgumentParser(add_help=False) + + coverage_parser.add_argument("-c", "--win_coverage", metavar='INT', default=0, type=int, + dest='cov_thrd', help='coverage threshold. Omit windows with low coverage') + + parser = argparse.ArgumentParser( + usage='%(prog)s [options]', epilog="Run `shorah subcommand -h` for more help", parents=[version_parser]) + + subparsers = parser.add_subparsers( + title='sub-commands', help='available sub-commands') # create the parser for command "shotgun" - parser_shotgun = subparsers.add_parser('shotgun', help='run local analysis in shotgun mode') + parser_shotgun = subparsers.add_parser( + 'shotgun', help='run local analysis in shotgun mode', parents=[version_parser, parent_parser, coverage_parser]) - parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int, - dest="w", default=201, help="window size") + parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', + required=False, type=int, dest="w", default=201, help="window size") - parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int, - default=3, dest="win_shifts", help="number of window shifts") + parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, + type=int, default=3, dest="win_shifts", help="number of window shifts") parser_shotgun.add_argument("-k", "--keep_files", required=False, action='store_true', default=True, dest="keep_files", help="keep all intermediate files") parser_shotgun.set_defaults(func=shotgun_run) - parser_amplicon = subparsers.add_parser('amplicon', help='run local analysis in amplicon mode') + parser_amplicon = subparsers.add_parser( + 'amplicon', help='run local analysis in amplicon mode', parents=[version_parser, parent_parser, coverage_parser]) parser_amplicon.add_argument("-d", "--diversity", action="store_true", default=False, dest="diversity", help="detect the highest entropy region and run there") - parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, type=float, - dest="min_overlap", help="fraction of read overlap to be included") + parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, + type=float, dest="min_overlap", help="fraction of read overlap to be included") parser_amplicon.set_defaults(func=amplicon_run) + # create the parser for command "snv" + parser_snv = subparsers.add_parser( + 'snv', help='run single-nucleotide-variant calling', parents=[version_parser, parent_parser]) + + parser_snv.add_argument("-i", "--increment", metavar='INT', default=1, type=int, required=False, + dest="increment", help="value of increment to use when calling\nSNVs (1 used in amplicon mode)") + + parser_snv.set_defaults(func=snv_run) + # exit so that log file is not written if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': parser.print_help() @@ -152,5 +185,6 @@ def main(): args = parser.parse_args() args.func(args) + if __name__ == "__main__": # and __package__ is None: main() diff --git a/src/shorah/shorah_snv.py b/src/shorah/shorah_snv.py index 7eaa79a..a87b179 100644 --- a/src/shorah/shorah_snv.py +++ b/src/shorah/shorah_snv.py @@ -348,13 +348,20 @@ def BH(p_vals, n): return q_vals_l -def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, ignore_indels=False): +def main(args): '''main code ''' from Bio import SeqIO import csv import inspect + reference = args.f + bam_file = args.b + sigma = args.sigma + increment = args.increment + max_coverage = args.max_coverage + ignore_indels = args.ignore_indels + logging.info(str(inspect.getfullargspec(main))) ref_m = dict([[s.id, str(s.seq).upper()] for s in SeqIO.parse(reference, 'fasta')]) @@ -376,7 +383,7 @@ def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, igno a = ' -a' if increment == 1 else '' # run strand bias filter, output in SNVs_%sigma.txt retcode_m = sb_filter(bam_file, sigma, amplimode=a, drop_indels=d, - max_coverage=max_coverage) + max_coverage=max_coverage) if retcode_m is not 0: logging.error('sb_filter exited with error %d', retcode_m) sys.exit() diff --git a/src/shorah/shotgun.py b/src/shorah/shotgun.py index 049adbb..4a09031 100644 --- a/src/shorah/shotgun.py +++ b/src/shorah/shotgun.py @@ -58,7 +58,8 @@ diri_exe = shutil.which('diri_sampler') b2w_exe = shutil.which('b2w') if not (diri_exe and b2w_exe): - logging.error('Executables b2w and diri_sampler not found, compile first.') + logging.error( + 'Executables b2w and diri_sampler not found, compile first.') sys.exit('Executables b2w and diri_sampler not found, compile first.') ################################################# @@ -145,13 +146,13 @@ def windows(run_settings): """run b2w to make windows from bam """ import subprocess - bam, fasta, w, i, m, x, reg, ignore_indels = run_settings + bam, fasta, w, i, m, x, c, reg, ignore_indels = run_settings d = ' -d' if ignore_indels else '' #dn = sys.path[0] my_prog = shlex.quote(b2w_exe) # os.path.join(dn, 'b2w') - my_arg = ' -w %i -i %i -m %i -x %i%s %s %s %s' % \ - (w, i, m, x, d, bam, fasta, reg) + my_arg = ' -w %i -i %i -m %i -x %i -c %i%s %s %s %s' % \ + (w, i, m, x, c, d, bam, fasta, reg) try: retcode = subprocess.call(my_prog + my_arg, shell=True) @@ -176,7 +177,8 @@ def run_dpm(run_setting): filein, j, a, seed = run_setting # if cor.fas.gz exists, skip - stem = re.match(r'^(?P.*).reads', filein).group('stem') # greedy re match to handle situation where '.reads' appears in the ID + # greedy re match to handle situation where '.reads' appears in the ID + stem = re.match(r'^(?P.*).reads', filein).group('stem') corgz = 'corrected/%s.reads-cor.fas.gz' % stem if os.path.exists(corgz): logging.debug('file %s already analysed, skipping', filein) @@ -279,6 +281,7 @@ def get_prop(filename): else: return 'not found' + prop = 0 for l in h: if l.startswith('#made'): prop = int(l.split()[1]) @@ -374,7 +377,7 @@ def merge_corrected_reads(aligned_read): return(ID, merged_corrected_read) -#def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', +# def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', # max_coverage=10000, alpha=0.1, keep_files=True, seed=None): def main(args): """ @@ -395,9 +398,10 @@ def main(args): region = args.r max_coverage = args.max_coverage alpha = args.a + cov_thrd = args.cov_thrd keep_files = args.keep_files seed = args.seed - ignore_indels = args.ignore_indels; + ignore_indels = args.ignore_indels logging.info(' '.join(sys.argv)) @@ -420,8 +424,8 @@ def main(args): keep_all_files = keep_files # run b2w - retcode = windows((in_bam, in_fasta, win_length, incr, - win_min_ext * win_length, max_c, region, ignore_indels)) + retcode = windows((in_bam, in_fasta, win_length, incr, win_min_ext * + win_length, max_c, cov_thrd, region, ignore_indels)) if retcode is not 0: sys.exit('b2w run not successful') @@ -462,7 +466,10 @@ def main(args): winFile, j, a, s = i del a # in future alpha might be different on each window del s - parts = re.match(r'^w-(?P.*)-(?P\d+)-(?P\d+).reads', winFile) # greedy re match to handle situation where '.' or '-' appears in the ID + # greedy re match to handle situation where '.' or '-' appears in the + # ID + parts = re.match( + r'^w-(?P.*)-(?P\d+)-(?P\d+).reads', winFile) chrom = parts.group('chrom') beg = int(parts.group('beg')) end = int(parts.group('end')) @@ -577,7 +584,8 @@ def main(args): logging.info('All corrected reads have been merged') ccx = {} - cin_stem = re.sub(r'\.[^.]+$', r'', os.path.split(in_bam)[1]) # handle case where bamfile has no dots in name + # handle case where bamfile has no dots in name + cin_stem = re.sub(r'\.[^.]+$', r'', os.path.split(in_bam)[1]) fch = open('%s.cor.fas' % cin_stem, 'w') logging.debug('writing to file %s.cor.fas', cin_stem) for ID, seq_list in to_correct: @@ -613,8 +621,8 @@ def main(args): ph.close() logging.info('running snv.py') - shorah_snv.main(reference=in_fasta, bam_file=in_bam, - increment=win_length // win_shifts, max_coverage=max_coverage) + args.increment = win_length // win_shifts + shorah_snv.main(args) # tidy snvs try: