Add SNV as sub-command to the command line and handle errors related …

…to empty windows (#58) * Add option for SNV calling If, e.g., one wants to call SNVs with a different value for sigma, re-computing the local haplotypes would be redundant * Add option for threshold on the read coverage per window of the read alignment * [minor] flush output file
cbg-ethz · Nov 29, 2018 · 1a3807a · 1a3807a
1 parent 081fe96
commit 1a3807a
Show file tree

Hide file tree

Showing 6 changed files with 123 additions and 61 deletions.
diff --git a/src/cpp/b2w.cpp b/src/cpp/b2w.cpp
@@ -51,7 +51,7 @@ adapted from samtools/calDep.c
 // data for fetch_func and pileup_func
 typedef struct
 {
-    int win, inc, min_overlap, max;
+    int win, inc, min_overlap, max, cov_thrd;
     bool skip_indel;
 } paramstruct_t;
 
@@ -102,15 +102,17 @@ int main(int argc, char* argv[])
         201/3, // incr = size / shifts
         (int) std::round(201. * 0.85), // min_overlap = size * win_min_ext
         10000 / 201, // max_c = max_coverage / size
+        0,  // coverage threshold
         false   // (historically default behaviour of shorah is to never skip insertions)
     };
 
     char help_string[] =
         "\nUsage: b2w [options] <in.bam> <in.fasta> region\n\nOptions:\n\t-w: window length "
         "(INT)\n\t-i: increment (INT)\n\t-m: minimum overlap (INT)\n\t-x: max reads starting at a "
-        "position (INT)\n\t-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n";
+        "position (INT)\n\t-c: coverage threshold. Omit windows with low coverage (INT)\n\t"
+        "-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n";
 
-    while ((c = getopt(argc, argv, "w:i:m:x:dh")) != EOF) {
+    while ((c = getopt(argc, argv, "w:i:m:x:c:dh")) != EOF) {
         switch (c) {
             case 'w':
                 param.win = atof(optarg);
@@ -124,6 +126,9 @@ int main(int argc, char* argv[])
             case 'x':
                 param.max = atof(optarg);
                 break;
+            case 'c':
+                param.cov_thrd = atof(optarg);
+                break;
             case 'd':
                 // brings fil's weird behaviour into b2w
                 param.skip_indel = true;
@@ -297,7 +302,7 @@ int main(int argc, char* argv[])
             }
             outFile.close();
 
-            if (cov > 0) {             // ignore empty windows
+            if (cov > param.cov_thrd) {     // ignore windows with low-coverage
                 covOut << filename << "\t"  // write out coverage information
                         << header->target_name[tid] << "\t" << win_b + 1 << "\t"
                         << win_e + 1 << "\t" << cov << "\n";

diff --git a/src/cpp/dpm_sampler.cpp b/src/cpp/dpm_sampler.cpp
@@ -558,7 +558,7 @@ void read_data(char* filename, std::ofstream& out_file)
     fclose(data);
 
     if (n == 1) {
-        out_file << "Nothing to do, have only one read... (n == 1)\n";
+        out_file << "Nothing to do, have only one read... (n == 1)" << std::endl;
         exit(0);
     }
 

diff --git a/src/shorah/amplicon.py b/src/shorah/amplicon.py
@@ -59,7 +59,8 @@
     diri_exe = shutil.which('diri_sampler')
     b2w_exe = shutil.which('b2w')
     if not (diri_exe and b2w_exe):
-        logging.error('Executables b2w and diri_sampler not found, compile first.')
+        logging.error(
+            'Executables b2w and diri_sampler not found, compile first.')
         sys.exit('Executables b2w and diri_sampler not found, compile first.')
 
 win_min_ext = 0.95
@@ -79,7 +80,8 @@ def run_child(exe_name, arg_string):
         retcode = subprocess.call(exe_name + arg_string, shell=True)
         if retcode > 0:
             logging.error(exe_name + arg_string)
-            logging.error("Child %s terminated by signal %s", exe_name, retcode)
+            logging.error("Child %s terminated by signal %s",
+                          exe_name, retcode)
         else:
             logging.debug("Child %s returned %i", exe_name, retcode)
     except OSError as ee:
@@ -93,7 +95,8 @@ def run_diagnostics(window_file, reads):
     """
     import warnings
 
-    stem = re.match(r'^(?P<stem>.*).reads', window_file).group('stem')  # greedy re match to handle situation where '.reads' appears in the ID
+    # greedy re match to handle situation where '.reads' appears in the ID
+    stem = re.match(r'^(?P<stem>.*).reads', window_file).group('stem')
     smp_file = stem + '.smp'
     dbg_file = stem + '.dbg'
     del stem
@@ -196,6 +199,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords):
     plt.savefig('entropy.pdf')
     del fig
 
+
 def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
     """Parse reads to have their length distribution and compute the
     trimmed mean read length"""
@@ -313,7 +317,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
     return high_ent_start, high_ent_stop
 
 
-#def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
+# def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
 #         alpha=0.5, s=0.01, region='', diversity=False):
 def main(args):
     """
@@ -327,17 +331,20 @@ def main(args):
     alpha = args.a
     seed = args.seed
     sigma = args.sigma
+    cov_thrd = args.cov_thrd
     diversity = args.diversity
     min_overlap = args.min_overlap
-    ignore_indels = args.ignore_indels;
+    ignore_indels = args.ignore_indels
 
     logging.info(' '.join(sys.argv))
     # info on reference and region if given, or discover high entropy one
     ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0]
     ref_name = ref_seq.id
     if region:
-        reg_bound = re.search(r':(?P<start>\d+)-(?P<stop>\d+)$', region)  # handles situation where ':' or '-' appears in the ID
-        reg_start, reg_stop = int(reg_bound.group('start')), int(reg_bound.group('stop'))
+        # handles situation where ':' or '-' appears in the ID
+        reg_bound = re.search(r':(?P<start>\d+)-(?P<stop>\d+)$', region)
+        reg_start, reg_stop = int(reg_bound.group(
+            'start')), int(reg_bound.group('stop'))
         ref_length = reg_stop - reg_start + 1
         del reg_bound
     elif region == '' and diversity:
@@ -354,16 +361,16 @@ def main(args):
     # output the reads, aligned to the amplicon
     d = ' -d' if ignore_indels else ''
 
-    b2w_args = ' -i 0 -w %d -m %d -x %d%s %s %s %s' % \
-        (ref_length, int(min_overlap * ref_length),
-         max_coverage, d, in_bam, in_fasta, region)
+    b2w_args = ' -i 0 -w %d -m %d -x %d -c %i%s %s %s %s' % (ref_length, int(
+        min_overlap * ref_length), max_coverage, cov_thrd, d, in_bam, in_fasta, region)
     ret_b2w = run_child(shlex.quote(b2w_exe), b2w_args)
     logging.debug('b2w returned %d', ret_b2w)
 
     # run diri_sampler on the aligned reads
     win_file = 'w-%s-%u-%u.reads.fas' % (ref_name, reg_start, reg_stop)
     # TODO clean ref_name of special caracters - that would be an alternative to processing everything with regex down the line
-    # BUG the solution currently used by ShoRAH can still fail when path '/' (or on windows '\\' and ':') characters are present in the ref_seq.id
+    # BUG the solution currently used by ShoRAH can still fail when path '/'
+    # (or on windows '\\' and ':') characters are present in the ref_seq.id
     h = list(open('coverage.txt'))[0]
     n_reads = int(h.split()[-1])
     assert os.path.exists(win_file), 'window file %s not found' % win_file
@@ -377,4 +384,5 @@ def main(args):
     run_diagnostics(win_file, n_reads)
 
     # run snv.py to parse single nucleotide variants
-    shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=sigma, increment=1)
+    args.increment = 1
+    shorah_snv.main(args)
diff --git a/src/shorah/cli.py b/src/shorah/cli.py
@@ -59,81 +59,114 @@
         sys.modules["shorah"] = mod
         #from common import hcv_map, hiv_map, org_dict, wobbles
         #from stats import (genome_coverage, start_stop_coverage)
-#else:
+# else:
     #from .common import hcv_map, hiv_map, org_dict, wobbles
     #from .stats import (genome_coverage, start_stop_coverage)
 
 # each subcommand takes one of these functions as default
 
+
 def shotgun_run(args):
     """Default function for command line parser."""
     from shorah import shotgun
     shotgun.main(args)
 
+
 def amplicon_run(args):
     """Default function for command line parser."""
     from shorah import amplicon
     amplicon.main(args)
 
+
+def snv_run(args):
+    from shorah import shorah_snv
+    shorah_snv.main(args)
+
+
 def main():
     """Parse command line, run default functions."""
     import argparse
     # parse command line
     # create the top-level parser
-    parser = argparse.ArgumentParser(usage='%(prog)s <command> [options]',
-                                     epilog="Run `shorah subcommand -h` for more help")
-    parser.add_argument('-v', '--version', action='version', version=__version__)
+    version_parser = argparse.ArgumentParser(add_help=False)
+
+    version_parser.add_argument(
+        '-v', '--version', action='version', version=__version__)
+
+    parent_parser = argparse.ArgumentParser(add_help=False)
+
+    required = parent_parser.add_argument_group('required arguments')
 
-    parser.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
-                        dest="b", help="sorted bam format alignment file")
+    required.add_argument("-b", "--bam", metavar='BAM', required=True,
+                          type=str, dest="b", help="sorted bam format alignment file")
 
-    parser.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
-                        dest="f", help="reference genome in fasta format")
+    required.add_argument("-f", "--fasta", metavar='REF', required=True,
+                          type=str, dest="f", help="reference genome in fasta format")
 
-    parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float,
-                        dest="a", default=0.1, help="alpha in dpm sampling")
+    parent_parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False,
+                               type=float, dest="a", default=0.1, help="alpha in dpm sampling")
 
-    parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
-                        dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'")
+    parent_parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
+                               dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'")
 
-    parser.add_argument("-R", "--seed", metavar='INT', required=False, type=int,
-                        dest="seed", default=None, help="set seed for reproducible results")
+    parent_parser.add_argument("-R", "--seed", metavar='INT', required=False,
+                               type=int, dest="seed", default=None, help="set seed for reproducible results")
 
-    parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
-                        default=10000, dest="max_coverage", help="approximate max coverage allowed")
+    parent_parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
+                               default=10000, dest="max_coverage", help="approximate max coverage allowed")
 
-    parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, type=float,
-                        dest="sigma", help="sigma value to use when calling SNVs")
+    parent_parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01,
+                               type=float, dest="sigma", help="sigma value to use when calling SNVs")
 
-    parser.add_argument("-I", "--ignore_indels", action="store_true", default=False,
-                        dest="ignore_indels", help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)")
+    parent_parser.add_argument("-I", "--ignore_indels", action="store_true", default=False, dest="ignore_indels",
+                               help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)")
 
-    subparsers = parser.add_subparsers(help='available sub-commands')
+    coverage_parser = argparse.ArgumentParser(add_help=False)
+
+    coverage_parser.add_argument("-c", "--win_coverage", metavar='INT', default=0, type=int,
+                                 dest='cov_thrd', help='coverage threshold. Omit windows with low coverage')
+
+    parser = argparse.ArgumentParser(
+        usage='%(prog)s <subcommand> [options]', epilog="Run `shorah subcommand -h` for more help", parents=[version_parser])
+
+    subparsers = parser.add_subparsers(
+        title='sub-commands', help='available sub-commands')
 
     # create the parser for command "shotgun"
-    parser_shotgun = subparsers.add_parser('shotgun', help='run local analysis in shotgun mode')
+    parser_shotgun = subparsers.add_parser(
+        'shotgun', help='run local analysis in shotgun mode', parents=[version_parser, parent_parser, coverage_parser])
 
-    parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int,
-                                dest="w", default=201, help="window size")
+    parser_shotgun.add_argument("-w", "--windowsize", metavar='INT',
+                                required=False, type=int, dest="w", default=201, help="window size")
 
-    parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int,
-                                default=3, dest="win_shifts", help="number of window shifts")
+    parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False,
+                                type=int, default=3, dest="win_shifts", help="number of window shifts")
 
     parser_shotgun.add_argument("-k", "--keep_files", required=False, action='store_true',
                                 default=True, dest="keep_files", help="keep all intermediate files")
 
     parser_shotgun.set_defaults(func=shotgun_run)
 
-    parser_amplicon = subparsers.add_parser('amplicon', help='run local analysis in amplicon mode')
+    parser_amplicon = subparsers.add_parser(
+        'amplicon', help='run local analysis in amplicon mode', parents=[version_parser, parent_parser, coverage_parser])
 
     parser_amplicon.add_argument("-d", "--diversity", action="store_true", default=False,
                                  dest="diversity", help="detect the highest entropy region and run there")
 
-    parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, type=float,
-                                 dest="min_overlap", help="fraction of read overlap to be included")
+    parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95,
+                                 type=float, dest="min_overlap", help="fraction of read overlap to be included")
 
     parser_amplicon.set_defaults(func=amplicon_run)
 
+    # create the parser for command "snv"
+    parser_snv = subparsers.add_parser(
+        'snv', help='run single-nucleotide-variant calling', parents=[version_parser, parent_parser])
+
+    parser_snv.add_argument("-i", "--increment", metavar='INT', default=1, type=int, required=False,
+                            dest="increment", help="value of increment to use when calling\nSNVs (1 used in  amplicon mode)")
+
+    parser_snv.set_defaults(func=snv_run)
+
     # exit so that log file is not written
     if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help':
         parser.print_help()
@@ -152,5 +185,6 @@ def main():
     args = parser.parse_args()
     args.func(args)
 
+
 if __name__ == "__main__":  # and __package__ is None:
     main()
diff --git a/src/shorah/shorah_snv.py b/src/shorah/shorah_snv.py
@@ -348,13 +348,20 @@ def BH(p_vals, n):
     return q_vals_l
 
 
-def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, ignore_indels=False):
+def main(args):
     '''main code
     '''
     from Bio import SeqIO
     import csv
     import inspect
 
+    reference = args.f
+    bam_file = args.b
+    sigma = args.sigma
+    increment = args.increment
+    max_coverage = args.max_coverage
+    ignore_indels = args.ignore_indels
+
     logging.info(str(inspect.getfullargspec(main)))
     ref_m = dict([[s.id, str(s.seq).upper()]
                   for s in SeqIO.parse(reference, 'fasta')])
@@ -376,7 +383,7 @@ def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, igno
     a = ' -a' if increment == 1 else ''
     # run strand bias filter, output in SNVs_%sigma.txt
     retcode_m = sb_filter(bam_file, sigma, amplimode=a, drop_indels=d,
-                            max_coverage=max_coverage)
+                          max_coverage=max_coverage)
     if retcode_m is not 0:
         logging.error('sb_filter exited with error %d', retcode_m)
         sys.exit()