Skip to content

Commit

Permalink
Add SNV as sub-command to the command line and handle errors related …
Browse files Browse the repository at this point in the history
…to empty windows (#58)

* Add option for SNV calling
If, e.g., one wants to call SNVs with a different value for sigma,
re-computing the local haplotypes would be redundant

* Add option for threshold on the read coverage per window of the read alignment
* [minor] flush output file
  • Loading branch information
sposadac committed Nov 29, 2018
1 parent 081fe96 commit 1a3807a
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 61 deletions.
13 changes: 9 additions & 4 deletions src/cpp/b2w.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ adapted from samtools/calDep.c
// data for fetch_func and pileup_func
typedef struct
{
int win, inc, min_overlap, max;
int win, inc, min_overlap, max, cov_thrd;
bool skip_indel;
} paramstruct_t;

Expand Down Expand Up @@ -102,15 +102,17 @@ int main(int argc, char* argv[])
201/3, // incr = size / shifts
(int) std::round(201. * 0.85), // min_overlap = size * win_min_ext
10000 / 201, // max_c = max_coverage / size
0, // coverage threshold
false // (historically default behaviour of shorah is to never skip insertions)
};

char help_string[] =
"\nUsage: b2w [options] <in.bam> <in.fasta> region\n\nOptions:\n\t-w: window length "
"(INT)\n\t-i: increment (INT)\n\t-m: minimum overlap (INT)\n\t-x: max reads starting at a "
"position (INT)\n\t-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n";
"position (INT)\n\t-c: coverage threshold. Omit windows with low coverage (INT)\n\t"
"-d: drop SNVs that are adjacent to insertions/deletions (alternate behaviour)\n\t-h: show this help\n\n";

while ((c = getopt(argc, argv, "w:i:m:x:dh")) != EOF) {
while ((c = getopt(argc, argv, "w:i:m:x:c:dh")) != EOF) {
switch (c) {
case 'w':
param.win = atof(optarg);
Expand All @@ -124,6 +126,9 @@ int main(int argc, char* argv[])
case 'x':
param.max = atof(optarg);
break;
case 'c':
param.cov_thrd = atof(optarg);
break;
case 'd':
// brings fil's weird behaviour into b2w
param.skip_indel = true;
Expand Down Expand Up @@ -297,7 +302,7 @@ int main(int argc, char* argv[])
}
outFile.close();

if (cov > 0) { // ignore empty windows
if (cov > param.cov_thrd) { // ignore windows with low-coverage
covOut << filename << "\t" // write out coverage information
<< header->target_name[tid] << "\t" << win_b + 1 << "\t"
<< win_e + 1 << "\t" << cov << "\n";
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/dpm_sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ void read_data(char* filename, std::ofstream& out_file)
fclose(data);

if (n == 1) {
out_file << "Nothing to do, have only one read... (n == 1)\n";
out_file << "Nothing to do, have only one read... (n == 1)" << std::endl;
exit(0);
}

Expand Down
32 changes: 20 additions & 12 deletions src/shorah/amplicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@
diri_exe = shutil.which('diri_sampler')
b2w_exe = shutil.which('b2w')
if not (diri_exe and b2w_exe):
logging.error('Executables b2w and diri_sampler not found, compile first.')
logging.error(
'Executables b2w and diri_sampler not found, compile first.')
sys.exit('Executables b2w and diri_sampler not found, compile first.')

win_min_ext = 0.95
Expand All @@ -79,7 +80,8 @@ def run_child(exe_name, arg_string):
retcode = subprocess.call(exe_name + arg_string, shell=True)
if retcode > 0:
logging.error(exe_name + arg_string)
logging.error("Child %s terminated by signal %s", exe_name, retcode)
logging.error("Child %s terminated by signal %s",
exe_name, retcode)
else:
logging.debug("Child %s returned %i", exe_name, retcode)
except OSError as ee:
Expand All @@ -93,7 +95,8 @@ def run_diagnostics(window_file, reads):
"""
import warnings

stem = re.match(r'^(?P<stem>.*).reads', window_file).group('stem') # greedy re match to handle situation where '.reads' appears in the ID
# greedy re match to handle situation where '.reads' appears in the ID
stem = re.match(r'^(?P<stem>.*).reads', window_file).group('stem')
smp_file = stem + '.smp'
dbg_file = stem + '.dbg'
del stem
Expand Down Expand Up @@ -196,6 +199,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords):
plt.savefig('entropy.pdf')
del fig


def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
"""Parse reads to have their length distribution and compute the
trimmed mean read length"""
Expand Down Expand Up @@ -313,7 +317,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
return high_ent_start, high_ent_stop


#def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
# def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
# alpha=0.5, s=0.01, region='', diversity=False):
def main(args):
"""
Expand All @@ -327,17 +331,20 @@ def main(args):
alpha = args.a
seed = args.seed
sigma = args.sigma
cov_thrd = args.cov_thrd
diversity = args.diversity
min_overlap = args.min_overlap
ignore_indels = args.ignore_indels;
ignore_indels = args.ignore_indels

logging.info(' '.join(sys.argv))
# info on reference and region if given, or discover high entropy one
ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0]
ref_name = ref_seq.id
if region:
reg_bound = re.search(r':(?P<start>\d+)-(?P<stop>\d+)$', region) # handles situation where ':' or '-' appears in the ID
reg_start, reg_stop = int(reg_bound.group('start')), int(reg_bound.group('stop'))
# handles situation where ':' or '-' appears in the ID
reg_bound = re.search(r':(?P<start>\d+)-(?P<stop>\d+)$', region)
reg_start, reg_stop = int(reg_bound.group(
'start')), int(reg_bound.group('stop'))
ref_length = reg_stop - reg_start + 1
del reg_bound
elif region == '' and diversity:
Expand All @@ -354,16 +361,16 @@ def main(args):
# output the reads, aligned to the amplicon
d = ' -d' if ignore_indels else ''

b2w_args = ' -i 0 -w %d -m %d -x %d%s %s %s %s' % \
(ref_length, int(min_overlap * ref_length),
max_coverage, d, in_bam, in_fasta, region)
b2w_args = ' -i 0 -w %d -m %d -x %d -c %i%s %s %s %s' % (ref_length, int(
min_overlap * ref_length), max_coverage, cov_thrd, d, in_bam, in_fasta, region)
ret_b2w = run_child(shlex.quote(b2w_exe), b2w_args)
logging.debug('b2w returned %d', ret_b2w)

# run diri_sampler on the aligned reads
win_file = 'w-%s-%u-%u.reads.fas' % (ref_name, reg_start, reg_stop)
# TODO clean ref_name of special caracters - that would be an alternative to processing everything with regex down the line
# BUG the solution currently used by ShoRAH can still fail when path '/' (or on windows '\\' and ':') characters are present in the ref_seq.id
# BUG the solution currently used by ShoRAH can still fail when path '/'
# (or on windows '\\' and ':') characters are present in the ref_seq.id
h = list(open('coverage.txt'))[0]
n_reads = int(h.split()[-1])
assert os.path.exists(win_file), 'window file %s not found' % win_file
Expand All @@ -377,4 +384,5 @@ def main(args):
run_diagnostics(win_file, n_reads)

# run snv.py to parse single nucleotide variants
shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=sigma, increment=1)
args.increment = 1
shorah_snv.main(args)
92 changes: 63 additions & 29 deletions src/shorah/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,81 +59,114 @@
sys.modules["shorah"] = mod
#from common import hcv_map, hiv_map, org_dict, wobbles
#from stats import (genome_coverage, start_stop_coverage)
#else:
# else:
#from .common import hcv_map, hiv_map, org_dict, wobbles
#from .stats import (genome_coverage, start_stop_coverage)

# each subcommand takes one of these functions as default


def shotgun_run(args):
"""Default function for command line parser."""
from shorah import shotgun
shotgun.main(args)


def amplicon_run(args):
"""Default function for command line parser."""
from shorah import amplicon
amplicon.main(args)


def snv_run(args):
from shorah import shorah_snv
shorah_snv.main(args)


def main():
"""Parse command line, run default functions."""
import argparse
# parse command line
# create the top-level parser
parser = argparse.ArgumentParser(usage='%(prog)s <command> [options]',
epilog="Run `shorah subcommand -h` for more help")
parser.add_argument('-v', '--version', action='version', version=__version__)
version_parser = argparse.ArgumentParser(add_help=False)

version_parser.add_argument(
'-v', '--version', action='version', version=__version__)

parent_parser = argparse.ArgumentParser(add_help=False)

required = parent_parser.add_argument_group('required arguments')

parser.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
dest="b", help="sorted bam format alignment file")
required.add_argument("-b", "--bam", metavar='BAM', required=True,
type=str, dest="b", help="sorted bam format alignment file")

parser.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
dest="f", help="reference genome in fasta format")
required.add_argument("-f", "--fasta", metavar='REF', required=True,
type=str, dest="f", help="reference genome in fasta format")

parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float,
dest="a", default=0.1, help="alpha in dpm sampling")
parent_parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False,
type=float, dest="a", default=0.1, help="alpha in dpm sampling")

parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'")
parent_parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'")

parser.add_argument("-R", "--seed", metavar='INT', required=False, type=int,
dest="seed", default=None, help="set seed for reproducible results")
parent_parser.add_argument("-R", "--seed", metavar='INT', required=False,
type=int, dest="seed", default=None, help="set seed for reproducible results")

parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
default=10000, dest="max_coverage", help="approximate max coverage allowed")
parent_parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
default=10000, dest="max_coverage", help="approximate max coverage allowed")

parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, type=float,
dest="sigma", help="sigma value to use when calling SNVs")
parent_parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01,
type=float, dest="sigma", help="sigma value to use when calling SNVs")

parser.add_argument("-I", "--ignore_indels", action="store_true", default=False,
dest="ignore_indels", help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)")
parent_parser.add_argument("-I", "--ignore_indels", action="store_true", default=False, dest="ignore_indels",
help="ignore SNVs adjacent to insertions/deletions\n(legacy behaviour of 'fil', ignore this option if you don't understand)")

subparsers = parser.add_subparsers(help='available sub-commands')
coverage_parser = argparse.ArgumentParser(add_help=False)

coverage_parser.add_argument("-c", "--win_coverage", metavar='INT', default=0, type=int,
dest='cov_thrd', help='coverage threshold. Omit windows with low coverage')

parser = argparse.ArgumentParser(
usage='%(prog)s <subcommand> [options]', epilog="Run `shorah subcommand -h` for more help", parents=[version_parser])

subparsers = parser.add_subparsers(
title='sub-commands', help='available sub-commands')

# create the parser for command "shotgun"
parser_shotgun = subparsers.add_parser('shotgun', help='run local analysis in shotgun mode')
parser_shotgun = subparsers.add_parser(
'shotgun', help='run local analysis in shotgun mode', parents=[version_parser, parent_parser, coverage_parser])

parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int,
dest="w", default=201, help="window size")
parser_shotgun.add_argument("-w", "--windowsize", metavar='INT',
required=False, type=int, dest="w", default=201, help="window size")

parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int,
default=3, dest="win_shifts", help="number of window shifts")
parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False,
type=int, default=3, dest="win_shifts", help="number of window shifts")

parser_shotgun.add_argument("-k", "--keep_files", required=False, action='store_true',
default=True, dest="keep_files", help="keep all intermediate files")

parser_shotgun.set_defaults(func=shotgun_run)

parser_amplicon = subparsers.add_parser('amplicon', help='run local analysis in amplicon mode')
parser_amplicon = subparsers.add_parser(
'amplicon', help='run local analysis in amplicon mode', parents=[version_parser, parent_parser, coverage_parser])

parser_amplicon.add_argument("-d", "--diversity", action="store_true", default=False,
dest="diversity", help="detect the highest entropy region and run there")

parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, type=float,
dest="min_overlap", help="fraction of read overlap to be included")
parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95,
type=float, dest="min_overlap", help="fraction of read overlap to be included")

parser_amplicon.set_defaults(func=amplicon_run)

# create the parser for command "snv"
parser_snv = subparsers.add_parser(
'snv', help='run single-nucleotide-variant calling', parents=[version_parser, parent_parser])

parser_snv.add_argument("-i", "--increment", metavar='INT', default=1, type=int, required=False,
dest="increment", help="value of increment to use when calling\nSNVs (1 used in amplicon mode)")

parser_snv.set_defaults(func=snv_run)

# exit so that log file is not written
if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help':
parser.print_help()
Expand All @@ -152,5 +185,6 @@ def main():
args = parser.parse_args()
args.func(args)


if __name__ == "__main__": # and __package__ is None:
main()
11 changes: 9 additions & 2 deletions src/shorah/shorah_snv.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,13 +348,20 @@ def BH(p_vals, n):
return q_vals_l


def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, ignore_indels=False):
def main(args):
'''main code
'''
from Bio import SeqIO
import csv
import inspect

reference = args.f
bam_file = args.b
sigma = args.sigma
increment = args.increment
max_coverage = args.max_coverage
ignore_indels = args.ignore_indels

logging.info(str(inspect.getfullargspec(main)))
ref_m = dict([[s.id, str(s.seq).upper()]
for s in SeqIO.parse(reference, 'fasta')])
Expand All @@ -376,7 +383,7 @@ def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000, igno
a = ' -a' if increment == 1 else ''
# run strand bias filter, output in SNVs_%sigma.txt
retcode_m = sb_filter(bam_file, sigma, amplimode=a, drop_indels=d,
max_coverage=max_coverage)
max_coverage=max_coverage)
if retcode_m is not 0:
logging.error('sb_filter exited with error %d', retcode_m)
sys.exit()
Expand Down
Loading

0 comments on commit 1a3807a

Please sign in to comment.