Skip to content

Commit

Permalink
Update per-site threshold script to handle mulitple modified bases (f…
Browse files Browse the repository at this point in the history
…or new 5hmC model training).
  • Loading branch information
marcus1487 committed Nov 17, 2020
1 parent ffb2e7a commit 939b980
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 17 deletions.
6 changes: 4 additions & 2 deletions megalodon_extras/_extras_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,8 +567,10 @@ def get_parser_modified_bases_per_site_thresholds():
help='Minimum coverage (single strand) to include a site from ' +
'nanopore data. Default: %(default)d')
parser.add_argument(
'--mod-base', default='m',
help='Single letter code for the modified base. Default: %(default)s')
'--mod-bases', default='m',
help='Single letter codes for the modified base. For ' +
'mulitple alternative bases supply all single letter codes ' +
'with no spaces. Default: %(default)s')
parser.add_argument(
'--strand-offset', type=int,
help='Offset to combine stranded results. Positive value indicates ' +
Expand Down
39 changes: 24 additions & 15 deletions megalodon_extras/modified_bases_per_site_thresholds.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,17 @@ def write_thresh_worker(thresh_q, thresh_fn):

def extract_threshs_worker(
site_batches_q, thresh_q, low_cov_q, mod_db_fn, gt_cov_min, np_cov_min,
target_mod_base, strand_offset, valid_sites_used):
target_mod_bases, strand_offset, valid_sites_used):
def get_gt_cov(chrm, strand, lookup_pos):
if strand_offset is not None and strand == -1:
lookup_pos -= strand_offset
cov_strand = strand if strand_offset is None else None
return cov[(lookup_pos, cov_strand)], mod_cov[(lookup_pos, cov_strand)]

def get_pos_thresh(pos_cov, pos_mod_cov, pos_mod_data):
# TODO add new method here to determine the most likely threshold
# taking the combination of the ground truth and calibrated modified
# base scores (estimated from a sample of this data)
if pos_mod_cov == 0:
return '{:.4f}'.format(-MAX_MOD_SCORE)
elif pos_mod_cov == pos_cov:
Expand All @@ -72,14 +75,19 @@ def get_pos_thresh(pos_cov, pos_mod_cov, pos_mod_data):
for read_pos_lps in pos_stats.values():
mt_lps = list(read_pos_lps.values())
can_lp = np.log1p(-np.exp(mt_lps).sum())
for mod_dbid, mod_lp in read_pos_lps.items():
if mod_dbid == target_mod_dbid:
pos_llrs.append(can_lp - mod_lp)
valid_mt_lps = [
mod_lp for mod_dbid, mod_lp in read_pos_lps.items()
if mod_dbid in target_mod_dbids]
if len(valid_mt_lps) > 0:
# take maximum modified base log probability to match
# behavior for markup in mods.annotate_all_mods
pos_llrs.append(can_lp - max(valid_mt_lps))
gt_meth_pct = 100.0 * pos_mod_cov / pos_cov
return '{:.4f}'.format(np.percentile(pos_llrs, gt_meth_pct))

mods_db = mods.ModsDb(mod_db_fn)
target_mod_dbid = mods_db.get_mod_base_dbid(target_mod_base)
target_mod_dbids = set(mods_db.get_mod_base_dbid(tmb)
for tmb in target_mod_bases)
while True:
try:
batch = site_batches_q.get(block=True, timeout=0.1)
Expand All @@ -98,7 +106,7 @@ def get_pos_thresh(pos_cov, pos_mod_cov, pos_mod_data):
# check that mod calls are to target modbase
target_mod_cov = len(set(
read_id for read_id, mod_dbid, _ in pos_mod_data
if mod_dbid == target_mod_dbid))
if mod_dbid in target_mod_dbids))
if target_mod_cov == 0:
continue
# convert strand to string for output
Expand Down Expand Up @@ -143,7 +151,7 @@ def parse_bedmethyl_worker(

def process_all_batches(
num_proc, batch_size, gt_bed, low_cov_fn, thresh_fn, mod_db_fn,
strand_offset, gt_cov_min, np_cov_min, target_mod_base,
strand_offset, gt_cov_min, np_cov_min, target_mod_bases,
valid_sites_fn):
site_batches_q = mega_mp.CountingMPQueue(maxsize=MAX_BATCHES)
parse_bm_p = mp.Process(
Expand All @@ -158,7 +166,7 @@ def process_all_batches(
extract_threshs_ps.append(mp.Process(
target=extract_threshs_worker,
args=(site_batches_q, thresh_q, low_cov_q, mod_db_fn, gt_cov_min,
np_cov_min, target_mod_base, strand_offset,
np_cov_min, target_mod_bases, strand_offset,
valid_sites_fn is not None),
daemon=True, name='ExtractThresh{:03d}'.format(et_i)))
extract_threshs_ps[-1].start()
Expand Down Expand Up @@ -189,7 +197,7 @@ def process_all_batches(


def check_matching_attrs(
ground_truth_bed, strand_offset, mod_db_fn, target_mod_base,
ground_truth_bed, strand_offset, mod_db_fn, target_mod_bases,
limit=10000):
mods_db = mods.ModsDb(mod_db_fn)
db_strands = (1, -1) if strand_offset is None else (None, )
Expand All @@ -209,10 +217,11 @@ def check_matching_attrs(
map(str, list(cov.keys())))))
raise mh.MegaError('No overlapping contigs found.')
db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
if target_mod_base not in db_mods:
raise mh.MegaError(
'Target modified base not found in mods database ({}).'.format(
', '.join(db_mods)))
for tmb in target_mod_bases:
if tmb not in db_mods:
raise mh.MegaError((
'Target modified base, {}, not found in mods database ' +
'({}).').format(tmb, ', '.join(db_mods)))
mods_db.check_data_covering_index_exists()
mods_db.close()

Expand All @@ -230,14 +239,14 @@ def _main(args):
LOGGER.info('Checking for consistent contig names')
mod_db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
check_matching_attrs(
args.ground_truth_bed, args.strand_offset, mod_db_fn, args.mod_base)
args.ground_truth_bed, args.strand_offset, mod_db_fn, args.mod_bases)

LOGGER.info('Processing batches')
process_all_batches(
args.processes, args.batch_size, args.ground_truth_bed,
args.out_low_coverage_sites, args.out_per_site_mod_thresholds,
mod_db_fn, args.strand_offset, args.ground_truth_cov_min,
args.nanopore_cov_min, args.mod_base, args.valid_sites)
args.nanopore_cov_min, args.mod_bases, args.valid_sites)


if __name__ == '__main__':
Expand Down
9 changes: 9 additions & 0 deletions test/test_megalodon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,15 @@ megalodon_extras \
--out-filename \
${CTRL_READS}.mega_res/per_read_variant_calls.rerun.txt

sort -k1V -k2n ${NAT_READS}.mega_res/modified_bases.5mC.bed > \
${NAT_READS}.mega_res/modified_bases.5mC.sorted.bed
megalodon_extras \
modified_bases per_site_thresholds \
${NAT_READS}.mega_res/ \
${NAT_READS}.mega_res/modified_bases.5mC.sorted.bed \
--mod-bases Z \
--ground-truth-cov-min 3 --nanopore-cov-min 5

# TODO add tests for more megalodon_extras commands


Expand Down

0 comments on commit 939b980

Please sign in to comment.