-
Notifications
You must be signed in to change notification settings - Fork 7
/
find-knots.py
executable file
·154 lines (124 loc) · 5.8 KB
/
find-knots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: [email protected]
#
# pylint: disable=invalid-name,missing-docstring
"""
Find highly-connected k-mers.
k-mers are output into a .stoptags file, for later use in partitioning.
% python scripts/find-knots.py <base>
"""
import argparse
import glob
import os
import textwrap
import khmer
import sys
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import info
# Counting hash parameters (used to build the temporary counting table
# that tracks k-mer visit counts during traversal).
DEFAULT_COUNTING_HT_SIZE = 3e6                # number of bytes per table
DEFAULT_COUNTING_HT_N = 4                     # number of counting hash tables

# Lump removal parameters. Probably shouldn't be changed, but who knows?
#
# explanation:
#
# We will walk EXCURSION_DISTANCE out from each tag; if we find more than
# EXCURSION_KMER_THRESHOLD kmers within that range, this will be a "big"
# excursion and we will track all k-mers visited.  If we find that any
# k-mer has been visited more than EXCURSION_KMER_COUNT_THRESHOLD times,
# we will mark it as BAD and make it a stop tag for traversal.

# don't change these!
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 200
EXCURSION_KMER_COUNT_THRESHOLD = 2
# EXCURSION_KMER_COUNT_THRESHOLD=5 # -- works ok for non-diginormed data
def get_parser():
    """Build and return the argparse parser for this script.

    Positional argument: graphbase (basename for input/output files).
    Options: -N/--n_tables, -x/--min-tablesize, --version.
    """
    epilog = """
    Load an k-mer presence table/tagset pair created by load-graph, and a set
    of pmap files created by partition-graph. Go through each pmap file,
    select the largest partition in each, and do the same kind of traversal as
    in :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the HCKs in that partition. These
    HCKs are output to <graphbase>.stoptags after each pmap file.

    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.

    This script is not very scalable and may blow up memory and die horribly.
    You should be able to use the intermediate stoptags to restart the
    process, and if you eliminate the already-processed pmap files, you can
    continue where you left off.
    """
    knot_parser = argparse.ArgumentParser(
        description="Find all highly connected k-mers.",
        epilog=textwrap.dedent(epilog),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Sizing of the temporary counting table used during traversal.
    knot_parser.add_argument(
        '--n_tables', '-N', type=int, default=DEFAULT_COUNTING_HT_N,
        help='number of k-mer counting tables to use')
    knot_parser.add_argument(
        '--min-tablesize', '-x', type=float, default=DEFAULT_COUNTING_HT_SIZE,
        help='lower bound on the size of the k-mer counting table(s)')

    knot_parser.add_argument(
        'graphbase', help='Basename for the input and output files.')
    knot_parser.add_argument(
        '--version', action='version',
        version='%(prog)s ' + khmer.__version__)
    return knot_parser
def main():
    """Drive the knot-finding pipeline: load graph + tags, then walk each
    pmap file looking for highly-connected k-mers and save them as stoptags.
    """
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    # Required inputs produced by load-graph; a pre-existing .stoptags file
    # is optional and, if present, is validated and merged in below.
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, False)

    check_space(infiles, False)

    print >>sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')
    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    # Partition maps written by partition-graph; files already renamed to
    # '.processed' (see below) are excluded, which enables restarting.
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    # NOTE(review): pmap_files[0] raises IndexError when no pmap files
    # match -- presumably an acceptable hard failure here; confirm.
    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >>sys.stderr, '---'
    print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >>sys.stderr, '---'

    # create counting hash: tracks k-mer visit counts during traversal,
    # sized from the command-line arguments.
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge each subset partition map in turn; stoptags are saved
    # after every file so an interrupted run can be resumed.
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** repartitioned size:', size

        print >>sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        # Mark this pmap file as done so a rerun skips it (glob above only
        # matches '*.pmap').
        os.rename(subset_file, subset_file + '.processed')
        # NOTE(review): index is 0-based, so this reports '(0 of N)' for the
        # first file and '(N-1 of N)' for the last -- likely meant index + 1.
        print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >>sys.stderr, 'done!'

if __name__ == '__main__':
    main()