@@ -40,10 +40,11 @@ THE SOFTWARE.
40
40
#include " Combinator.hpp"
41
41
42
42
43
- static const uint min_nb_kmer_count = 1000 ;
43
+ static const uint max_nb_kmer_multiplicity = 32 ;
44
+ static const uint min_nb_kmer_count = 10000 ;
44
45
45
46
static const uchar num_genomic_rate_gc_bias_bins = 1 ;
46
- static const uchar max_multiplicity = Utils::uchar_overflow;
47
+ static const uchar max_kmer_multiplicity = Utils::uchar_overflow;
47
48
static const uchar max_kmer_count = Utils::uchar_overflow;
48
49
49
50
@@ -54,15 +55,15 @@ CountDistribution::CountDistribution(const vector<Sample> & samples_in, const Op
54
55
genomic_count_distributions = vector<vector<NegativeBinomialDistribution> >(samples.size (), vector<NegativeBinomialDistribution>(num_genomic_rate_gc_bias_bins));
55
56
56
57
noise_rates = vector<double >(samples.size (), 0 );
57
- genomic_count_log_pmf_cache = vector<vector<vector<vector<double > > > >(samples.size (), vector<vector<vector<double > > >(num_genomic_rate_gc_bias_bins, vector<vector<double > >(max_multiplicity + 1 , vector<double >(max_kmer_count + 1 ))));
58
+ genomic_count_log_pmf_cache = vector<vector<vector<vector<double > > > >(samples.size (), vector<vector<vector<double > > >(num_genomic_rate_gc_bias_bins, vector<vector<double > >(max_kmer_multiplicity + 1 , vector<double >(max_kmer_count + 1 ))));
58
59
noise_count_log_pmf_cache = vector<vector<double > >(samples.size (), vector<double >(max_kmer_count + 1 ));
59
60
60
61
resetNoiseRates ();
61
62
updateGenomicCache ();
62
63
}
63
64
64
65
65
- void CountDistribution::setGenomicCountDistributions (const vector<vector<vector<KmerStats> > > & intercluster_diploid_kmer_stats , const string & output_prefix) {
66
+ void CountDistribution::setGenomicCountDistributions (const vector<vector<vector<KmerStats> > > & intercluster_kmer_stats , const string & output_prefix) {
66
67
67
68
cout << " [" << Utils::getLocalTime () << " ] Estimating genomic haploid kmer count distribution(s) from parameter kmers ...\n " << endl;
68
69
@@ -76,61 +77,58 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<
76
77
77
78
genomic_outfile << " Sample\t Mean\t Variance" << endl;
78
79
79
- assert (intercluster_diploid_kmer_stats .size () == samples.size ());
80
- assert (intercluster_diploid_kmer_stats .size () == genomic_count_distributions.size ());
80
+ assert (intercluster_kmer_stats .size () == samples.size ());
81
+ assert (intercluster_kmer_stats .size () == genomic_count_distributions.size ());
81
82
82
- for (ushort sample_idx = 0 ; sample_idx < samples.size (); sample_idx++) {
83
+ assert (num_genomic_rate_gc_bias_bins == 1 );
84
+ assert (max_nb_kmer_multiplicity <= Utils::uchar_overflow);
83
85
84
- assert (num_genomic_rate_gc_bias_bins == 1 );
86
+ for ( ushort sample_idx = 0 ; sample_idx < samples. size (); sample_idx++) {
85
87
86
- assert (intercluster_diploid_kmer_stats .at (sample_idx).size () == num_genomic_rate_gc_bias_bins);
87
- assert (intercluster_diploid_kmer_stats .at (sample_idx).size () == genomic_count_distributions.at (sample_idx).size ());
88
+ assert (intercluster_kmer_stats .at (sample_idx).size () == num_genomic_rate_gc_bias_bins);
89
+ assert (intercluster_kmer_stats .at (sample_idx).size () == genomic_count_distributions.at (sample_idx).size ());
88
90
89
91
for (ushort bias_idx = 0 ; bias_idx < num_genomic_rate_gc_bias_bins; bias_idx++) {
90
92
91
- assert (intercluster_diploid_kmer_stats .at (sample_idx).at (bias_idx).size () == static_cast < uint > (Utils::Ploidy::PLOIDY_SIZE ));
93
+ assert (intercluster_kmer_stats .at (sample_idx).at (bias_idx).size () == (Utils::uchar_overflow + 1 ));
92
94
93
- uint max_kmer_count = 0 ;
94
- ushort max_kmer_multiplicity = 0 ;
95
+ uint max_genomic_kmer_count = 0 ;
96
+ ushort max_genomic_kmer_multiplicity = 0 ;
95
97
96
- for (ushort kmer_multiplicity = 1 ; kmer_multiplicity < static_cast < uint >(Utils::Ploidy::PLOIDY_SIZE); kmer_multiplicity ++) {
98
+ for (ushort kmer_genomic_multiplicity = 1 ; kmer_genomic_multiplicity <= max_nb_kmer_multiplicity; kmer_genomic_multiplicity ++) {
97
99
98
- auto kmer_count = intercluster_diploid_kmer_stats .at (sample_idx).at (bias_idx).at (kmer_multiplicity ).getCount ();
100
+ auto kmer_genomic_count = intercluster_kmer_stats .at (sample_idx).at (bias_idx).at (kmer_genomic_multiplicity ).getCount ();
99
101
100
- if (kmer_count > max_kmer_count ) {
102
+ if (kmer_genomic_count > max_genomic_kmer_count ) {
101
103
102
- max_kmer_count = kmer_count ;
103
- max_kmer_multiplicity = kmer_multiplicity ;
104
+ max_genomic_kmer_count = kmer_genomic_count ;
105
+ max_genomic_kmer_multiplicity = kmer_genomic_multiplicity ;
104
106
}
105
107
}
106
108
107
- if (max_kmer_count < min_nb_kmer_count) {
108
-
109
- cerr << " \n ERROR: Insufficient number of kmers used for negative binomial parameters estimation for sample " << samples.at (sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count << " ); the genome used is likely too small and/or too repetitive\n " << endl;
110
- exit (1 );
111
-
112
- } else if (max_kmer_count < (min_nb_kmer_count * 10 )) {
109
+ if (max_genomic_kmer_count < min_nb_kmer_count) {
113
110
114
- cout << " \n WARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at (sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count * 10 << " ); mean and variance estimate might be biased\n " << endl;
111
+ cout << " \n WARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at (sample_idx).name << " (" << max_genomic_kmer_count << " < " << min_nb_kmer_count << " )" << endl;
112
+ cout << " WARNING: The mean and variance estimates might be biased due to the genome used being too small, too variant dense and/or too repetitive\n " << endl;
115
113
}
116
114
117
- assert (max_kmer_multiplicity > 0 );
115
+ assert (max_genomic_kmer_multiplicity > 0 );
118
116
119
- auto kmer_mean = intercluster_diploid_kmer_stats .at (sample_idx).at (bias_idx).at (max_kmer_multiplicity ).getMean ();
117
+ auto kmer_mean = intercluster_kmer_stats .at (sample_idx).at (bias_idx).at (max_genomic_kmer_multiplicity ).getMean ();
120
118
assert (kmer_mean.second );
121
119
122
- auto kmer_var = intercluster_diploid_kmer_stats .at (sample_idx).at (bias_idx).at (max_kmer_multiplicity ).getVariance ();
120
+ auto kmer_var = intercluster_kmer_stats .at (sample_idx).at (bias_idx).at (max_genomic_kmer_multiplicity ).getVariance ();
123
121
assert (kmer_var.second );
124
122
125
123
auto nb_para = NegativeBinomialDistribution::momentsToParameters (kmer_mean.first , kmer_var.first );
126
- nb_para.second /= max_kmer_multiplicity ;
124
+ nb_para.second /= max_genomic_kmer_multiplicity ;
127
125
128
126
genomic_count_distributions.at (sample_idx).at (bias_idx).setParameters (nb_para);
129
127
130
128
auto nb_mean = genomic_count_distributions.at (sample_idx).at (bias_idx).mean ();
131
129
auto nb_var = genomic_count_distributions.at (sample_idx).at (bias_idx).var ();
132
130
133
- cout << " [" << Utils::getLocalTime () << " ] Estimated negative binomial (mean = " << nb_mean << " , var = " << nb_var << " ) for sample " << samples.at (sample_idx).name << " using " << max_kmer_count << " parameter kmers (ploidy = " << max_kmer_multiplicity << " )" << endl;
131
+ cout << " [" << Utils::getLocalTime () << " ] Estimated negative binomial (mean = " << nb_mean << " , var = " << nb_var << " ) for sample " << samples.at (sample_idx).name << " using " << max_genomic_kmer_count << " parameter kmers (multiplicity = " << max_genomic_kmer_multiplicity << " )" << endl;
134
132
135
133
genomic_outfile << samples.at (sample_idx).name << " \t " << nb_mean << " \t " << nb_var << endl;
136
134
}
@@ -139,7 +137,7 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<
139
137
genomic_outfile.close ();
140
138
updateGenomicCache ();
141
139
142
- cout << " \n [" << Utils::getLocalTime () << " ] Wrote parameters to " << output_prefix << " .txt" << endl;
140
+ cout << " \n [" << Utils::getLocalTime () << " ] Wrote genomic parameters to " << output_prefix << " .txt" << endl;
143
141
}
144
142
145
143
const vector<vector<NegativeBinomialDistribution> > & CountDistribution::getGenomicCountDistributions () const {
@@ -224,9 +222,9 @@ void CountDistribution::updateGenomicCache() {
224
222
225
223
for (ushort bias_idx = 0 ; bias_idx < num_genomic_rate_gc_bias_bins; bias_idx++) {
226
224
227
- assert (genomic_count_log_pmf_cache.at (sample_idx).at (bias_idx).size () == static_cast <uint >(max_multiplicity + 1 ));
225
+ assert (genomic_count_log_pmf_cache.at (sample_idx).at (bias_idx).size () == static_cast <uint >(max_kmer_multiplicity + 1 ));
228
226
229
- for (ushort kmer_multiplicity = 0 ; kmer_multiplicity <= max_multiplicity ; kmer_multiplicity++) {
227
+ for (ushort kmer_multiplicity = 0 ; kmer_multiplicity <= max_kmer_multiplicity ; kmer_multiplicity++) {
230
228
231
229
assert (genomic_count_log_pmf_cache.at (sample_idx).at (bias_idx).at (kmer_multiplicity).size () == static_cast <uint >(max_kmer_count + 1 ));
232
230
0 commit comments