Skip to content

Commit

Permalink
bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
eric committed Mar 3, 2024
1 parent e81f223 commit cb0f5e0
Show file tree
Hide file tree
Showing 11 changed files with 94 additions and 158 deletions.
5 changes: 5 additions & 0 deletions kr2r/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ name = "classify"
path = "src/bin/classify.rs"


[[bin]]
name = "inspect"
path = "src/bin/inspect.rs"


[features]
default = ["dna"]
dna = []
Expand Down
7 changes: 3 additions & 4 deletions kr2r/benches/mmscanner_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@ use kr2r::Meros;
fn performance_test(c: &mut Criterion) {
let seq: Vec<u8> = b"ACGATCGACGACG".to_vec();
let meros = Meros::new(10, 5, None, None, None);
let mut scanner = MinimizerScanner::new(meros);
scanner.set_seq_end(&seq);
let mut scanner = MinimizerScanner::new(&seq, meros);
// 这里执行需要测试性能的操作,例如多次调用 next_minimizer
c.bench_function("next_minimizer", |b| {
c.bench_function("next", |b| {
b.iter(|| {
let _ = scanner.next_minimizer(&seq);
let _ = scanner.next();
// let _ = scanner.next_minimizer(&seq);
});
});
Expand Down
16 changes: 7 additions & 9 deletions kr2r/src/bin/build_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use kr2r::{
use kr2r::db::{generate_taxonomy, get_bits_for_taxid, process_sequence};
use std::path::PathBuf;
use std::sync::atomic::AtomicU32;
use std::time::Instant;

#[derive(Parser, Debug, Clone)]
#[clap(version, about = "build database")]
Expand Down Expand Up @@ -82,14 +83,6 @@ impl Build {
}
}

// fn set_minimizer_lca(hash: &mut CompactHashTableMut, minimizer: u64, taxid: u64, tax: &Taxonomy) {
// let mut old_value: u32 = 0;
// let mut new_value: u64 = taxid;
// // while !hash.compare_and_set(minimizer, new_value as u32, &mut old_value) {
// // new_value = tax.lowest_common_ancestor(old_value as u64, taxid);
// // }
// }

// // This function exists to deal with NCBI's use of \x01 characters to denote
// // the start of a new FASTA header in the same line (for non-redundant DBs).
// // We return all sequence IDs in a header line, not just the first.
Expand Down Expand Up @@ -164,6 +157,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
};

println!("start...");
// 开始计时
let start = Instant::now();
for fna_file in fna_files {
process_sequence(
fna_file,
Expand All @@ -174,7 +169,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
args.threads as u32,
)
}
println!("success...");
// 计算持续时间
let duration = start.elapsed();
// 打印运行时间
println!("build db took: {:?}", duration);

let idx_opts = IndexOptions::new(
args.k_mer as usize,
Expand Down
19 changes: 8 additions & 11 deletions kr2r/src/bin/estimate_capacity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use clap::{error::ErrorKind, Error, Parser};
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use kr2r::mmscanner::MinimizerScanner;
use kr2r::utils::{expand_spaced_seed_mask, find_library_fna_files};
use kr2r::KBuildHasher;
use kr2r::{construct_seed_template, Meros, BITS_PER_CHAR, DEFAULT_MINIMIZER_SPACES};
use kr2r::{fmix64 as murmur_hash3, KBuildHasher};
use seq_io::fasta::{Reader, Record};
use seq_io::parallel::read_parallel;
use serde_json;
Expand Down Expand Up @@ -105,26 +105,23 @@ fn process_sequence(
HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap();

let reader = Reader::from_path(fna_file).unwrap();
let range_n = args.n as u64;
read_parallel(
reader,
args.threads as u32,
args.threads - 2 as usize,
|record_set| {
let meros = Meros::new(k_mer, l_mer, Some(spaced_seed_mask), args.toggle_mask, None);

let mut scanner = MinimizerScanner::new(meros);

let mut minimizer_set = HashSet::new();
for record in record_set.into_iter() {
let seq = record.seq();
scanner.set_seq_end(seq);
while let Some(minimizer) = scanner.next_minimizer(seq) {
let hash_v = murmur_hash3(minimizer);
if hash_v & RANGE_MASK < args.n as u64 {
minimizer_set.insert(hash_v);
}
}
scanner.reset();
let kmer_iter = MinimizerScanner::new(seq, meros)
.into_iter()
.filter(|hash_key| hash_key & RANGE_MASK < range_n)
.collect::<HashSet<u64>>();

minimizer_set.extend(kmer_iter);
}
minimizer_set
},
Expand Down
37 changes: 37 additions & 0 deletions kr2r/src/bin/inspect.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use clap::Parser;
use kr2r::compact_hash::CompactHashTable;
use kr2r::taxonomy::Taxonomy;
use kr2r::IndexOptions;
use std::io::Result;

/// Command line arguments for the classify program.
///
/// This structure defines the command line arguments that are accepted by the classify program.
/// It uses the `clap` crate for parsing command line arguments.
#[derive(Parser, Debug, Clone)]
#[clap(version, about = "inspect")]
struct Args {
/// The file path for the Kraken 2 index.
#[clap(short = 'H', long = "index-filename", value_parser, required = true)]
index_filename: String,

/// The file path for the Kraken 2 taxonomy.
#[clap(short = 't', long = "taxonomy-filename", value_parser, required = true)]
taxonomy_filename: String,

/// The file path for the Kraken 2 options.
#[clap(short = 'o', long = "options-filename", value_parser, required = true)]
options_filename: String,
}

fn main() -> Result<()> {
let args = Args::parse();
let idx_opts = IndexOptions::read_index_options(args.options_filename.clone())?;
let taxo = Taxonomy::from_file(&args.taxonomy_filename)?;
let cht = CompactHashTable::from(args.index_filename.clone())?;
println!("index option {:?}", idx_opts);
println!("taxonomy node count {:?}", taxo.node_count());
println!("compact hash table {:?}", cht);

Ok(())
}
4 changes: 1 addition & 3 deletions kr2r/src/compact_hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,9 +362,7 @@ impl<'a> CompactHashTable<'a, AtomicU32> {
cell1.populate(ci.cell, self.value_bits);
break;
}
if cell1.compacted_key(self.value_bits) == ci.cell.compacted_key
&& cell1.taxid(self.value_mask) != ci.cell.taxid
{
if cell1.compacted_key(self.value_bits) == ci.cell.compacted_key {
return Some(CellIndex::new(
idx,
ci.cell.compacted_key,
Expand Down
43 changes: 19 additions & 24 deletions kr2r/src/db.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
// 使用时需要引用模块路径
use crate::compact_hash::{CellIndex, Compact, CompactHashTable};
use crate::fmix64 as murmur_hash3;
use crate::mmscanner::MinimizerScanner;
use crate::taxonomy::{NCBITaxonomy, Taxonomy};
use crate::Meros;
Expand All @@ -27,7 +26,7 @@ pub fn process_sequence<P: AsRef<Path>>(
let total_counter = AtomicUsize::new(0);
let size_counter = AtomicUsize::new(0);
let seq_counter = AtomicUsize::new(0);
let update_counter = AtomicUsize::new(0);
// let update_counter = AtomicUsize::new(0);

let capacity = chtm.capacity;
let value_bits = chtm.value_bits;
Expand All @@ -37,32 +36,29 @@ pub fn process_sequence<P: AsRef<Path>>(
threads,
queue_len,
|record_set| {
let mut scanner = MinimizerScanner::new(meros);
let mut hash_set = BTreeSet::<CellIndex>::new();

for record in record_set.into_iter() {
seq_counter.fetch_add(1, Ordering::SeqCst);
let seq = record.seq();
if let Ok(seq_id) = record.id() {
// let seq_count = seq_counter.load(Ordering::SeqCst);
// println!("seq_id {:?}, seq_count: {:?}", seq_id, seq_count);

if let Some(ext_taxid) = id_to_taxon_map.get(seq_id) {
let taxid = taxonomy.get_internal_id(*ext_taxid);

scanner.set_seq_end(seq);
while let Some(minimizer) = scanner.next_minimizer(seq) {
let hash_key = murmur_hash3(minimizer);

let ci = CellIndex::new(
hash_key.index(capacity),
hash_key.compacted(value_bits),
taxid as u32,
);

hash_set.insert(ci);

total_counter.fetch_add(1, Ordering::SeqCst);
}

scanner.reset();
let kmer_iter = MinimizerScanner::new(record.seq(), meros)
.into_iter()
.map(|hash_key| {
CellIndex::new(
hash_key.index(capacity),
hash_key.compacted(value_bits),
taxid as u32,
)
})
.collect::<BTreeSet<CellIndex>>();

total_counter.fetch_add(kmer_iter.len(), Ordering::SeqCst);
hash_set.extend(kmer_iter);
};
}
}
Expand All @@ -76,7 +72,7 @@ pub fn process_sequence<P: AsRef<Path>>(
if ci.cell.taxid != new_taxid {
ci.cell.taxid = new_taxid;
chtm.update_cell(ci);
update_counter.fetch_add(1, Ordering::SeqCst);
// update_counter.fetch_add(1, Ordering::SeqCst);
}
} else {
size_counter.fetch_add(1, Ordering::SeqCst);
Expand All @@ -88,11 +84,10 @@ pub fn process_sequence<P: AsRef<Path>>(
let size_count = size_counter.load(Ordering::SeqCst);
let seq_count = seq_counter.load(Ordering::SeqCst);
let total_count = total_counter.load(Ordering::SeqCst);
let update_count = update_counter.load(Ordering::SeqCst);
// let update_count = update_counter.load(Ordering::SeqCst);
println!("seq_count {:?}", seq_count);
println!("size_count {:?}", size_count);
println!("total_count {:?}", total_count);
println!("update_count {:?}", update_count);
chtm.update_size(size_count);
}

Expand Down
3 changes: 1 addition & 2 deletions kr2r/src/iclassify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,8 @@ pub fn resolve_tree(

if max_score >= required_score {
break;
} else {
max_taxon = taxonomy.nodes[max_taxon as usize].parent_id as u32;
}
max_taxon = taxonomy.nodes[max_taxon as usize].parent_id as u32;
}

max_taxon
Expand Down
Loading

0 comments on commit cb0f5e0

Please sign in to comment.