Skip to content

Commit

Permalink
Change the process type list to a set, to reduce redundant iterations in the build_process_type_tree function
Browse files Browse the repository at this point in the history
  • Loading branch information
Lips7 committed Oct 22, 2024
1 parent f9668bc commit ba7185c
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 98 deletions.
28 changes: 0 additions & 28 deletions matcher_rs/benches/bench_test.rs

This file was deleted.

12 changes: 6 additions & 6 deletions matcher_rs/src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ impl Matcher {
M: MatchTableTrait<T>,
T: AsRef<str>,
{
let mut process_type_list = Vec::new();
let mut process_type_set = IdSet::new();

let mut simple_word_id = 0;
let mut simple_word_table_conf_id = 0;
Expand Down Expand Up @@ -557,7 +557,7 @@ impl Matcher {
if !word_list.is_empty() {
match match_table_type {
MatchTableType::Simple { process_type } => {
process_type_list.push(process_type);
process_type_set.insert(process_type.bits() as usize);
simple_word_table_conf_list.push(WordTableConf {
match_id,
table_id,
Expand All @@ -580,7 +580,7 @@ impl Matcher {
sim_match_type,
threshold,
} => {
process_type_list.push(process_type);
process_type_set.insert(process_type.bits() as usize);
sim_table_list.push(SimTable {
table_id,
match_id,
Expand All @@ -594,7 +594,7 @@ impl Matcher {
process_type,
regex_match_type,
} => {
process_type_list.push(process_type);
process_type_set.insert(process_type.bits() as usize);
regex_table_list.push(RegexTable {
table_id,
match_id,
Expand All @@ -607,7 +607,7 @@ impl Matcher {
}

if !exemption_word_list.is_empty() {
process_type_list.push(exemption_process_type);
process_type_set.insert(exemption_process_type.bits() as usize);
simple_word_table_conf_list.push(WordTableConf {
match_id,
table_id,
Expand All @@ -628,7 +628,7 @@ impl Matcher {
}
}

let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_tree = build_process_type_tree(&process_type_set);

Matcher {
process_type_tree,
Expand Down
44 changes: 5 additions & 39 deletions matcher_rs/src/process/process_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -856,24 +856,9 @@ pub struct ProcessTypeBitNode {
///
/// A [Vec] containing `ProcessTypeBitNode`s that represent the processing type tree.
///
/// # Example
///
/// ```
/// use matcher_rs::{build_process_type_tree, ProcessType};
///
/// let process_type_list = &[
/// ProcessType::Delete,
/// ProcessType::PinYin,
/// ProcessType::Delete | ProcessType::PinYin,
/// ];
///
/// let process_type_tree = build_process_type_tree(process_type_list);
/// // Use the `process_type_tree` for further processing...
/// ```
///
/// # Details
///
/// The tree is constructed by traversing each [ProcessType] in the input list and building a chain
/// The tree is constructed by traversing each [ProcessType] in the input set and building a chain
/// of nodes for each bit in the [ProcessType]. If a node for a specific bit already exists, it reuses
/// the node; otherwise, it creates a new node. Each node maintains a list of process types and its children,
/// ensuring efficient lookups and updates.
Expand All @@ -887,7 +872,7 @@ pub struct ProcessTypeBitNode {
/// # Safety
///
/// The function does not involve any unsafe operations.
pub fn build_process_type_tree(process_type_list: &[ProcessType]) -> Vec<ProcessTypeBitNode> {
pub fn build_process_type_tree(process_type_set: &IdSet) -> Vec<ProcessTypeBitNode> {
let mut process_type_tree = Vec::new();
let root = ProcessTypeBitNode {
process_type_list: ArrayVec::new(),
Expand All @@ -897,9 +882,10 @@ pub fn build_process_type_tree(process_type_list: &[ProcessType]) -> Vec<Process
children: ArrayVec::new(),
};
process_type_tree.push(root);
for &process_type in process_type_list.iter() {
for process_type_usize in process_type_set.iter() {
let process_type = ProcessType::from_bits(process_type_usize as u8).unwrap();
let mut current_node_index = 0;
for process_type_bit in process_type.iter() {
for process_type_bit in process_type.into_iter() {
let current_node = process_type_tree[current_node_index];
if current_node.process_type_bit == process_type_bit {
continue;
Expand Down Expand Up @@ -966,26 +952,6 @@ pub fn build_process_type_tree(process_type_list: &[ProcessType]) -> Vec<Process
/// about the data structures hold. Ensure that the provided `process_type_tree` is well-formed
/// and the indices are valid.
///
/// # Example
///
/// ```
/// use matcher_rs::{build_process_type_tree, reduce_text_process_with_tree, ProcessType};
///
/// let process_type_list = &[
/// ProcessType::Delete,
/// ProcessType::PinYin,
/// ProcessType::Delete | ProcessType::PinYin,
/// ];
///
/// let process_type_tree = build_process_type_tree(process_type_list);
/// let text = "example text";
///
/// let result = reduce_text_process_with_tree(&process_type_tree, text);
/// for (processed_text, id_set) in result.iter() {
/// println!("Processed text: {}, IdSet: {:?}", processed_text, id_set);
/// }
/// ```
///
/// # Panics
///
/// This function assumes that array operations on [ArrayVec] and slice operations on the process type tree
Expand Down
6 changes: 3 additions & 3 deletions matcher_rs/src/regex_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,11 @@ impl RegexMatcher {
/// For each [RegexTable] entry, the function creates a corresponding `RegexPatternTable` with appropriate
/// regex patterns or lists, then constructs the final [RegexMatcher] with a process type tree.
pub fn new(regex_table_list: &[RegexTable]) -> RegexMatcher {
let mut process_type_list = Vec::with_capacity(regex_table_list.len());
let mut process_type_set = IdSet::with_capacity(regex_table_list.len());
let mut regex_pattern_table_list = Vec::with_capacity(regex_table_list.len());

for regex_table in regex_table_list {
process_type_list.push(regex_table.process_type);
process_type_set.insert(regex_table.process_type.bits() as usize);

let size = regex_table.word_list.len();

Expand Down Expand Up @@ -315,7 +315,7 @@ impl RegexMatcher {
};
}

let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_tree = build_process_type_tree(&process_type_set);

RegexMatcher {
process_type_tree,
Expand Down
6 changes: 3 additions & 3 deletions matcher_rs/src/sim_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,11 @@ impl SimMatcher {
/// * `process_type_tree` - A vector of `ProcessTypeBitNode`, representing the tree structure used for text processing based on the process types extracted from the input [SimTable] list.
/// * `sim_processed_table_list` - A vector of `SimProcessedTable`, each containing an owned vector of words and other properties derived from the input [SimTable] list.
pub fn new(sim_table_list: &[SimTable]) -> SimMatcher {
let mut process_type_list = Vec::with_capacity(sim_table_list.len());
let mut process_type_set = IdSet::with_capacity(sim_table_list.len());
let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len());

for sim_table in sim_table_list {
process_type_list.push(sim_table.process_type);
process_type_set.insert(sim_table.process_type.bits() as usize);
sim_processed_table_list.push(SimProcessedTable {
table_id: sim_table.table_id,
match_id: sim_table.match_id,
Expand All @@ -192,7 +192,7 @@ impl SimMatcher {
})
}

let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_tree = build_process_type_tree(&process_type_set);

SimMatcher {
process_type_tree,
Expand Down
6 changes: 3 additions & 3 deletions matcher_rs/src/simple_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ impl SimpleMatcher {
{
let word_size: usize = process_type_word_map.values().map(|m| m.len()).sum();

let mut process_type_list = Vec::with_capacity(process_type_word_map.len());
let mut process_type_set = IdSet::with_capacity(process_type_word_map.len());
let mut ac_dedup_word_conf_list = Vec::with_capacity(word_size);
let mut word_conf_map = IntMap::with_capacity_and_hasher(word_size, Default::default());

Expand All @@ -220,7 +220,7 @@ impl SimpleMatcher {

for (&process_type, simple_word_map) in process_type_word_map {
let word_process_type = process_type - ProcessType::Delete;
process_type_list.push(process_type);
process_type_set.insert(process_type.bits() as usize);

for (&simple_word_id, simple_word) in simple_word_map {
let mut ac_split_word_and_counter = FxHashMap::default();
Expand Down Expand Up @@ -318,7 +318,7 @@ impl SimpleMatcher {
}
}

let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_tree = build_process_type_tree(&process_type_set);

#[cfg(feature = "dfa")]
let aho_corasick_kind = AhoCorasickKind::DFA;
Expand Down
33 changes: 17 additions & 16 deletions matcher_rs/tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ mod test_matcher {
}

mod test_process {
use id_set::IdSet;
use matcher_rs::{
build_process_type_tree, reduce_text_process, reduce_text_process_emit,
reduce_text_process_with_list, reduce_text_process_with_tree, text_process, ProcessType,
Expand All @@ -228,27 +229,27 @@ mod test_process {

#[test]
fn test_build_process_type_tree() {
let process_type_list = vec![
ProcessType::Fanjian,
ProcessType::DeleteNormalize,
ProcessType::FanjianDeleteNormalize,
ProcessType::Delete,
ProcessType::Normalize,
];
let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_set = IdSet::from_iter([
ProcessType::Fanjian.bits() as usize,
ProcessType::DeleteNormalize.bits() as usize,
ProcessType::FanjianDeleteNormalize.bits() as usize,
ProcessType::Delete.bits() as usize,
ProcessType::Normalize.bits() as usize,
]);
let process_type_tree = build_process_type_tree(&process_type_set);
println!("{:?}", process_type_tree);
}

#[test]
fn test_reduce_text_process_with_tree() {
let process_type_list = vec![
ProcessType::Fanjian,
ProcessType::DeleteNormalize,
ProcessType::FanjianDeleteNormalize,
ProcessType::Delete,
ProcessType::Normalize,
];
let process_type_tree = build_process_type_tree(&process_type_list);
let process_type_set = IdSet::from_iter([
ProcessType::Fanjian.bits() as usize,
ProcessType::DeleteNormalize.bits() as usize,
ProcessType::FanjianDeleteNormalize.bits() as usize,
ProcessType::Delete.bits() as usize,
ProcessType::Normalize.bits() as usize,
]);
let process_type_tree = build_process_type_tree(&process_type_set);
let text = "test爽-︻";

let processed_text_process_type_set =
Expand Down

0 comments on commit ba7185c

Please sign in to comment.