Skip to content

Commit

Permalink
Removing unwanted Drosophila specific prefixes from UniprotKB Blast Hit descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
asishallab committed Aug 14, 2024
1 parent 75d5e1a commit b89cb75
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/annotation_process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ impl AnnotationProcess {
stored_query.n_parsed_from_sssr_tables += 1;
// Have all input SSSR files provided data for the argument `query`?
if stored_query.n_parsed_from_sssr_tables == self.seq_sim_search_tables.len() as u16 {
drop(stored_query);
let _ = stored_query;
// If yes, then process the parsed data:
self.process_query_data_complete(qacc);
}
Expand Down
2 changes: 1 addition & 1 deletion src/default.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ lazy_static! {
/// The default regular expressions used to filter a Hit title (`stitle`) and retain the short
/// human readable description.
pub static ref FILTER_REGEXS: Vec<Regex> = vec![
Regex::new(r"(?i)low\s+quality\s+protein:").unwrap(),
Regex::new(r"\sOS=.*$").unwrap(),
Regex::new(r"(?i)OS.*[.].*protein").unwrap(),
Regex::new(r"(?i)^H0.*protein").unwrap(),
Expand All @@ -72,7 +73,6 @@ lazy_static! {
Regex::new(r"(?i)\bfragment\b").unwrap(),
Regex::new(r"(?i)\bcontig\b").unwrap(),
Regex::new(r"(?i)\bblast:\b").unwrap(),
Regex::new(r"(?i)\blow quality protein:\b").unwrap(),
];

/// The default header definition of sequence similarity search result tables, i.e. mapping
Expand Down
35 changes: 29 additions & 6 deletions src/model_funcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ mod tests {
)
);

// Test 7 checks that the identifier is filtered out
// Test 7 checks that the identifier is filtered out
hit_words = "sp|Q9C8M9|SRF6_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 6 OS=Arabidopsis thaliana OX=3702 GN=SRF6 PE=1 SV=1".to_string();
expected = "protein strubbelig receptor family";
assert_eq!(
expected,
expected,
filter_stitle(
&hit_words,
&(*FILTER_REGEXS),
Expand All @@ -241,27 +241,50 @@ mod tests {
hit_words = "sp|Q6R2K2|SRF4_ARATH n Transferase Domain Containing Protein OS=Arabidopsis thaliana OX=3702 GN=SRF4 PE=2 SV=1".to_string();
expected = "n transferase domain containing protein";
assert_eq!(
expected,
expected,
filter_stitle(
&hit_words,
&(*FILTER_REGEXS),
Some(&(*CAPTURE_REPLACE_DESCRIPTION_PAIRS))
)
);

// Test 8 also checks that no additional letters are deleted
// Test 9 also checks that no additional letters are deleted
hit_words = "sp|Q6R2K2|SRF4_ARATH P Transferase Domain Containing Protein OS=Arabidopsis thaliana OX=3702 GN=SRF4 PE=2 SV=1".to_string();
expected = "p transferase domain containing protein";
assert_eq!(
expected,
expected,
filter_stitle(
&hit_words,
&(*FILTER_REGEXS),
Some(&(*CAPTURE_REPLACE_DESCRIPTION_PAIRS))
)
);


// Test 10 checks that the Drosophila-specific prefix 'LOW QUALITY PROTEIN:' is removed
// from the human-readable description (HRD):
hit_words = "tr|A0A6P4E2J9|A0A6P4E2J9_DRORH LOW QUALITY PROTEIN: muscarinic acetylcholine receptor DM1 OS=Drosophila rhopaloa OX=1041015 GN=LOC108039593 PE=3 SV=1".to_string();
expected = "muscarinic acetylcholine receptor dm";
assert_eq!(
expected,
filter_stitle(
&hit_words,
&(*FILTER_REGEXS),
Some(&(*CAPTURE_REPLACE_DESCRIPTION_PAIRS))
)
);

// Test 11 checks that the Drosophila-specific HRD prefix 'Blast:' is removed:
hit_words = "tr|A0A3B0K592|A0A3B0K592_DROGU Blast:Homeobox protein abdominal-A OS=Drosophila guanche OX=7266 GN=DGUA_6G017991 PE=3 SV=1".to_string();
expected = "homeobox protein abdominal a";
assert_eq!(
expected,
filter_stitle(
&hit_words,
&(*FILTER_REGEXS),
Some(&(*CAPTURE_REPLACE_DESCRIPTION_PAIRS))
)
);
}

#[test]
Expand Down

0 comments on commit b89cb75

Please sign in to comment.