diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index f9e2aef..6b5c458 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -57,6 +57,9 @@ thai = []
 
 # allow greek specialized tokenization
 greek = []
 
+# allow russian specialized tokenization
+russian = []
+
 # allow splitting camelCase latin words
 latin-camelcase = ["dep:finl_unicode"]
diff --git a/charabia/src/normalizer/compatibility_decomposition.rs b/charabia/src/normalizer/compatibility_decomposition.rs
index 84b5d39..661a458 100644
--- a/charabia/src/normalizer/compatibility_decomposition.rs
+++ b/charabia/src/normalizer/compatibility_decomposition.rs
@@ -52,6 +52,13 @@ mod test {
     // base tokens to normalize.
     fn tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Ёё".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 // Decompose 1E69 to 0073 0323 0307
                 lemma: Owned("ṩ ṩ".to_string()),
@@ -74,6 +81,14 @@
     // expected result of the current Normalizer.
     fn normalizer_result() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("Е\u{308}е\u{308}".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                char_map: Some(vec![(2, 4), (2, 4)]),
+                script: Script::Cyrillic,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
                 char_end: 2,
@@ -108,6 +123,15 @@
     // expected result of the complete Normalizer pieline.
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
+            Token {
+                lemma: Owned("ее".to_string()),
+                char_end: 2,
+                byte_end: 2,
+                script: Script::Cyrillic,
+                char_map: Some(vec![(2, 2), (2, 2)]),
+                kind: TokenKind::Word,
+                ..Default::default()
+            },
             Token {
                 lemma: Owned("s s".to_string()),
                 char_end: 2,
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 2cc31ad..d790460 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "russian")]
+pub use self::russian::RussianNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "turkish")]
@@ -39,6 +41,8 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "russian")]
+mod russian;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
 #[cfg(feature = "turkish")]
@@ -75,6 +79,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     Box::new(NonspacingMarkNormalizer),
     #[cfg(feature = "vietnamese")]
     Box::new(VietnameseNormalizer),
+    #[cfg(feature = "russian")]
+    Box::new(RussianNormalizer),
     #[cfg(feature = "turkish")]
     Box::new(TurkishNormalizer),
 ]
diff --git a/charabia/src/normalizer/russian.rs b/charabia/src/normalizer/russian.rs
new file mode 100644
index 0000000..5a4cf1b
--- /dev/null
+++ b/charabia/src/normalizer/russian.rs
@@ -0,0 +1,133 @@
+use std::borrow::Cow;
+
+use super::{Normalizer, NormalizerOption};
+use crate::{Script, Token};
+use aho_corasick::AhoCorasick;
+use once_cell::sync::Lazy;
+
+pub struct RussianNormalizer;
+
+static MATCHING_STR: Lazy<AhoCorasick> =
+    Lazy::new(|| AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap());
+
+impl Normalizer for RussianNormalizer {
+    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
+        match token.char_map.take() {
+            Some(mut char_map) => {
+                // if a char_map already exists, iterate over it to reconstruct sub-strings.
+                let mut lemma = String::new();
+                let mut tail = token.lemma.as_ref();
+                let mut normalized = String::new();
+                for (_, normalized_len) in char_map.iter_mut() {
+                    let (head, t) = tail.split_at(*normalized_len as usize);
+                    tail = t;
+                    normalized.clear();
+                    // then normalize each sub-string, recomputing its size in the char_map.
+                    let mut peekable = head.chars().peekable();
+                    while let Some(c) = peekable.next() {
+                        let (c, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                        if peek_consumed {
+                            peekable.next();
+                        }
+
+                        normalized.push(c);
+                    }
+
+                    *normalized_len = normalized.len() as u8;
+                    lemma.push_str(normalized.as_ref());
+                }
+
+                token.lemma = Cow::Owned(lemma);
+                token.char_map = Some(char_map);
+            }
+            None => {
+                // if no char_map exists, iterate over the lemma, recomposing characters.
+                let mut char_map = Vec::new();
+                let mut lemma = String::new();
+                let mut peekable = token.lemma.chars().peekable();
+                while let Some(c) = peekable.next() {
+                    let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());
+
+                    if peek_consumed {
+                        peekable.next();
+                    }
+
+                    if options.create_char_map {
+                        char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
+                    }
+                    lemma.push(normalized);
+                }
+                token.lemma = Cow::Owned(lemma);
+                if options.create_char_map {
+                    token.char_map = Some(char_map);
+                }
+            }
+        }
+
+        token
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
+    }
+}
+
+// https://en.wikipedia.org/wiki/Russian_alphabet
+// Only decomposed forms are considered, as compatibility decomposition already takes care of 1-codepoint forms.
+fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
+    match (current, next) {
+        // ё -> е, grammatically permissible, common in writing
+        ('Е', Some('\u{308}')) => ('Е', true),
+        ('е', Some('\u{308}')) => ('е', true),
+
+        (c, _) => (c, false),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::Normalizer;
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("Ёё".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: None,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("ее".to_string()),
+            char_end: 2,
+            byte_end: 2,
+            script: Script::Cyrillic,
+            char_map: Some(vec![(2, 2), (2, 2)]),
+            kind: TokenKind::Word,
+            ..Default::default()
+        }]
+    }
+
+    test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
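Reviewer note: a minimal end-to-end sketch of the behavior this patch is expected to produce through charabia's public `Tokenize` trait. The input word `"Ёлка"` and the `main` wrapper are illustrative, not part of the patch; the assertion mirrors the `"Ёё"` → `"ее"` expectation in the tests above, assuming the `russian` feature is enabled (it is in the default set after this change):

```rust
use charabia::Tokenize;

fn main() {
    // Hypothetical input: compatibility decomposition first splits Ё (U+0401)
    // into Е + combining diaeresis (U+0308) and lowercasing yields "е\u{308}лка";
    // the new RussianNormalizer then consumes the U+0308, so the final lemma
    // should come out as "елка".
    let mut tokens = "Ёлка".tokenize();
    let token = tokens.next().unwrap();
    assert_eq!(token.lemma(), "елка");
}
```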
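The "Only decomposed forms are considered" comment in `normalize_russian` leans on the fact that `CompatibilityDecompositionNormalizer` runs earlier in the pipeline, so precomposed Ё/ё never reach this normalizer. A quick sketch of that invariant, using the `unicode-normalization` crate already listed in Cargo.toml (the standalone `main` is illustrative):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // Precomposed Ё (U+0401) and ё (U+0451) decompose to Е/е (U+0415/U+0435)
    // followed by the combining diaeresis (U+0308), which is exactly the pair
    // matched by MATCHING_STR and handled in normalize_russian.
    assert_eq!("Ё".nfkd().collect::<String>(), "Е\u{308}");
    assert_eq!("ё".nfkd().collect::<String>(), "е\u{308}");
}
```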