diff --git a/README.md b/README.md index a6a56f9..5793392 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This crate implements some stemmer algorithms found in the [snowball project](ht - Greek - Hungarian - Italian +- Nepali - Norwegian - Portuguese - Romanian diff --git a/algorithms/nepali.sbl b/algorithms/nepali.sbl new file mode 100644 index 0000000..d388748 --- /dev/null +++ b/algorithms/nepali.sbl @@ -0,0 +1,92 @@ +/* Snowball grammar for the Nepali stemmer: its routines strip listed Devanagari endings from the end of a word. + * Authors: + * - Ingroj Shrestha , Nepali NLP Group + * - Oleg Bartunov , Postgres Professional Ltd. + * - Shreeya Singh Dhakal, Nepali NLP Group + */ + +routines ( /* helper routines; all four are defined in the backwardmode section of this file */ + remove_category_1 + check_category_2 + remove_category_2 + remove_category_3 +) + +stringescapes {} /* in the string literals of this file, {name} stands for a stringdef and {U+XXXX} for a code point */ +/* Name prefixes used by the stringdefs: ds = sign, dl = letter, dvs = vowel sign; the Unicode name of each code point is in its trailing comment. */ +stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU +stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA +stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I +stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II +stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E +stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA +stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA +stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA +stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA +stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA +stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA +stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA +stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA +stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA +stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA +stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA +stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA +stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA +stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA +stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA +stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA +stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA +stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA +stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA +stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I 
+stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II +stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U +stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU +stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E +stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI +stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O +stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU +stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA + +externals ( stem ) /* stem is the one routine exposed to callers; the generated Rust file exports it as pub fn stem */ +backwardmode ( /* the routines defined in this section match text backwards, starting from the end of the word */ + define remove_category_1 as( /* Category 1: act on one listed ending. The twelve endings before (delete) are always removed; a KA-plus-vowel-sign ending is left in place when U+090F or U+0947 comes directly before it, and removed otherwise. */ + [substring] among ( + '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' + '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' + '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' + (delete) + '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) + ) + ) + + define check_category_2 as( /* Category 2 test: succeeds when U+0901, U+0902 or U+0948 is found at the cursor; it deletes nothing. */ + [substring] among( + '{dsc}' '{dsa}' '{dvsai}' + ) + ) + + define remove_category_2 as ( /* Category 2 removal: a final U+0901 or U+0902 is deleted only when YA AU, CHA AU, NA AU or THA E comes before it; a final U+0948 only when TA VIRAMA RA comes before it. Fails otherwise. */ + [substring] among( + '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) + '{dvsai}' ('{dlta}{dsv}{dlr}' delete) + ) + ) + + define remove_category_3 as( /* Category 3: delete whichever ending from the list below matches at the cursor; fails if none does. */ + [substring] among( + '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' 
'{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' + (delete) + ) + ) + +) + +define stem as ( /* Entry point: working backwards, try category 1 once, then repeat a pass made of an optional category 2 removal followed by a category 3 removal, stopping when category 3 fails. Steps wrapped in do are attempted but their failure does not stop the stemmer. */ + backwards ( + do remove_category_1 + do ( + repeat (do (check_category_2 and remove_category_2) remove_category_3) + ) + ) +) diff --git a/src/lib.rs b/src/lib.rs index 38c2c03..0eba7e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,7 @@ pub enum Algorithm { Greek, Hungarian, Italian, + Nepali, Norwegian, Portuguese, Romanian, @@ -76,6 +77,7 @@ impl Stemmer { Algorithm::Greek => Stemmer { stemmer: algorithms::greek::stem }, Algorithm::Hungarian => Stemmer { stemmer: algorithms::hungarian::stem }, Algorithm::Italian => Stemmer { stemmer: algorithms::italian::stem }, + Algorithm::Nepali => Stemmer { stemmer: algorithms::nepali::stem }, Algorithm::Norwegian => Stemmer { stemmer: algorithms::norwegian::stem }, 
Algorithm::Portuguese => Stemmer { stemmer: algorithms::portuguese::stem }, Algorithm::Romanian => Stemmer { stemmer: algorithms::romanian::stem }, diff --git a/src/snowball/algorithms/mod.rs b/src/snowball/algorithms/mod.rs index c1c4073..60df2fd 100644 --- a/src/snowball/algorithms/mod.rs +++ b/src/snowball/algorithms/mod.rs @@ -9,6 +9,7 @@ pub mod german; pub mod greek; pub mod hungarian; pub mod italian; +pub mod nepali; pub mod norwegian; pub mod portuguese; pub mod romanian; diff --git a/src/snowball/algorithms/nepali.rs b/src/snowball/algorithms/nepali.rs new file mode 100644 index 0000000..b47af7d --- /dev/null +++ b/src/snowball/algorithms/nepali.rs @@ -0,0 +1,293 @@ +//! Generated by Snowball 2.1.0 - https://snowballstem.org/ + +#![allow(non_upper_case_globals)] +#![allow(non_snake_case)] +#![allow(unused_variables)] +#![allow(unused_mut)] +use snowball::SnowballEnv; +use snowball::Among; + +static A_0: &'static [Among; 17] = &[ /* Endings looked up by r_remove_category_1. The third field is the code that routine tests: 1 for endings it always deletes, 2 for the KA-plus-vowel-sign endings. */ + Among("\u{0915}\u{0940}", -1, 2, None), + Among("\u{0932}\u{093E}\u{0907}", -1, 1, None), + Among("\u{0932}\u{0947}", -1, 1, None), + Among("\u{0932}\u{093E}\u{0908}", -1, 1, None), + Among("\u{0915}\u{0948}", -1, 2, None), + Among("\u{0938}\u{0901}\u{0917}\u{0948}", -1, 1, None), + Among("\u{092E}\u{0948}", -1, 1, None), + Among("\u{0915}\u{094B}", -1, 2, None), + Among("\u{0938}\u{0901}\u{0917}", -1, 1, None), + Among("\u{0938}\u{0902}\u{0917}", -1, 1, None), + Among("\u{092E}\u{093E}\u{0930}\u{094D}\u{092B}\u{0924}", -1, 1, None), + Among("\u{0930}\u{0924}", -1, 1, None), + Among("\u{0915}\u{093E}", -1, 2, None), + Among("\u{092E}\u{093E}", -1, 1, None), + Among("\u{0926}\u{094D}\u{0935}\u{093E}\u{0930}\u{093E}", -1, 1, None), + Among("\u{0915}\u{093F}", -1, 2, None), + Among("\u{092A}\u{091B}\u{093F}", -1, 1, None), +]; + +static A_1: &'static [Among; 3] = &[ /* The three signs looked up by r_check_category_2, which only tests whether the lookup result is zero. */ + Among("\u{0901}", -1, -1, None), + Among("\u{0902}", -1, -1, None), + Among("\u{0948}", -1, -1, None), +]; + +static A_2: &'static [Among; 3] = &[ /* The same three signs for r_remove_category_2: code 1 for U+0901 and U+0902, code 2 for U+0948. */ + Among("\u{0901}", 
+ -1, 1, None), + Among("\u{0902}", -1, 1, None), + Among("\u{0948}", -1, 2, None), +]; + +static A_3: &'static [Among; 91] = &[ /* Endings deleted by r_remove_category_3; every entry carries code 1. NOTE(review): the second field appears to be the index of an earlier entry that is a suffix of this one, or -1 when there is none - confirm against snowball::Among. */ + Among("\u{0947}\u{0915}\u{0940}", -1, 1, None), + Among("\u{090F}\u{0915}\u{0940}", -1, 1, None), + Among("\u{0907}\u{090F}\u{0915}\u{0940}", 1, 1, None), + Among("\u{093F}\u{090F}\u{0915}\u{0940}", 1, 1, None), + Among("\u{0926}\u{0947}\u{0916}\u{0940}", -1, 1, None), + Among("\u{0925}\u{0940}", -1, 1, None), + Among("\u{0926}\u{0940}", -1, 1, None), + Among("\u{091B}\u{0941}", -1, 1, None), + Among("\u{0947}\u{091B}\u{0941}", 7, 1, None), + Among("\u{0928}\u{0947}\u{091B}\u{0941}", 8, 1, None), + Among("\u{090F}\u{091B}\u{0941}", 7, 1, None), + Among("\u{0928}\u{0941}", -1, 1, None), + Among("\u{0939}\u{0930}\u{0941}", -1, 1, None), + Among("\u{0939}\u{0930}\u{0942}", -1, 1, None), + Among("\u{091B}\u{0947}", -1, 1, None), + Among("\u{0925}\u{0947}", -1, 1, None), + Among("\u{0928}\u{0947}", -1, 1, None), + Among("\u{0947}\u{0915}\u{0948}", -1, 1, None), + Among("\u{0928}\u{0947}\u{0915}\u{0948}", 17, 1, None), + Among("\u{090F}\u{0915}\u{0948}", -1, 1, None), + Among("\u{0926}\u{0948}", -1, 1, None), + Among("\u{0907}\u{0926}\u{0948}", 20, 1, None), + Among("\u{093F}\u{0926}\u{0948}", 20, 1, None), + Among("\u{0947}\u{0915}\u{094B}", -1, 1, None), + Among("\u{0928}\u{0947}\u{0915}\u{094B}", 23, 1, None), + Among("\u{090F}\u{0915}\u{094B}", -1, 1, None), + Among("\u{0907}\u{090F}\u{0915}\u{094B}", 25, 1, None), + Among("\u{093F}\u{090F}\u{0915}\u{094B}", 25, 1, None), + Among("\u{0926}\u{094B}", -1, 1, None), + Among("\u{0907}\u{0926}\u{094B}", 28, 1, None), + Among("\u{093F}\u{0926}\u{094B}", 28, 1, None), + Among("\u{092F}\u{094B}", -1, 1, None), + Among("\u{0907}\u{092F}\u{094B}", 31, 1, None), + Among("\u{0925}\u{094D}\u{092F}\u{094B}", 31, 1, None), + Among("\u{092D}\u{092F}\u{094B}", 31, 1, None), + Among("\u{093F}\u{092F}\u{094B}", 31, 1, None), + Among("\u{0925}\u{093F}\u{092F}\u{094B}", 35, 1, None), 
+ Among("\u{0926}\u{093F}\u{092F}\u{094B}", 35, 1, None), + Among("\u{091B}\u{094C}", -1, 1, None), + Among("\u{0907}\u{091B}\u{094C}", 38, 1, None), + Among("\u{0947}\u{091B}\u{094C}", 38, 1, None), + Among("\u{0928}\u{0947}\u{091B}\u{094C}", 40, 1, None), + Among("\u{090F}\u{091B}\u{094C}", 38, 1, None), + Among("\u{093F}\u{091B}\u{094C}", 38, 1, None), + Among("\u{092F}\u{094C}", -1, 1, None), + Among("\u{091B}\u{094D}\u{092F}\u{094C}", 44, 1, None), + Among("\u{0925}\u{094D}\u{092F}\u{094C}", 44, 1, None), + Among("\u{0925}\u{093F}\u{092F}\u{094C}", 44, 1, None), + Among("\u{091B}\u{0928}\u{094D}", -1, 1, None), + Among("\u{0907}\u{091B}\u{0928}\u{094D}", 48, 1, None), + Among("\u{0947}\u{091B}\u{0928}\u{094D}", 48, 1, None), + Among("\u{0928}\u{0947}\u{091B}\u{0928}\u{094D}", 50, 1, None), + Among("\u{090F}\u{091B}\u{0928}\u{094D}", 48, 1, None), + Among("\u{093F}\u{091B}\u{0928}\u{094D}", 48, 1, None), + Among("\u{0932}\u{093E}\u{0928}\u{094D}", -1, 1, None), + Among("\u{091B}\u{093F}\u{0928}\u{094D}", -1, 1, None), + Among("\u{0925}\u{093F}\u{0928}\u{094D}", -1, 1, None), + Among("\u{092A}\u{0930}\u{094D}", -1, 1, None), + Among("\u{0907}\u{0938}\u{094D}", -1, 1, None), + Among("\u{0925}\u{093F}\u{0907}\u{0938}\u{094D}", 58, 1, None), + Among("\u{091B}\u{0947}\u{0938}\u{094D}", -1, 1, None), + Among("\u{0939}\u{094B}\u{0938}\u{094D}", -1, 1, None), + Among("\u{091B}\u{0938}\u{094D}", -1, 1, None), + Among("\u{0907}\u{091B}\u{0938}\u{094D}", 62, 1, None), + Among("\u{0947}\u{091B}\u{0938}\u{094D}", 62, 1, None), + Among("\u{0928}\u{0947}\u{091B}\u{0938}\u{094D}", 64, 1, None), + Among("\u{090F}\u{091B}\u{0938}\u{094D}", 62, 1, None), + Among("\u{093F}\u{091B}\u{0938}\u{094D}", 62, 1, None), + Among("\u{093F}\u{0938}\u{094D}", -1, 1, None), + Among("\u{0925}\u{093F}\u{0938}\u{094D}", 68, 1, None), + Among("\u{0925}\u{093F}\u{090F}", -1, 1, None), + Among("\u{091B}", -1, 1, None), + Among("\u{0907}\u{091B}", 71, 1, None), + Among("\u{0947}\u{091B}", 71, 1, None), 
+ Among("\u{0928}\u{0947}\u{091B}", 73, 1, None), + Among("\u{0939}\u{0941}\u{0928}\u{0947}\u{091B}", 74, 1, None), + Among("\u{0939}\u{0941}\u{0928}\u{094D}\u{091B}", 71, 1, None), + Among("\u{0907}\u{0928}\u{094D}\u{091B}", 71, 1, None), + Among("\u{093F}\u{0928}\u{094D}\u{091B}", 71, 1, None), + Among("\u{090F}\u{091B}", 71, 1, None), + Among("\u{093F}\u{091B}", 71, 1, None), + Among("\u{0947}\u{0915}\u{093E}", -1, 1, None), + Among("\u{0928}\u{0947}\u{0915}\u{093E}", 81, 1, None), + Among("\u{090F}\u{0915}\u{093E}", -1, 1, None), + Among("\u{0907}\u{090F}\u{0915}\u{093E}", 83, 1, None), + Among("\u{093F}\u{090F}\u{0915}\u{093E}", 83, 1, None), + Among("\u{0926}\u{093E}", -1, 1, None), + Among("\u{0907}\u{0926}\u{093E}", 86, 1, None), + Among("\u{093F}\u{0926}\u{093E}", 86, 1, None), + Among("\u{0926}\u{0947}\u{0916}\u{093F}", -1, 1, None), + Among("\u{092E}\u{093E}\u{0925}\u{093F}", -1, 1, None), +]; + +#[derive(Clone)] +struct Context { /* No fields: a value of this type is created in stem and handed to every routine and lookup, but carries no data. */ +} + +fn r_remove_category_1(env: &mut SnowballEnv, context: &mut Context) -> bool { /* Looks up an A_0 ending behind the cursor. Returns false when nothing matches or slice_del fails. Code 1: delete the marked ending. Code 2: leave it alone if U+090F or U+0947 comes before it, otherwise rewind the cursor and delete it. */ + let mut among_var; + env.ket = env.cursor; + among_var = env.find_among_b(A_0, context); + if among_var == 0 { + return false; + } + env.bra = env.cursor; + if among_var == 1 { + if !env.slice_del() { + return false; + } + } else if among_var == 2 { + 'lab0: loop { + let v_1 = env.limit - env.cursor; + 'lab1: loop { + 'lab2: loop { + let v_2 = env.limit - env.cursor; + 'lab3: loop { + if !env.eq_s_b(&"\u{090F}") { + break 'lab3; + } + break 'lab2; + } + env.cursor = env.limit - v_2; + if !env.eq_s_b(&"\u{0947}") { + break 'lab1; + } + break 'lab2; + } + break 'lab0; + } + env.cursor = env.limit - v_1; + if !env.slice_del() { + return false; + } + break 'lab0; + } + } + return true; +} + +fn r_check_category_2(env: &mut SnowballEnv, context: &mut Context) -> bool { /* Returns true when an A_1 sign is found behind the cursor, setting ket before and bra after the lookup; nothing is deleted here. */ + env.ket = env.cursor; + if env.find_among_b(A_1, context) == 0 { + return false; + } + env.bra = env.cursor; + return true; +} + +fn r_remove_category_2(env: &mut SnowballEnv, 
+ context: &mut Context) -> bool { /* Looks up an A_2 sign behind the cursor, then demands a specific preceding string before calling slice_del (ket and bra were set around the lookup). Code 1 accepts U+092F U+094C, U+091B U+094C, U+0928 U+094C or U+0925 U+0947; code 2 accepts U+0924 U+094D U+0930. Returns false when the sign or the required string is missing, or when slice_del fails. */ + let mut among_var; + env.ket = env.cursor; + among_var = env.find_among_b(A_2, context); + if among_var == 0 { + return false; + } + env.bra = env.cursor; + if among_var == 1 { + 'lab0: loop { + let v_1 = env.limit - env.cursor; + 'lab1: loop { + if !env.eq_s_b(&"\u{092F}\u{094C}") { + break 'lab1; + } + break 'lab0; + } + env.cursor = env.limit - v_1; + 'lab2: loop { + if !env.eq_s_b(&"\u{091B}\u{094C}") { + break 'lab2; + } + break 'lab0; + } + env.cursor = env.limit - v_1; + 'lab3: loop { + if !env.eq_s_b(&"\u{0928}\u{094C}") { + break 'lab3; + } + break 'lab0; + } + env.cursor = env.limit - v_1; + if !env.eq_s_b(&"\u{0925}\u{0947}") { + return false; + } + break 'lab0; + } + if !env.slice_del() { + return false; + } + } else if among_var == 2 { + if !env.eq_s_b(&"\u{0924}\u{094D}\u{0930}") { + return false; + } + if !env.slice_del() { + return false; + } + } + return true; +} + +fn r_remove_category_3(env: &mut SnowballEnv, context: &mut Context) -> bool { /* Looks up an A_3 ending behind the cursor and calls slice_del on it; returns false when nothing matches or slice_del fails. */ + env.ket = env.cursor; + if env.find_among_b(A_3, context) == 0 { + return false; + } + env.bra = env.cursor; + if !env.slice_del() { + return false; + } + return true; +} + +pub fn stem(env: &mut SnowballEnv) -> bool { /* Entry point. Sets limit_backward to the incoming cursor and moves the cursor to limit so the routines run backwards. Calls r_remove_category_1 once and ignores its result, then loops: try r_check_category_2 followed by r_remove_category_2 (result ignored), rewind, then r_remove_category_3; the loop ends the first time r_remove_category_3 returns false. Finally resets the cursor to limit_backward and always returns true. */ + let mut context = &mut Context { + }; + env.limit_backward = env.cursor; + env.cursor = env.limit; + let v_1 = env.limit - env.cursor; + r_remove_category_1(env, context); + env.cursor = env.limit - v_1; + let v_2 = env.limit - env.cursor; + 'lab0: loop { + 'replab1: loop{ + let v_3 = env.limit - env.cursor; + 'lab2: for _ in 0..1 { + let v_4 = env.limit - env.cursor; + 'lab3: loop { + let v_5 = env.limit - env.cursor; + if !r_check_category_2(env, context) { + break 'lab3; + } + env.cursor = env.limit - v_5; + if !r_remove_category_2(env, context) { + break 'lab3; + } + break 'lab3; + } + env.cursor = env.limit - v_4; + if !r_remove_category_3(env, context) { + break 'lab2; + } + continue 'replab1; + } + env.cursor = env.limit - v_3; + break 'replab1; + } 
+ break 'lab0; + } + env.cursor = env.limit - v_2; + env.cursor = env.limit_backward; + return true; +}