From cb9d957e5543475ba5fe1b414c05737121883c7c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Tue, 3 Dec 2024 10:55:13 +0900 Subject: [PATCH] mv pyprojector to projection mod, organize required_field constructio --- python/src/dictionary.rs | 74 +++++++++++++++++++++++--------------- python/src/morpheme.rs | 3 +- python/src/pretokenizer.rs | 4 +-- python/src/projection.rs | 49 +++++-------------------- python/src/tokenizer.rs | 3 +- 5 files changed, 60 insertions(+), 73 deletions(-) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index fb9892ea..5e0e6904 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -16,7 +16,6 @@ use std::convert::TryFrom; use std::fmt::Write; -use std::ops::Deref; use std::path::{Path, PathBuf}; use std::str::FromStr; use std::sync::Arc; @@ -36,10 +35,10 @@ use sudachi::plugin::oov::OovProviderPlugin; use sudachi::plugin::path_rewrite::PathRewritePlugin; use crate::errors; -use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; +use crate::morpheme::PyMorphemeListWrapper; use crate::pos_matcher::PyPosMatcher; use crate::pretokenizer::PyPretokenizer; -use crate::projection::{morpheme_projection, parse_projection_opt, resolve_projection}; +use crate::projection::{pyprojection, PyProjector}; use crate::tokenizer::{PySplitMode, PyTokenizer}; pub(crate) struct PyDicData { @@ -217,11 +216,7 @@ impl PyDictionary { }) .collect(); - let projection = if config.projection == SurfaceProjection::Surface { - None - } else { - Some(morpheme_projection(config.projection, &jdic)) - }; + let projection = pyprojection(config.projection, &jdic); let dic_data = PyDicData { dictionary: jdic, @@ -262,19 +257,22 @@ impl PyDictionary { None => Mode::C, }; let fields = parse_field_subset(fields)?; - let mut required_fields = self.config.projection.required_subset(); let dict = self.dictionary.as_ref().unwrap().clone(); - let projobj = if let Some(s) = projection { - let proj = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?; - required_fields = proj.required_subset(); - Some(morpheme_projection(proj, &dict)) + + let (projection, required_fields) = if let Some(s) = projection { + let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?; + ( + pyprojection(projection, &dict), + projection.required_subset(), + ) } else { - None + ( + dict.projection.clone(), + self.config.projection.required_subset(), + ) }; - let projobj = resolve_projection(projobj, &dict.projection); - - let tok = PyTokenizer::new(dict, mode, fields | required_fields, projobj); + let tok = PyTokenizer::new(dict, mode, fields | required_fields, projection); Ok(tok) } @@ -304,10 +302,13 @@ impl PyDictionary { /// :param mode: Use this split mode (C by default) /// :param fields: ask Sudachi to load only a subset of fields. /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// Only used when `handler` is set. /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + /// Overrides `projection`. /// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. - /// :param projection: Projection override for created Tokenizer. See Config.projection for values. + /// If nothing was passed, simply use surface as token representations. + /// :param projection: Projection override for created Tokenizer. See Config.projection for supported values. /// /// :type mode: SplitMode | str | None /// :type fields: set[str] | None @@ -329,7 +330,7 @@ impl PyDictionary { Some(m) => extract_mode(m)?, None => Mode::C, }; - let subset = parse_field_subset(fields)?; + if let Some(h) = handler.as_ref() { if !h.bind(py).is_callable() { return errors::wrap(Err("handler must be callable")); @@ -338,18 +339,35 @@ impl PyDictionary { let dict = self.dictionary.as_ref().unwrap().clone(); - let mut required_fields = if handler.is_none() { - self.config.projection.required_subset() + // morphemes will be consumed inside pretokenizer therefore we only need fields used by handler or projection + let (projection, required_fields) = if handler.is_some() { + // pretokenizer won't use projection when handler is set. + ( + None, + self.config.projection.required_subset() | parse_field_subset(fields)?, + ) + } else if let Some(s) = projection { + let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?; + // use default projection if "surface" is specified (see #259) + if projection == SurfaceProjection::Surface { + ( + dict.projection.clone(), + self.config.projection.required_subset(), + ) + } else { + ( + pyprojection(projection, &dict), + projection.required_subset(), + ) + } } else { - self.config.projection.required_subset() | subset + ( + dict.projection.clone(), + self.config.projection.required_subset(), + ) }; - let (passed, projection) = parse_projection_opt(projection, dict.deref())?; - - required_fields |= projection.required_subset(); - - let projector = resolve_projection(passed, &dict.projection); - let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector); + let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projection); let internal_cell = Bound::new(py, internal)?; let module = py.import_bound("tokenizers.pre_tokenizers")?; module diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 55a29571..49e854a3 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -26,11 +26,10 @@ use sudachi::prelude::{Morpheme, MorphemeList}; use crate::dictionary::{extract_mode, PyDicData, PyDictionary}; use crate::errors; -use crate::projection::MorphemeProjection; +use crate::projection::{MorphemeProjection, PyProjector}; use crate::word_info::PyWordInfo; pub(crate) type PyMorphemeList = MorphemeList>; -pub(crate) type PyProjector = Option>; /// A list of morphemes. /// diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 8faab5e5..8c4ee3f0 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -29,8 +29,8 @@ use sudachi::prelude::Mode; use crate::dictionary::PyDicData; use crate::errors; -use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector}; -use crate::projection::MorphemeProjection; +use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper}; +use crate::projection::{MorphemeProjection, PyProjector}; /// This struct perform actual tokenization /// There should be at most one instance per thread of execution diff --git a/python/src/projection.rs b/python/src/projection.rs index bca184ec..3075236f 100644 --- a/python/src/projection.rs +++ b/python/src/projection.rs @@ -14,13 +14,12 @@ * limitations under the License. */ -use std::convert::TryFrom; use std::ops::Deref; use std::sync::Arc; use pyo3::prelude::*; use pyo3::types::PyString; -use pyo3::{PyResult, Python}; +use pyo3::Python; use sudachi::analysis::stateless_tokenizer::DictionaryAccess; use sudachi::config::SurfaceProjection; @@ -28,8 +27,6 @@ use sudachi::pos::PosMatcher; use sudachi::prelude::Morpheme; use crate::dictionary::PyDicData; -use crate::errors; -use crate::morpheme::PyProjector; pub(crate) trait MorphemeProjection { fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString>; @@ -159,43 +156,15 @@ fn make_matcher) -> bool>( PosMatcher::new(ids) } -pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> PyProjector { - match (base, fallback) { - (None, None) => None, - (Some(p), _) => Some(p), - (_, Some(p)) => Some(p.clone()), - } -} - -pub(crate) fn parse_projection( - value: &Bound, - dict: &D, -) -> PyResult<(PyProjector, SurfaceProjection)> { - value.to_str().and_then(|s| parse_projection_raw(s, dict)) -} +pub(crate) type PyProjector = Option>; -pub(crate) fn parse_projection_raw( - value: &str, - dict: &D, -) -> PyResult<(PyProjector, SurfaceProjection)> { - errors::wrap_ctx( - SurfaceProjection::try_from(value).map(|v| { - if v == SurfaceProjection::Surface { - (None, SurfaceProjection::Surface) - } else { - (Some(morpheme_projection(v, dict)), v) - } - }), - "invalid surface projection", - ) -} - -pub(crate) fn parse_projection_opt( - value: Option<&Bound>, +pub(crate) fn pyprojection( + projection: SurfaceProjection, dict: &D, -) -> PyResult<(PyProjector, SurfaceProjection)> { - match value { - None => Ok((None, SurfaceProjection::Surface)), - Some(v) => parse_projection(v, dict), +) -> PyProjector { + if projection == SurfaceProjection::Surface { + None + } else { + Some(morpheme_projection(projection, dict)) } } diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index d5b938d9..37660823 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -26,7 +26,8 @@ use sudachi::prelude::*; use crate::dictionary::{extract_mode, PyDicData}; use crate::errors; -use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; +use crate::morpheme::PyMorphemeListWrapper; +use crate::projection::PyProjector; /// Unit to split text. ///