Skip to content

Commit

Permalink
Merge pull request #283 from mh-northlander/refactor/pyprojector
Browse files Browse the repository at this point in the history
mv pyprojector to projection mod, organize required_field construction
  • Loading branch information
mh-northlander authored Dec 4, 2024
2 parents 00c545b + cb9d957 commit 287f3d7
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 73 deletions.
74 changes: 46 additions & 28 deletions python/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

use std::convert::TryFrom;
use std::fmt::Write;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
Expand All @@ -36,10 +35,10 @@ use sudachi::plugin::oov::OovProviderPlugin;
use sudachi::plugin::path_rewrite::PathRewritePlugin;

use crate::errors;
use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
use crate::morpheme::PyMorphemeListWrapper;
use crate::pos_matcher::PyPosMatcher;
use crate::pretokenizer::PyPretokenizer;
use crate::projection::{morpheme_projection, parse_projection_opt, resolve_projection};
use crate::projection::{pyprojection, PyProjector};
use crate::tokenizer::{PySplitMode, PyTokenizer};

pub(crate) struct PyDicData {
Expand Down Expand Up @@ -217,11 +216,7 @@ impl PyDictionary {
})
.collect();

let projection = if config.projection == SurfaceProjection::Surface {
None
} else {
Some(morpheme_projection(config.projection, &jdic))
};
let projection = pyprojection(config.projection, &jdic);

let dic_data = PyDicData {
dictionary: jdic,
Expand Down Expand Up @@ -262,19 +257,22 @@ impl PyDictionary {
None => Mode::C,
};
let fields = parse_field_subset(fields)?;
let mut required_fields = self.config.projection.required_subset();
let dict = self.dictionary.as_ref().unwrap().clone();
let projobj = if let Some(s) = projection {
let proj = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
required_fields = proj.required_subset();
Some(morpheme_projection(proj, &dict))

let (projection, required_fields) = if let Some(s) = projection {
let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
(
pyprojection(projection, &dict),
projection.required_subset(),
)
} else {
None
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
};

let projobj = resolve_projection(projobj, &dict.projection);

let tok = PyTokenizer::new(dict, mode, fields | required_fields, projobj);
let tok = PyTokenizer::new(dict, mode, fields | required_fields, projection);
Ok(tok)
}

Expand Down Expand Up @@ -304,10 +302,13 @@ impl PyDictionary {
/// :param mode: Use this split mode (C by default)
/// :param fields: ask Sudachi to load only a subset of fields.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html.
/// Only used when `handler` is set.
/// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations.
/// Overrides `projection`.
/// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`.
/// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py.
/// :param projection: Projection override for created Tokenizer. See Config.projection for values.
/// If nothing was passed, simply use surface as token representations.
/// :param projection: Projection override for created Tokenizer. See Config.projection for supported values.
///
/// :type mode: SplitMode | str | None
/// :type fields: set[str] | None
Expand All @@ -329,7 +330,7 @@ impl PyDictionary {
Some(m) => extract_mode(m)?,
None => Mode::C,
};
let subset = parse_field_subset(fields)?;

if let Some(h) = handler.as_ref() {
if !h.bind(py).is_callable() {
return errors::wrap(Err("handler must be callable"));
Expand All @@ -338,18 +339,35 @@ impl PyDictionary {

let dict = self.dictionary.as_ref().unwrap().clone();

let mut required_fields = if handler.is_none() {
self.config.projection.required_subset()
// Morphemes are consumed inside the pretokenizer, so we only need the fields used by the handler or the projection.
let (projection, required_fields) = if handler.is_some() {
// pretokenizer won't use projection when handler is set.
(
None,
self.config.projection.required_subset() | parse_field_subset(fields)?,
)
} else if let Some(s) = projection {
let projection = errors::wrap(SurfaceProjection::try_from(s.to_str()?))?;
// use default projection if "surface" is specified (see #259)
if projection == SurfaceProjection::Surface {
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
} else {
(
pyprojection(projection, &dict),
projection.required_subset(),
)
}
} else {
self.config.projection.required_subset() | subset
(
dict.projection.clone(),
self.config.projection.required_subset(),
)
};

let (passed, projection) = parse_projection_opt(projection, dict.deref())?;

required_fields |= projection.required_subset();

let projector = resolve_projection(passed, &dict.projection);
let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector);
let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projection);
let internal_cell = Bound::new(py, internal)?;
let module = py.import_bound("tokenizers.pre_tokenizers")?;
module
Expand Down
3 changes: 1 addition & 2 deletions python/src/morpheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ use sudachi::prelude::{Morpheme, MorphemeList};

use crate::dictionary::{extract_mode, PyDicData, PyDictionary};
use crate::errors;
use crate::projection::MorphemeProjection;
use crate::projection::{MorphemeProjection, PyProjector};
use crate::word_info::PyWordInfo;

pub(crate) type PyMorphemeList = MorphemeList<Arc<PyDicData>>;
pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>;

/// A list of morphemes.
///
Expand Down
4 changes: 2 additions & 2 deletions python/src/pretokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ use sudachi::prelude::Mode;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use crate::projection::MorphemeProjection;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper};
use crate::projection::{MorphemeProjection, PyProjector};

/// This struct performs the actual tokenization.
/// There should be at most one instance per thread of execution
Expand Down
49 changes: 9 additions & 40 deletions python/src/projection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,19 @@
* limitations under the License.
*/

use std::convert::TryFrom;
use std::ops::Deref;
use std::sync::Arc;

use pyo3::prelude::*;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};
use pyo3::Python;

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::SurfaceProjection;
use sudachi::pos::PosMatcher;
use sudachi::prelude::Morpheme;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::PyProjector;

pub(crate) trait MorphemeProjection {
fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString>;
Expand Down Expand Up @@ -159,43 +156,15 @@ fn make_matcher<D: DictionaryAccess, F: FnMut(&Vec<String>) -> bool>(
PosMatcher::new(ids)
}

pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> PyProjector {
match (base, fallback) {
(None, None) => None,
(Some(p), _) => Some(p),
(_, Some(p)) => Some(p.clone()),
}
}

pub(crate) fn parse_projection<D: DictionaryAccess>(
value: &Bound<PyString>,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
value.to_str().and_then(|s| parse_projection_raw(s, dict))
}
pub(crate) type PyProjector = Option<Arc<dyn MorphemeProjection + Send + Sync>>;

pub(crate) fn parse_projection_raw<D: DictionaryAccess>(
value: &str,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
errors::wrap_ctx(
SurfaceProjection::try_from(value).map(|v| {
if v == SurfaceProjection::Surface {
(None, SurfaceProjection::Surface)
} else {
(Some(morpheme_projection(v, dict)), v)
}
}),
"invalid surface projection",
)
}

pub(crate) fn parse_projection_opt<D: DictionaryAccess>(
value: Option<&Bound<PyString>>,
pub(crate) fn pyprojection<D: DictionaryAccess>(
projection: SurfaceProjection,
dict: &D,
) -> PyResult<(PyProjector, SurfaceProjection)> {
match value {
None => Ok((None, SurfaceProjection::Surface)),
Some(v) => parse_projection(v, dict),
) -> PyProjector {
if projection == SurfaceProjection::Surface {
None
} else {
Some(morpheme_projection(projection, dict))
}
}
3 changes: 2 additions & 1 deletion python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ use sudachi::prelude::*;

use crate::dictionary::{extract_mode, PyDicData};
use crate::errors;
use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
use crate::morpheme::PyMorphemeListWrapper;
use crate::projection::PyProjector;

/// Unit to split text.
///
Expand Down

0 comments on commit 287f3d7

Please sign in to comment.