From 8baaa7abc53c49d7b475256436935f2e30fe3a4c Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 5 Jul 2024 17:24:57 +0900 Subject: [PATCH 1/9] add missing docstrings --- python/py_src/sudachipy/errors.py | 6 ++++-- python/src/build.rs | 4 +++- python/src/lib.rs | 6 ++++-- python/src/morpheme.rs | 6 ++++-- python/src/pos_matcher.rs | 6 +++++- python/src/pretokenizer.rs | 7 ++++--- python/src/tokenizer.rs | 7 ++++--- 7 files changed, 28 insertions(+), 14 deletions(-) diff --git a/python/py_src/sudachipy/errors.py b/python/py_src/sudachipy/errors.py index e75e21cd..c11a8205 100644 --- a/python/py_src/sudachipy/errors.py +++ b/python/py_src/sudachipy/errors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Works Applications Co., Ltd. +# Copyright (c) 2023-2024 Works Applications Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,4 +13,6 @@ # limitations under the License. class SudachiError(Exception): - pass \ No newline at end of file + """Base class for all Sudachipy exceptions. + """ + pass diff --git a/python/src/build.rs b/python/src/build.rs index a6005b26..59eb50c9 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ fn create_file(p: &Path) -> std::io::Result { OpenOptions::new().create_new(true).write(true).open(p) } +/// Build system dictionary from matrix and lexicons. #[pyfunction] #[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] fn build_system_dic<'p>( @@ -87,6 +88,7 @@ fn build_system_dic<'p>( to_stats(py, builder) } +/// Build user dictionary from lexicons based on the given system dictionary. 
#[pyfunction] #[pyo3(text_signature = "(system, lex, output, description=None) -> list")] fn build_user_dic<'p>( diff --git a/python/src/lib.rs b/python/src/lib.rs index 68a9c91d..4887a737 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,9 @@ mod projection; mod tokenizer; mod word_info; -/// module root +/// SudachiPy raw module root. +/// +/// Users should not use this directly. #[pymodule] fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index ad3929dd..47e020ee 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,6 +86,7 @@ impl PyMorphemeListWrapper { } } } + #[pymethods] impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary @@ -197,7 +198,7 @@ impl PyMorphemeListWrapper { } } -/// A morpheme (basic semantic unit of language). +/// An iterator over the MorphemeList. #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeIter")] pub struct PyMorphemeIter { list: Py, @@ -241,6 +242,7 @@ impl<'py> Deref for MorphemeRef<'py> { } } +/// A morpheme (basic semantic unit of language). 
#[pyclass(module = "sudachipy.morpheme", name = "Morpheme", frozen)] pub struct PyMorpheme { list: Py, diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 7c6a884d..a849edf5 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,9 @@ use sudachi::pos::PosMatcher; use crate::dictionary::PyDicData; use crate::morpheme::PyMorpheme; +/// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. +/// +/// Create using Dictionary.pos_matcher method. #[pyclass(name = "PosMatcher", module = "sudachipy")] pub struct PyPosMatcher { matcher: PosMatcher, @@ -189,6 +192,7 @@ impl PyPosMatcher { } } +/// An iterator over POS tuples in the PosPatcher #[pyclass(name = "PosMatcherIterator", module = "sudachipy")] pub struct PyPosIter { data: Vec, diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 755f040b..385c6dcb 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,9 +76,10 @@ impl PerThreadPreTokenizer { } } -/// Binding for the Tokenizer, which handles threading for tokenization +/// Binding for the Tokenizer, which handles threading for tokenization. /// -/// We use ThreadLocal for storing actual tokenizers +/// Create using Dictionary.pre_tokenizer method. +/// We use ThreadLocal for storing actual tokenizers. 
#[pyclass(module = "sudachipy.pretokenizer", name = "SudachiPreTokenizer")] pub struct PyPretokenizer { dict: Arc, diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 558d02cb..a53ce166 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,6 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// B == middle mode /// /// C == long mode -// #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -68,6 +67,7 @@ impl From for PySplitMode { #[pymethods] impl PySplitMode { + /// Parse SplitMode from a character. #[new] fn new(mode: Option<&str>) -> PyResult { let mode = match mode { @@ -82,7 +82,7 @@ impl PySplitMode { } } -/// Sudachi Tokenizer, Python version +/// Sudachi Tokenizer #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] pub(crate) struct PyTokenizer { tokenizer: StatefulTokenizer>, @@ -182,6 +182,7 @@ impl PyTokenizer { Ok(out_list) } + /// SplitMode of the tokenizer. #[getter] fn mode(&self) -> PySplitMode { self.tokenizer.mode().into() From 8b597e341c3b9c2b9340c08d6507bfa75e041ab8 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 5 Jul 2024 17:52:19 +0900 Subject: [PATCH 2/9] copy docstring from new to class --- python/src/dictionary.rs | 22 ++++++++++++++++------ python/src/tokenizer.rs | 11 ++++++++--- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index bc333c8e..1bada310 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,17 @@ impl PyDicData { } } -/// A sudachi dictionary +/// A sudachi dictionary. +/// +/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. +/// If both config.systemDict and dict_type are given, dict_type is used. +/// If dict is an absolute path to a file, it is used as a dictionary. +/// +/// :param config_path: path to the configuration JSON file. +/// :param resource_dir: path to the resource directory folder. +/// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. +/// Also, can be an _absolute_ path to a compiled dictionary file. +/// :param dict_type: deprecated alias to dict. #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")] #[derive(Clone)] pub struct PyDictionary { @@ -92,13 +102,13 @@ impl PyDictionary { /// /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. /// If both config.systemDict and dict_type are given, dict_type is used. - /// If dict is an absolute path to a file, it is used as a dictionary + /// If dict is an absolute path to a file, it is used as a dictionary. /// - /// :param config_path: path to the configuration JSON file - /// :param resource_dir: path to the resource directory folder + /// :param config_path: path to the configuration JSON file. + /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. - /// :param dict_type: deprecated alias to dict + /// :param dict_type: deprecated alias to dict. 
#[new] #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] fn new( diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index a53ce166..fe3b66d3 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -29,13 +29,13 @@ use crate::dictionary::{extract_mode, PyDicData}; use crate::errors::SudachiError as SudachiPyErr; use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; -/// Unit to split text +/// Unit to split text. /// /// A == short mode -/// /// B == middle mode -/// /// C == long mode +/// +/// :param mode: str to parse. One of [A,B,C] in captital or lower case. #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -68,7 +68,10 @@ impl From for PySplitMode { #[pymethods] impl PySplitMode { /// Parse SplitMode from a character. + /// + /// :param mode: str to parse. One of [A,B,C] in captital or lower case. #[new] + #[pyo3(signature=(mode=None, *))] fn new(mode: Option<&str>) -> PyResult { let mode = match mode { Some(m) => m, @@ -83,6 +86,8 @@ impl PySplitMode { } /// Sudachi Tokenizer +/// +/// Create using Dictionary.create method. 
#[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] pub(crate) struct PyTokenizer { tokenizer: StatefulTokenizer>, From c1d37c7f0aab64bd64144664537fa46512aac6c5 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 11:04:08 +0900 Subject: [PATCH 3/9] update text_signature --- python/src/build.rs | 4 ++-- python/src/dictionary.rs | 21 ++++++++++++--------- python/src/morpheme.rs | 36 ++++++++++++++++++------------------ python/src/tokenizer.rs | 9 ++++++--- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/python/src/build.rs b/python/src/build.rs index 59eb50c9..350f2fb3 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -60,7 +60,7 @@ fn create_file(p: &Path) -> std::io::Result { /// Build system dictionary from matrix and lexicons. #[pyfunction] -#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] +#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_system_dic<'p>( py: Python<'p>, matrix: &'p PyAny, @@ -90,7 +90,7 @@ fn build_system_dic<'p>( /// Build user dictionary from lexicons based on the given system dictionary. #[pyfunction] -#[pyo3(text_signature = "(system, lex, output, description=None) -> list")] +#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_user_dic<'p>( py: Python<'p>, system: &'p PyAny, diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 1bada310..e208492f 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -110,7 +110,10 @@ impl PyDictionary { /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. 
#[new] - #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] + #[pyo3( + text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary", + signature=(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) + )] fn new( py: Python, config_path: Option<&PyAny>, @@ -230,8 +233,8 @@ impl PyDictionary { /// :param fields: load only a subset of fields. /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html #[pyo3( - text_signature = "($self, mode = 'C') -> sudachipy.Tokenizer", - signature = (mode = None, fields = None, *, projection = None) + text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer", + signature=(mode=None, fields=None, *, projection=None) )] fn create<'py>( &'py self, @@ -272,7 +275,7 @@ impl PyDictionary { /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. /// /// :param target: can be either a callable or list of POS partial tuples - #[pyo3(text_signature = "($self, target)")] + #[pyo3(text_signature="(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -293,8 +296,8 @@ impl PyDictionary { /// :type mode: sudachipy.SplitMode /// :type fields: Set[str] #[pyo3( - text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer", - signature = (mode = None, fields = None, handler = None, *, projection = None) + text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", + signature=(mode=None, fields=None, handler=None, *, projection=None) )] fn pre_tokenizer<'p>( &'p self, @@ -349,7 +352,7 @@ impl PyDictionary { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. 
/// :type surface: str /// :type out: sudachipy.MorphemeList - #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")] + #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, py: Python<'p>, @@ -377,13 +380,13 @@ impl PyDictionary { } /// Close this dictionary - #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature="(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } /// Get POS Tuple by its id - #[pyo3(text_signature = "($self, pos_id: int)")] + #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 47e020ee..f1aa204d 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -91,7 +91,7 @@ impl PyMorphemeListWrapper { impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary #[classmethod] - #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")] + #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn( @@ -110,13 +110,13 @@ impl PyMorphemeListWrapper { } /// Returns the total cost of the path - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() } /// Returns the number of morpheme in this list. 
- #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn size(&self, py: Python) -> usize { self.internal(py).len() } @@ -279,21 +279,21 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { /// Returns the begin index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } /// Returns the end index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() } /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); let morph = self.morph(py); @@ -304,14 +304,14 @@ impl PyMorpheme { } /// Returns the substring of input text corresponding to the morpheme regardless the configured projection - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. /// Tuple elements are four POS levels, conjugation type and conjugation form. 
- #[pyo3(text_signature = "($self)")] + #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py { let pos_id = self.part_of_speech_id(py); self.list(py) @@ -322,25 +322,25 @@ impl PyMorpheme { } /// Returns the id of the part of speech in the dictionary - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } /// Returns the dictionary form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } /// Returns the normalized form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } /// Returns the reading form - #[pyo3(text_signature = "($self) -> str")] + #[pyo3(text_signature="(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) } @@ -358,7 +358,7 @@ impl PyMorpheme { /// :type out: Optional[sudachipy.MorphemeList] /// :type add_single: bool #[pyo3( - text_signature = "($self, mode, out = None, add_single = False) -> sudachipy.MorphemeList" + text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList" )] fn split<'py>( &'py self, @@ -402,19 +402,19 @@ impl PyMorpheme { } /// Returns whether if this is out of vocabulary word - #[pyo3(text_signature = "($self) -> bool")] + #[pyo3(text_signature="(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } /// Returns word id of this word in the dictionary - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn word_id(&self, py: Python) 
-> u32 { self.morph(py).word_id().as_raw() } /// Returns the dictionary id which this word belongs - #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature="(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); if word_id.is_oov() { @@ -425,7 +425,7 @@ impl PyMorpheme { } /// Returns the list of synonym group ids - #[pyo3(text_signature = "($self) -> List[int]")] + #[pyo3(text_signature="(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); @@ -433,7 +433,7 @@ impl PyMorpheme { } /// Returns the word info - #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")] + #[pyo3(text_signature="(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index fe3b66d3..16f2482a 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -71,7 +71,10 @@ impl PySplitMode { /// /// :param mode: str to parse. One of [A,B,C] in captital or lower case. 
#[new] - #[pyo3(signature=(mode=None, *))] + #[pyo3( + text_signature="(mode=None) -> SplitMode", + signature=(mode=None) + )] fn new(mode: Option<&str>) -> PyResult { let mode = match mode { Some(m) => m, @@ -133,8 +136,8 @@ impl PyTokenizer { /// :type mode: sudachipy.SplitMode /// :type out: sudachipy.MorphemeList #[pyo3( - text_signature = "($self, text: str, mode = None, logger = None, out = None) -> sudachipy.MorphemeList", - signature = (text, mode = None, logger = None, out = None) + text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList", + signature=(text, mode=None, logger=None, out=None) )] #[allow(unused_variables)] fn tokenize<'py>( From dfc87edf656348474fef8b6aa46e8548e4895c5b Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 11:05:04 +0900 Subject: [PATCH 4/9] add import of PosMatcher --- python/py_src/sudachipy/__init__.py | 1 + python/src/lib.rs | 1 + python/src/pos_matcher.rs | 6 ++++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py index bdf67f40..fb551538 100644 --- a/python/py_src/sudachipy/__init__.py +++ b/python/py_src/sudachipy/__init__.py @@ -5,6 +5,7 @@ MorphemeList, Morpheme, WordInfo, + PosMatcher, ) from .config import Config from . import errors diff --git a/python/src/lib.rs b/python/src/lib.rs index 4887a737..56a950c2 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -37,6 +37,7 @@ fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; build::register_functions(m)?; Ok(()) } diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index a849edf5..586c7d90 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -29,7 +29,9 @@ use crate::morpheme::PyMorpheme; /// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. 
/// /// Create using Dictionary.pos_matcher method. -#[pyclass(name = "PosMatcher", module = "sudachipy")] +/// +/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher. +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")] pub struct PyPosMatcher { matcher: PosMatcher, dic: Arc, @@ -193,7 +195,7 @@ impl PyPosMatcher { } /// An iterator over POS tuples in the PosPatcher -#[pyclass(name = "PosMatcherIterator", module = "sudachipy")] +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcherIterator")] pub struct PyPosIter { data: Vec, dic: Arc, From 8c35516a1f20fee8608401b1aea694063458c061 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 11:56:02 +0900 Subject: [PATCH 5/9] sync pyi and rs --- python/py_src/sudachipy/sudachipy.pyi | 104 ++++++++++++++++---------- python/src/dictionary.rs | 54 ++++++------- python/src/morpheme.rs | 48 +++++++----- python/src/pos_matcher.rs | 10 ++- python/src/tokenizer.rs | 10 +-- 5 files changed, 136 insertions(+), 90 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 16c416f6..705b62af 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -1,6 +1,20 @@ +# Copyright (c) 2024 Works Applications Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set from .config import Config +# Part Of Speech POS = Tuple[str, str, str, str, str, str] # POS element PE = Optional[str] @@ -14,6 +28,8 @@ PartialPOS = Union[ Tuple[()], ] +# Fields that can be specified for partial dictionary loading. +# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form", "word_structure", "split_a", "split_b", "synonym_group_id"]]] @@ -23,9 +39,7 @@ class SplitMode: Unit to split text. A == short mode - B == middle mode - C == long mode """ @@ -36,8 +50,9 @@ class SplitMode: @classmethod def __init__(cls, mode: str = "C") -> None: """ - Creates a split mode from a string value - :param mode: string representation of the split mode + Creates a split mode from a string value. + + :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. """ ... @@ -54,14 +69,15 @@ class Dictionary: Creates a sudachi dictionary. If both config.systemDict and dict are not given, `sudachidict_core` is used. - If both config.systemDict and dict are given, dict_type is used. + If both config.systemDict and dict are given, dict is used. + If dict is an absolute path to a file, it is used as a dictionary. - :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object - :param config: alias to config_path, only one of them can be specified at the same time - :param resource_dir: path to the resource directory folder + :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. + :param config: alias to config_path, only one of them can be specified at the same time. + :param resource_dir: path to the resource directory folder. 
:param dict: type of pre-packaged system dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. Also, can be an _absolute_ path to a compiled dictionary file. - :param dict_type: deprecated alias to dict + :param dict_type: deprecated alias to dict. """ ... @@ -77,11 +93,11 @@ class Dictionary: *, projection: str = None) -> Tokenizer: """ - Creates a Sudachi Tokenizer. + Creates a sudachi tokenizer. :param mode: sets the analysis mode for this Tokenizer :param fields: load only a subset of fields. - See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... @@ -91,21 +107,21 @@ class Dictionary: Creates a new POS matcher. If target is a function, then it must return whether a POS should match or not. - If target a list, it should contain partially specified POS. - By partially specified it means that it is possible to omit POS fields or - use None as a sentinel value that matches any POS. + If target is a list, it should contain partially specified POS. + By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. For example, ('名詞',) will match any noun and (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form. - :param target: can be either a function or a list of POS tuples. + :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. """ ... 
def pre_tokenizer(self, mode: Union[SplitMode, Literal["A", "B", "C"]] = "C", fields: FieldSet = None, - handler: Optional[Callable[[int, object, MorphemeList], list]] = None, + handler: Optional[Callable[[ + int, object, MorphemeList], list]] = None, *, projection: str = None) -> object: """ @@ -113,10 +129,10 @@ class Dictionary: Requires package `tokenizers` to be installed. :param mode: Use this split mode (C by default) - :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - First two parameters are the index (int) and HuggingFace NormalizedString. - The handler must return a List[NormalizedString]. By default, just segment the tokens. + :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... @@ -126,7 +142,7 @@ class Dictionary: Returns POS with the given id. :param pos_id: POS id - :return: POS tuple with the given id. + :return: POS tuple with the given id or None for non existing id. """ ... @@ -197,7 +213,8 @@ class Morpheme: def part_of_speech(self) -> POS: """ - Returns the part of speech. + Returns the part of speech as a six-element tuple. + Tuple elements are four POS levels, conjugation type and conjugation form. """ ... 
@@ -217,8 +234,8 @@ class Morpheme: """ Returns sub-morphemes in the provided split mode. - :param mode: mode of new split - :param out: write results to this MorhpemeList instead of creating new one + :param mode: mode of new split. + :param out: write results to this MorhpemeList instead of creating new one. See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for more information on output parameters. Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. @@ -230,6 +247,7 @@ class Morpheme: def surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + See `Config.projection`. """ ... @@ -237,6 +255,7 @@ class Morpheme: def raw_surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme regardless the configured projection. + See `Config.projection`. """ ... @@ -255,7 +274,7 @@ class Morpheme: def __len__(self) -> int: """ - Returns morpheme length in codepoints + Returns morpheme length in codepoints. """ @@ -293,6 +312,11 @@ class MorphemeList: class Tokenizer: + """ + A sudachi tokenizer + + Create using Dictionary.create method. + """ SplitMode: ClassVar[SplitMode] = ... @classmethod def __init__(cls) -> None: ... @@ -303,13 +327,12 @@ class Tokenizer: """ Break text into morphemes. - SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - - :param text: text to analyze + :param text: text to analyze. :param mode: analysis mode. This parameter is deprecated. Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + :param logger: Arg for v0.5.* compatibility. Ignored. :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. 
See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. """ @@ -342,41 +365,44 @@ class WordInfo: class PosMatcher: + """ + A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. + + Create using Dictionary.pos_matcher method. + """ + def __iter__(self) -> Iterator[POS]: ... def __len__(self) -> int: ... def __call__(self, m: Morpheme) -> bool: """ - Checks whether a morpheme has matching POS - :param m: morpheme - :return: if morpheme has matching POS + Checks whether a morpheme has matching POS. + + :param m: morpheme. + :return: if morpheme has matching POS. """ ... def __or__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if any of two matchers would match it - :return: PosMatcher + Returns a POS matcher which matches a POS if any of two matchers would match it. """ ... def __and__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if both matchers would match it at the same time - :return: PosMatcher + Returns a POS matcher which matches a POS if both matchers would match it at the same time. """ ... def __sub__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS - :return: PosMatcher + Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. """ ... def __invert__(self) -> PosMatcher: """ - Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher - :return: PosMatcher + Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. """ ... diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index e208492f..5f1e8f65 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -80,11 +80,12 @@ impl PyDicData { /// A sudachi dictionary. 
/// -/// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. -/// If both config.systemDict and dict_type are given, dict_type is used. +/// If both config.systemDict and dict are not given, `sudachidict_core` is used. +/// If both config.systemDict and dict are given, dict is used. /// If dict is an absolute path to a file, it is used as a dictionary. /// -/// :param config_path: path to the configuration JSON file. +/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. +/// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. @@ -100,11 +101,12 @@ pub struct PyDictionary { impl PyDictionary { /// Creates a sudachi dictionary. /// - /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. - /// If both config.systemDict and dict_type are given, dict_type is used. + /// If both config.systemDict and dict are not given, `sudachidict_core` is used. + /// If both config.systemDict and dict are given, dict is used. /// If dict is an absolute path to a file, it is used as a dictionary. /// - /// :param config_path: path to the configuration JSON file. + /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. + /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. 
@@ -229,11 +231,12 @@ impl PyDictionary { /// Creates a sudachi tokenizer. /// - /// :param mode: tokenizer's default split mode (C by default). + /// :param mode: sets the analysis mode for this Tokenizer /// :param fields: load only a subset of fields. - /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. #[pyo3( - text_signature="(self, /, mode=None, fields=None, *, projection=None) -> Tokenizer", + text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer", signature=(mode=None, fields=None, *, projection=None) )] fn create<'py>( @@ -267,14 +270,13 @@ impl PyDictionary { /// Creates a POS matcher object /// /// If target is a function, then it must return whether a POS should match or not. - /// If target a list, it should contain partially specified POS. - /// By partially specified it means that it is possible to omit POS fields or - /// use None as a sentinel value that matches any POS. + /// If target is a list, it should contain partially specified POS. + /// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. /// /// For example, ('名詞',) will match any noun and /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. /// - /// :param target: can be either a callable or list of POS partial tuples + /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. 
#[pyo3(text_signature="(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) @@ -285,15 +287,13 @@ impl PyDictionary { /// /// :param mode: Use this split mode (C by default) /// :param fields: ask Sudachi to load only a subset of fields. - /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - /// :param handler: a custom callable to transform MorphemeList into list of tokens. - /// It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. - /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - /// If nothing was passed, simply use surface as token representations. - /// :param projection: projection mode for a created PreTokenizer. - /// See :class:`sudachipy.config.Config` object documentation for supported projections. + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + /// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. /// - /// :type mode: sudachipy.SplitMode + /// :type mode: SplitMode /// :type fields: Set[str] #[pyo3( text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", @@ -350,8 +350,9 @@ impl PyDictionary { /// :param surface: find all morphemes with the given surface /// :param out: if passed, reuse the given morpheme list instead of creating a new one. 
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. + /// /// :type surface: str - /// :type out: sudachipy.MorphemeList + /// :type out: MorphemeList #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, @@ -379,14 +380,17 @@ impl PyDictionary { Ok(l) } - /// Close this dictionary + /// Close this dictionary. #[pyo3(text_signature="(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } - /// Get POS Tuple by its id - #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str]")] + /// Returns POS with the given id. + /// + /// :param pos_id: POS id + /// :return: POS tuple with the given id or None for non existing id. + #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index f1aa204d..0a18f6c4 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -31,7 +31,10 @@ use crate::word_info::PyWordInfo; pub(crate) type PyMorphemeList = MorphemeList>; pub(crate) type PyProjector = Option>; -/// A list of morphemes +/// A list of morphemes. +/// +/// An object can not be instantiated manually. +/// Use Tokenizer.tokenize("") to create an empty morpheme list. #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeList")] pub struct PyMorphemeListWrapper { /// use `internal()` function instead @@ -89,7 +92,7 @@ impl PyMorphemeListWrapper { #[pymethods] impl PyMorphemeListWrapper { - /// Returns an empty morpheme list with dictionary + /// Returns an empty morpheme list with dictionary. 
#[classmethod] #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult { @@ -109,7 +112,7 @@ impl PyMorphemeListWrapper { }) } - /// Returns the total cost of the path + /// Returns the total cost of the path. #[pyo3(text_signature="(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() @@ -278,21 +281,23 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { - /// Returns the begin index of this in the input text + /// Returns the begin index of this in the input text. #[pyo3(text_signature="(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } - /// Returns the end index of this in the input text + /// Returns the end index of this in the input text. #[pyo3(text_signature="(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() } - /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured + /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + /// + /// See `Config.projection`. #[pyo3(text_signature="(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); @@ -303,14 +308,16 @@ impl PyMorpheme { } } - /// Returns the substring of input text corresponding to the morpheme regardless the configured projection + /// Returns the substring of input text corresponding to the morpheme regardless the configured projection. + /// + /// See `Config.projection`. #[pyo3(text_signature="(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. - /// Tuple elements are four POS levels, conjugation type and conjugation form. 
+ /// Tuple elements are four POS levels, conjugation type and conjugation form. #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py { let pos_id = self.part_of_speech_id(py); @@ -321,25 +328,25 @@ impl PyMorpheme { .clone_ref(py) } - /// Returns the id of the part of speech in the dictionary + /// Returns the id of the part of speech in the dictionary. #[pyo3(text_signature="(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } - /// Returns the dictionary form + /// Returns the dictionary form. #[pyo3(text_signature="(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } - /// Returns the normalized form + /// Returns the normalized form. #[pyo3(text_signature="(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } - /// Returns the reading form + /// Returns the reading form. #[pyo3(text_signature="(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) @@ -347,13 +354,14 @@ impl PyMorpheme { /// Returns sub-morphemes in the provided split mode. /// - /// :param mode: mode of new split - /// :param out: write results to this MorhpemeList instead of creating new one + /// :param mode: mode of new split. + /// :param out: write results to this MorhpemeList instead of creating new one. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for /// more information on output parameters. /// Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements. 
/// When False is passed, empty lists are returned instead. + /// /// :type mode: sudachipy.SplitMode /// :type out: Optional[sudachipy.MorphemeList] /// :type add_single: bool @@ -401,19 +409,19 @@ impl PyMorpheme { Ok(out_cell) } - /// Returns whether if this is out of vocabulary word + /// Returns whether if this is out of vocabulary word. #[pyo3(text_signature="(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } - /// Returns word id of this word in the dictionary + /// Returns word id of this word in the dictionary. #[pyo3(text_signature="(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } - /// Returns the dictionary id which this word belongs + /// Returns the dictionary id which this word belongs. #[pyo3(text_signature="(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); @@ -424,7 +432,7 @@ impl PyMorpheme { } } - /// Returns the list of synonym group ids + /// Returns the list of synonym group ids. #[pyo3(text_signature="(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); @@ -432,7 +440,7 @@ impl PyMorpheme { PyList::new(py, ids) } - /// Returns the word info + /// Returns the word info. #[pyo3(text_signature="(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; @@ -441,7 +449,7 @@ impl PyMorpheme { Ok(self.morph(py).get_word_info().clone().into()) } - /// Returns morpheme length in codepoints + /// Returns morpheme length in codepoints. 
pub fn __len__(&self, py: Python) -> usize { let m = self.morph(py); m.end_c() - m.begin_c() diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 586c7d90..16d1fa56 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -30,7 +30,7 @@ use crate::morpheme::PyMorpheme; /// /// Create using Dictionary.pos_matcher method. /// -/// Use `__call__(m: Morpheme) -> bool` to check if given morpheme matches the PosMatcher. +/// Use `__call__(m: Morpheme) -> bool` to check whether a morpheme has matching POS. #[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")] pub struct PyPosMatcher { matcher: PosMatcher, @@ -123,6 +123,10 @@ impl PyPosMatcher { #[pymethods] impl PyPosMatcher { + /// Checks whether a morpheme has matching POS. + /// + /// :param m: morpheme. + /// :return: if morpheme has matching POS. pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool { let pos_id = m.part_of_speech_id(py); self.matcher.matches_id(pos_id) @@ -140,6 +144,7 @@ impl PyPosMatcher { self.matcher.num_entries() } + /// Returns a POS matcher which matches a POS if any of two matchers would match it. pub fn __or__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -153,6 +158,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if both matchers would match it at the same time. pub fn __and__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -166,6 +172,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. pub fn __sub__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -179,6 +186,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. 
pub fn __invert__(&self) -> Self { let max_id = self.dic.pos.len(); // map -> filter chain is needed to handle exactly u16::MAX POS entries diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 16f2482a..8c7c1c84 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -35,7 +35,7 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// B == middle mode /// C == long mode /// -/// :param mode: str to parse. One of [A,B,C] in captital or lower case. +/// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -88,7 +88,7 @@ impl PySplitMode { } } -/// Sudachi Tokenizer +/// A sudachi tokenizer /// /// Create using Dictionary.create method. #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] @@ -123,15 +123,15 @@ impl PyTokenizer { /// Break text into morphemes. /// - /// SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - /// - /// :param text: text to analyze + /// :param text: text to analyze. /// :param mode: analysis mode. /// This parameter is deprecated. /// Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. /// If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + /// :param logger: Arg for v0.5.* compatibility. Ignored. /// :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. 
+ /// /// :type text: str /// :type mode: sudachipy.SplitMode /// :type out: sudachipy.MorphemeList From 706a573311551542cc726486daabfb10bf2c5966 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 13:53:56 +0900 Subject: [PATCH 6/9] add type fields for rs --- python/src/build.rs | 26 ++++++++++++++++++++++-- python/src/dictionary.rs | 36 ++++++++++++++++++++++++++------- python/src/morpheme.rs | 42 +++++++++++++++++++-------------------- python/src/pos_matcher.rs | 4 +++- python/src/tokenizer.rs | 14 +++++++++---- 5 files changed, 86 insertions(+), 36 deletions(-) diff --git a/python/src/build.rs b/python/src/build.rs index 350f2fb3..2b2ce94f 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -59,8 +59,19 @@ fn create_file(p: &Path) -> std::io::Result { } /// Build system dictionary from matrix and lexicons. +/// +/// :param matrix: Path to the matrix file. +/// :param lex: List of paths to lexicon files. +/// :param output: Path to output built dictionray. +/// :param description: A description text to embed in the dictionary. +/// :return: A build report, list of (part, size, time). +/// +/// :type matrix: pathlib.Path | str | bytes +/// :type lex: list[pathlib.Path | str | bytes] +/// :type output: pathlib.Path | str +/// :type description: str #[pyfunction] -#[pyo3(text_signature="(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")] +#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_system_dic<'p>( py: Python<'p>, matrix: &'p PyAny, @@ -89,8 +100,19 @@ fn build_system_dic<'p>( } /// Build user dictionary from lexicons based on the given system dictionary. +/// +/// :param system: Path to the system dictionary. +/// :param lex: List of paths to lexicon files. +/// :param output: Path to output built dictionray. +/// :param description: A description text to embed in the dictionary. +/// :return: A build report, list of (part, size, time). 
+/// +/// :type system: pathlib.Path | str +/// :type lex: list[pathlib.Path | str | bytes] +/// :type output: pathlib.Path | str +/// :type description: str #[pyfunction] -#[pyo3(text_signature="(system, lex, output, description=None) -> list[tuple[str, int, float]]")] +#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")] fn build_user_dic<'p>( py: Python<'p>, system: &'p PyAny, diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 5f1e8f65..2b5c849b 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -90,6 +90,12 @@ impl PyDicData { /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. +/// +/// :type config_path: Config | pathlib.Path | str | None +/// :type config: Config | pathlib.Path | str | None +/// :type resource_dir: pathlib.Path | str | None +/// :type dict: pathlib.Path | str | None +/// :type dict_type: pathlib.Path | str | None #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")] #[derive(Clone)] pub struct PyDictionary { @@ -111,6 +117,12 @@ impl PyDictionary { /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. 
+ /// + /// :type config_path: Config | pathlib.Path | str | None + /// :type config: Config | pathlib.Path | str | None + /// :type resource_dir: pathlib.Path | str | None + /// :type dict: pathlib.Path | str | None + /// :type dict_type: pathlib.Path | str | None #[new] #[pyo3( text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary", @@ -235,6 +247,10 @@ impl PyDictionary { /// :param fields: load only a subset of fields. /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. /// :param projection: Projection override for created Tokenizer. See Config.projection for values. + /// + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type projection: str | None #[pyo3( text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer", signature=(mode=None, fields=None, *, projection=None) @@ -277,7 +293,9 @@ impl PyDictionary { /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. /// /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. - #[pyo3(text_signature="(self, /, target) -> PosMatcher")] + /// + /// :type target: Iterable[PartialPOS] | Callable[[POS], bool] + #[pyo3(text_signature = "(self, /, target) -> PosMatcher")] fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -293,8 +311,10 @@ impl PyDictionary { /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. /// :param projection: Projection override for created Tokenizer. See Config.projection for values. 
/// - /// :type mode: SplitMode - /// :type fields: Set[str] + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type handler: Callable[[int, NormalizedString, MorphemeList], list[NormalizedString]] | None + /// :type projection: str | None #[pyo3( text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", signature=(mode=None, fields=None, handler=None, *, projection=None) @@ -352,8 +372,8 @@ impl PyDictionary { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. /// /// :type surface: str - /// :type out: MorphemeList - #[pyo3(text_signature="(self, /, surface, out=None) -> MorphemeList")] + /// :type out: MorphemeList | None + #[pyo3(text_signature = "(self, /, surface, out=None) -> MorphemeList")] fn lookup<'p>( &'p self, py: Python<'p>, @@ -381,7 +401,7 @@ impl PyDictionary { } /// Close this dictionary. - #[pyo3(text_signature="(self, /) -> ()")] + #[pyo3(text_signature = "(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } @@ -390,7 +410,9 @@ impl PyDictionary { /// /// :param pos_id: POS id /// :return: POS tuple with the given id or None for non existing id. - #[pyo3(text_signature="(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] + /// + /// :type pos_id: int + #[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { let dic = self.dictionary.as_ref().unwrap(); dic.pos.get(pos_id).map(|x| x.as_ref(py)) diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 0a18f6c4..522d8ecd 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -94,7 +94,7 @@ impl PyMorphemeListWrapper { impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary. 
#[classmethod] - #[pyo3(text_signature="(dict: Dictionary) -> MorphemeList")] + #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn( @@ -113,13 +113,13 @@ impl PyMorphemeListWrapper { } /// Returns the total cost of the path. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() } /// Returns the number of morpheme in this list. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn size(&self, py: Python) -> usize { self.internal(py).len() } @@ -282,14 +282,14 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { /// Returns the begin index of this in the input text. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } /// Returns the end index of this in the input text. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() @@ -298,7 +298,7 @@ impl PyMorpheme { /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. /// /// See `Config.projection`. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { let list = self.list(py); let morph = self.morph(py); @@ -311,14 +311,14 @@ impl PyMorpheme { /// Returns the substring of input text corresponding to the morpheme regardless the configured projection. /// /// See `Config.projection`. 
- #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { PyString::new(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. /// Tuple elements are four POS levels, conjugation type and conjugation form. - #[pyo3(text_signature="(self, /) -> tuple[str, str, str, str, str, str]")] + #[pyo3(text_signature = "(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py { let pos_id = self.part_of_speech_id(py); self.list(py) @@ -329,25 +329,25 @@ impl PyMorpheme { } /// Returns the id of the part of speech in the dictionary. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } /// Returns the dictionary form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } /// Returns the normalized form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } /// Returns the reading form. - #[pyo3(text_signature="(self, /) -> str")] + #[pyo3(text_signature = "(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) } @@ -362,12 +362,10 @@ impl PyMorpheme { /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements. /// When False is passed, empty lists are returned instead. 
/// - /// :type mode: sudachipy.SplitMode - /// :type out: Optional[sudachipy.MorphemeList] + /// :type mode: SplitMode | None + /// :type out: MorphemeList | None /// :type add_single: bool - #[pyo3( - text_signature="(self, /, mode, out=None, add_single=False) -> MorphemeList" - )] + #[pyo3(text_signature = "(self, /, mode, out=None, add_single=False) -> MorphemeList")] fn split<'py>( &'py self, py: Python<'py>, @@ -410,19 +408,19 @@ impl PyMorpheme { } /// Returns whether if this is out of vocabulary word. - #[pyo3(text_signature="(self, /) -> bool")] + #[pyo3(text_signature = "(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } /// Returns word id of this word in the dictionary. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } /// Returns the dictionary id which this word belongs. - #[pyo3(text_signature="(self, /) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); if word_id.is_oov() { @@ -433,7 +431,7 @@ impl PyMorpheme { } /// Returns the list of synonym group ids. - #[pyo3(text_signature="(self, /) -> List[int]")] + #[pyo3(text_signature = "(self, /) -> List[int]")] fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); @@ -441,7 +439,7 @@ impl PyMorpheme { } /// Returns the word info. 
- #[pyo3(text_signature="(self, /) -> WordInfo")] + #[pyo3(text_signature = "(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index 16d1fa56..bb9749f2 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -125,8 +125,10 @@ impl PyPosMatcher { impl PyPosMatcher { /// Checks whether a morpheme has matching POS. /// - /// :param m: morpheme. + /// :param m: a morpheme to check. /// :return: if morpheme has matching POS. + /// + /// :type m: Morpheme pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool { let pos_id = m.part_of_speech_id(py); self.matcher.matches_id(pos_id) diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 8c7c1c84..c14f7076 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -36,6 +36,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// C == long mode /// /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. +/// If None, returns SplitMode.C. +/// +/// :type mode: str | None #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -67,9 +70,12 @@ impl From for PySplitMode { #[pymethods] impl PySplitMode { - /// Parse SplitMode from a character. + /// Creates a split mode from a string value. + /// + /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. + /// If None, returns SplitMode.C. /// - /// :param mode: str to parse. One of [A,B,C] in captital or lower case. 
+ /// :type mode: str | None #[new] #[pyo3( text_signature="(mode=None) -> SplitMode", @@ -133,8 +139,8 @@ impl PyTokenizer { /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. /// /// :type text: str - /// :type mode: sudachipy.SplitMode - /// :type out: sudachipy.MorphemeList + /// :type mode: SplitMode | str | None + /// :type out: MorphemeList #[pyo3( text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList", signature=(text, mode=None, logger=None, out=None) From 5d8620ee643096027a687275b26838cb70874a68 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 14:16:45 +0900 Subject: [PATCH 7/9] improve pyi --- python/py_src/sudachipy/sudachipy.pyi | 47 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 705b62af..0b1c4fc2 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -28,12 +28,20 @@ PartialPOS = Union[ Tuple[()], ] -# Fields that can be specified for partial dictionary loading. -# See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. +""" +Fields that can be specified for partial dictionary loading. +See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. +""" FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form", "word_structure", "split_a", "split_b", "synonym_group_id"]]] +""" +Strings that can be parsed as SplitMode +""" +SplitModeStr = Literal["A", "a", "B", "b", "C", "c"] + + class SplitMode: """ Unit to split text. @@ -48,11 +56,12 @@ class SplitMode: C: ClassVar[SplitMode] = ... @classmethod - def __init__(cls, mode: str = "C") -> None: + def __init__(cls, mode: Optional[SplitModeStr] = "C") -> None: """ Creates a split mode from a string value. 
:param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. + If None, returns SplitMode.C. """ ... @@ -88,10 +97,10 @@ class Dictionary: ... def create(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = SplitMode.C, - fields: FieldSet = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, *, - projection: str = None) -> Tokenizer: + projection: Optional[str] = None) -> Tokenizer: """ Creates a sudachi tokenizer. @@ -118,12 +127,12 @@ class Dictionary: ... def pre_tokenizer(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = "C", - fields: FieldSet = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, handler: Optional[Callable[[ int, object, MorphemeList], list]] = None, *, - projection: str = None) -> object: + projection: Optional[str] = None) -> object: """ Creates HuggingFace Tokenizers-compatible PreTokenizer. Requires package `tokenizers` to be installed. @@ -230,7 +239,10 @@ class Morpheme: """ ... - def split(self, mode: Union[SplitMode, Literal["A", "B", "C"]], out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList: + def split(self, + mode: Union[SplitMode, SplitModeStr], + out: Optional[MorphemeList] = None, + add_single: bool = True) -> MorphemeList: """ Returns sub-morphemes in the provided split mode. @@ -288,7 +300,7 @@ class MorphemeList: def __init__(self) -> None: ... @classmethod - def empty(cls, dict) -> MorphemeList: + def empty(cls, dict: Dictionary) -> MorphemeList: """ Returns an empty morpheme list with dictionary. """ @@ -306,7 +318,7 @@ class MorphemeList: """ ... - def __getitem__(self, index) -> Morpheme: ... + def __getitem__(self, index: int) -> Morpheme: ... def __iter__(self) -> Iterator[Morpheme]: ... def __len__(self) -> int: ... @@ -318,11 +330,13 @@ class Tokenizer: Create using Dictionary.create method. """ SplitMode: ClassVar[SplitMode] = ... 
+ @classmethod def __init__(cls) -> None: ... - def tokenize(self, text: str, - mode: Union[SplitMode, Literal["A", "B", "C"]] = ..., + def tokenize(self, + text: str, + mode: Union[SplitMode, SplitModeStr, None] = None, out: Optional[MorphemeList] = None) -> MorphemeList: """ Break text into morphemes. @@ -359,6 +373,7 @@ class WordInfo: surface: ClassVar[str] = ... synonym_group_ids: ClassVar[List[int]] = ... word_structure: ClassVar[List[int]] = ... + @classmethod def __init__(self) -> None: ... def length(self) -> int: ... @@ -374,11 +389,11 @@ class PosMatcher: def __iter__(self) -> Iterator[POS]: ... def __len__(self) -> int: ... - def __call__(self, m: Morpheme) -> bool: + def __call__(self, /, m: Morpheme) -> bool: """ Checks whether a morpheme has matching POS. - :param m: morpheme. + :param m: a morpheme to check. :return: if morpheme has matching POS. """ ... From d1c31655292adc80e1b9a1051bb4b90752500e6f Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 8 Jul 2024 14:51:00 +0900 Subject: [PATCH 8/9] add deprecated directive and fix --- python/py_src/sudachipy/sudachipy.pyi | 9 +++++++++ python/src/build.rs | 4 ++-- python/src/dictionary.rs | 4 ++-- python/src/morpheme.rs | 6 ++++++ python/src/tokenizer.rs | 2 ++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 0b1c4fc2..ca39a95c 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -47,7 +47,9 @@ class SplitMode: Unit to split text. A == short mode + B == middle mode + C == long mode """ @@ -205,6 +207,9 @@ class Morpheme: def get_word_info(self) -> WordInfo: """ Returns the word info. + + .. deprecated:: 0.6.0 + Users should not touch the raw WordInfo. """ ... @@ -293,6 +298,7 @@ class Morpheme: class MorphemeList: """ A list of morphemes. + An object can not be instantiated manually. Use Tokenizer.tokenize("") to create an empty morpheme list.
""" @@ -303,6 +309,9 @@ class MorphemeList: def empty(cls, dict: Dictionary) -> MorphemeList: """ Returns an empty morpheme list with dictionary. + + .. deprecated:: + Use Tokenizer.tokenize("") if you need. """ ... diff --git a/python/src/build.rs b/python/src/build.rs index 2b2ce94f..b37ed807 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -65,7 +65,7 @@ fn create_file(p: &Path) -> std::io::Result { /// :param output: Path to output built dictionray. /// :param description: A description text to embed in the dictionary. /// :return: A build report, list of (part, size, time). -/// +/// /// :type matrix: pathlib.Path | str | bytes /// :type lex: list[pathlib.Path | str | bytes] /// :type output: pathlib.Path | str @@ -106,7 +106,7 @@ fn build_system_dic<'p>( /// :param output: Path to output built dictionray. /// :param description: A description text to embed in the dictionary. /// :return: A build report, list of (part, size, time). -/// +/// /// :type system: pathlib.Path | str /// :type lex: list[pathlib.Path | str | bytes] /// :type output: pathlib.Path | str diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 2b5c849b..22241f95 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -87,7 +87,7 @@ impl PyDicData { /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. -/// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. +/// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. 
/// @@ -114,7 +114,7 @@ impl PyDictionary { /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. /// :param config: alias to config_path, only one of them can be specified at the same time. /// :param resource_dir: path to the resource directory folder. - /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. + /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. /// :param dict_type: deprecated alias to dict. /// diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index 522d8ecd..b9367e10 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -93,6 +93,9 @@ impl PyMorphemeListWrapper { #[pymethods] impl PyMorphemeListWrapper { /// Returns an empty morpheme list with dictionary. + /// + /// .. deprecated:: 0.6.0 + /// Use Tokenizer.tokenize("") if you need. #[classmethod] #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")] fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; @@ -439,6 +442,9 @@ impl PyMorpheme { } /// Returns the word info. + /// + /// .. deprecated:: 0.6.0 + /// Users should not touch the raw WordInfo. #[pyo3(text_signature = "(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult { let cat = PyModule::import(py, "builtins")?.getattr("DeprecationWarning")?; PyErr::warn(py, cat, "Users should not touch the raw WordInfo.", 1)?; diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index c14f7076..d96763de 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -32,7 +32,9 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; /// Unit to split text. /// /// A == short mode +/// /// B == middle mode +/// /// C == long mode /// /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case.
From 4a3da5bacd868112165ac5f3c5c49d5f82eba48f Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Wed, 25 Sep 2024 09:36:20 +0900 Subject: [PATCH 9/9] update Dictionary arg name --- python/README.md | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/python/README.md b/python/README.md index 4d95d7fb..b1ad3e5e 100644 --- a/python/README.md +++ b/python/README.md @@ -66,7 +66,7 @@ $ pip install sudachipy ### Step 2. Get a Dictionary -You can get dictionary as a Python package. It make take a while to download the dictionary file (around 70MB for the `core` edition). +You can get a dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition). ```bash $ pip install sudachidict_core @@ -209,7 +209,7 @@ There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `fu SudachiPy uses `sudachidict_core` by default. -Dictionaries are installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. +Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. * [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/) * [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/) @@ -234,19 +234,19 @@ $ echo "外国人参政権" | sudachipy -s full ### Dictionary option: Python package -You can specify the dictionary with the `Dicionary()` argument; `config_path` or `dict_type`. +You can specify the dictionary with the `Dictionary()` arguments `config` or `dict`. ```python -class Dictionary(config_path=None, resource_dir=None, dict_type=None) +class Dictionary(config=None, resource_dir=None, dict=None) ``` -1. `config_path` - * You can specify the file path to the setting file with `config_path` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). +1.
`config` + * You can specify the file path to the setting file with `config` (see [Dictionary in The Setting File](#dictionary-in-the-setting-file) for details). * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary. -2. `dict_type` - * You can also specify the dictionary type with `dict_type`. - * The available arguments are `small`, `core`, or `full`. - * If different dictionaries are specified with `config_path` and `dict_type`, **a dictionary defined `dict_type` overrides** those defined in the config path. +2. `dict` + * You can also specify the dictionary type with `dict`. + * The available arguments are `small`, `core`, `full`, or a path to the dictionary file. + * If different dictionaries are specified with `config` and `dict`, **the dictionary specified by `dict` overrides** the one defined in the config. ```python from sudachipy import Dictionary @@ -255,16 +255,16 @@ from sudachipy import Dictionary tokenizer_obj = Dictionary().create() # The dictionary given by the `systemDict` key in the config file (/path/to/sudachi.json) will be used -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json").create() -# The dictionary specified by `dict_type` will be set. -tokenizer_obj = Dictionary(dict_type="core").create() # sudachidict_core (same as default) -tokenizer_obj = Dictionary(dict_type="small").create() # sudachidict_small -tokenizer_obj = Dictionary(dict_type="full").create() # sudachidict_full +# The dictionary specified by `dict` will be used. +tokenizer_obj = Dictionary(dict="core").create() # sudachidict_core (same as default) +tokenizer_obj = Dictionary(dict="small").create() # sudachidict_small +tokenizer_obj = Dictionary(dict="full").create() # sudachidict_full -# The dictionary specified by `dict_type` overrides those defined in the config path. +# The dictionary specified by `dict` overrides those defined in the config.
# In the following code, `sudachidict_full` will be used regardless of a dictionary defined in the config file. -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json", dict_type="full").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create() ``` @@ -303,10 +303,8 @@ Then specify your `sudachi.json` with the `-r` option. $ sudachipy -r path/to/sudachi.json ``` - You can build a user dictionary with the subcommand `ubuild`. - ```bash $ sudachipy ubuild -h usage: sudachipy ubuild [-h] [-o file] [-d string] -s file file [file ...]