Skip to content

Commit

Permalink
Merge branch 'develop' into fix/267-python-document
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander authored Nov 11, 2024
2 parents 4a3da5b + e759196 commit ff2c2d2
Show file tree
Hide file tree
Showing 14 changed files with 371 additions and 421 deletions.
338 changes: 129 additions & 209 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ name = "sudachipy"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20", features = ["extension-module"] }
thread_local = "1.1" # Apache 2.0/MIT
pyo3 = { version = "0.21", features = ["extension-module"] }
scopeguard = "1" # Apache 2.0/MIT
thread_local = "1.1" # Apache 2.0/MIT

[dependencies.sudachi]
path = "../sudachi"
42 changes: 26 additions & 16 deletions python/py_src/sudachipy/command_line.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019 Works Applications Co., Ltd.
# Copyright (c) 2019-2024 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,13 @@
from . import sudachipy


# Configure the root logger: "{"-style format fields, each record rendered as
# level, timestamp (month-day-year time), call site [module:function:line]
# and then the message.
logging.basicConfig(
style="{",
format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}',
datefmt="%m-%d-%Y %H:%M:%S",
)


def _set_default_subparser(self, name, args=None):
"""
copy and modify code from https://bitbucket.org/ruamel/std.argparse
Expand Down Expand Up @@ -51,7 +58,7 @@ def _set_default_subparser(self, name, args=None):
argparse.ArgumentParser.set_default_subparser = _set_default_subparser


def run(tokenizer, input_, output, print_all, morphs, is_stdout):
def run(tokenizer, input_, output, print_all, pos_list, is_stdout):
# get an empty MorphemeList for memory reuse
mlist = tokenizer.tokenize("")
for line in input_:
Expand All @@ -60,7 +67,7 @@ def run(tokenizer, input_, output, print_all, morphs, is_stdout):
for m in tokenizer.tokenize(line, out=mlist):
list_info = [
m.surface(),
morphs[m.part_of_speech_id()],
pos_list[m.part_of_speech_id()],
m.normalized_form()]
if print_all:
list_info += [
Expand Down Expand Up @@ -97,27 +104,27 @@ def _command_tokenize(args, print_usage):
if args.fpath_out:
output = open(args.fpath_out, "w", encoding="utf-8")

stdout_logger = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
stdout_logger.addHandler(handler)
stdout_logger.setLevel(logging.DEBUG)
stdout_logger.propagate = False
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

print_all = args.a
debug = args.d
if debug:
logger.warning("-d option is not implemented in python.")

try:
dict_ = Dictionary(config_path=args.fpath_setting,
dict_type=args.system_dict_type)
# empty matcher - get all POS tags
all_morphs = dict_.pos_matcher([()])
all_pos_matcher = dict_.pos_matcher([()])
# precompute output POS strings
morphs = [",".join(ms) for ms in all_morphs]
pos_list = [",".join(ms) for ms in all_pos_matcher]

tokenizer_obj = dict_.create(mode=args.mode)
input_ = fileinput.input(
args.in_files, openhook=fileinput.hook_encoded("utf-8"))
run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None)
run(tokenizer_obj, input_, output, print_all,
pos_list, is_stdout=args.fpath_out is None)
finally:
if args.fpath_out:
output.close()
Expand All @@ -139,7 +146,8 @@ def _command_build(args, print_usage):

out_file = Path(args.out_file)
if out_file.exists():
print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
print("File", out_file,
"already exists, refusing to overwrite it", file=sys.stderr)
return

description = args.description or ""
Expand All @@ -161,7 +169,8 @@ def _command_build(args, print_usage):
def _command_user_build(args, print_usage):
system = Path(args.system_dic)
if not system.exists():
print("System dictionary file", system, "does not exist", file=sys.stderr)
print("System dictionary file", system,
"does not exist", file=sys.stderr)
return print_usage()

in_files = []
Expand All @@ -174,7 +183,8 @@ def _command_user_build(args, print_usage):

out_file = Path(args.out_file)
if out_file.exists():
print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
print("File", out_file,
"already exists, refusing to overwrite it", file=sys.stderr)
return

description = args.description or ""
Expand Down Expand Up @@ -217,7 +227,7 @@ def main():
parser_tk.add_argument("-a", action="store_true",
help="print all of the fields")
parser_tk.add_argument("-d", action="store_true",
help="print the debug information")
help="print the debug information (not implemented yet)")
parser_tk.add_argument("-v", "--version", action="store_true",
dest="version", help="print sudachipy version")
parser_tk.add_argument("in_files", metavar="file",
Expand Down
98 changes: 59 additions & 39 deletions python/src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@ use sudachi::config::Config;
use sudachi::dic::build::{DataSource, DictBuilder};
use sudachi::dic::dictionary::JapaneseDictionary;

pub fn register_functions(m: &PyModule) -> PyResult<()> {
/// Adds the dictionary-building functions (`build_system_dic` and
/// `build_user_dic`) to the Python module `m`.
///
/// # Errors
/// Propagates the first error raised while wrapping either function or
/// while adding it to the module.
pub fn register_functions(m: &Bound<PyModule>) -> PyResult<()> {
    let system_builder = wrap_pyfunction!(build_system_dic, m)?;
    m.add_function(system_builder)?;

    let user_builder = wrap_pyfunction!(build_user_dic, m)?;
    m.add_function(user_builder)?;

    Ok(())
}

fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<&PyList> {
let stats = PyList::empty(py);
fn to_stats<T: DictionaryAccess>(py: Python, builder: DictBuilder<T>) -> PyResult<Bound<PyList>> {
let stats = PyList::empty_bound(py);

for p in builder.report() {
let t = PyTuple::new(
let t = PyTuple::new_bound(
py,
[
p.part().into_py(py),
Expand Down Expand Up @@ -72,23 +72,26 @@ fn create_file(p: &Path) -> std::io::Result<File> {
/// :type description: str
#[pyfunction]
#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")]
fn build_system_dic<'p>(
py: Python<'p>,
matrix: &'p PyAny,
lex: &'p PyList,
output: &'p PyAny,
fn build_system_dic<'py>(
py: Python<'py>,
matrix: &Bound<'py, PyAny>,
lex: &Bound<'py, PyList>,
output: &Bound<'py, PyAny>,
description: Option<&str>,
) -> PyResult<&'p PyList> {
) -> PyResult<Bound<'py, PyList>> {
let mut builder = DictBuilder::new_system();
description.map(|d| builder.set_description(d));

let matrix_src = as_data_source(py, matrix)?;
let matrix_path = resolve_as_pypathstr(py, matrix)?;
let matrix_src = as_data_source(matrix_path.as_ref(), matrix)?;
errors::wrap_ctx(builder.read_conn(matrix_src), matrix)?;
for f in lex.iter() {
let lex_src = as_data_source(py, &f)?;
let lex_path = resolve_as_pypathstr(py, &f)?;
let lex_src = as_data_source(lex_path.as_ref(), &f)?;
errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
}
let out_file = match as_data_source(py, output)? {
let out_path = resolve_as_pypathstr(py, output)?;
let out_file = match as_data_source(out_path.as_ref(), output)? {
DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?,
DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")),
};
Expand All @@ -113,14 +116,15 @@ fn build_system_dic<'p>(
/// :type description: str
#[pyfunction]
#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")]
fn build_user_dic<'p>(
py: Python<'p>,
system: &'p PyAny,
lex: &'p PyList,
output: &'p PyAny,
fn build_user_dic<'py>(
py: Python<'py>,
system: &Bound<'py, PyAny>,
lex: &Bound<'py, PyList>,
output: &Bound<'py, PyAny>,
description: Option<&str>,
) -> PyResult<&'p PyList> {
let system_dic = match as_data_source(py, system)? {
) -> PyResult<Bound<'py, PyList>> {
let system_path = resolve_as_pypathstr(py, system)?;
let system_dic = match as_data_source(system_path.as_ref(), system)? {
DataSource::File(f) => {
let resource_path = get_default_resource_dir(py)?;
let cfg = Config::minimal_at(resource_path).with_system_dic(f);
Expand All @@ -137,10 +141,12 @@ fn build_user_dic<'p>(
description.map(|d| builder.set_description(d));

for f in lex.iter() {
let lex_src = as_data_source(py, &f)?;
let lex_path = resolve_as_pypathstr(py, &f)?;
let lex_src = as_data_source(lex_path.as_ref(), &f)?;
errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
}
let out_file = match as_data_source(py, output)? {
let out_path = resolve_as_pypathstr(py, output)?;
let out_file = match as_data_source(out_path.as_ref(), output)? {
DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?,
DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")),
};
Expand All @@ -151,25 +157,39 @@ fn build_user_dic<'p>(
to_stats(py, builder)
}

fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult<DataSource<'p>> {
let path = py
.import("pathlib")?
.getattr("Path")?
.downcast::<PyType>()?;
fn resolve_as_pypathstr<'py>(
py: Python<'py>,
data: &Bound<'py, PyAny>,
) -> PyResult<Option<Bound<'py, PyString>>> {
let binding = py.import_bound("pathlib")?.getattr("Path")?;
let path = binding.downcast::<PyType>()?;
if data.is_instance(path)? {
let pypath = data.call_method0("resolve")?.str()?;
Ok(DataSource::File(Path::new(pypath.to_str()?)))
Ok(Some(data.call_method0("resolve")?.str()?))
} else if data.is_instance_of::<PyString>() {
let pypath = data.str()?;
Ok(DataSource::File(Path::new(pypath.to_str()?)))
} else if data.is_instance_of::<PyBytes>() {
let data = data.downcast::<PyBytes>()?;
Ok(DataSource::Data(data.as_bytes()))
Ok(Some(data.str()?))
} else {
Err(pyo3::exceptions::PyValueError::new_err(format!(
"data source should can be only Path, bytes or str, was {}: {}",
data,
data.get_type()
)))
Ok(None)
}
}

/// Builds a [`DataSource`] that borrows its contents from Python objects.
///
/// * `resolved_path` - a path string already extracted from the object, if
///   any. When present it is always used and yields `DataSource::File`.
/// * `original_obj` - the object originally supplied by the caller. It is
///   consulted only when there is no resolved path, in which case it must be
///   a `bytes` instance and yields `DataSource::Data`.
///
/// # Errors
/// Returns a `PyValueError` when there is no resolved path and the object is
/// not `bytes`. Errors from `to_str` and `downcast` are propagated.
fn as_data_source<'py>(
    resolved_path: Option<&'py Bound<'py, PyString>>,
    original_obj: &'py Bound<'py, PyAny>,
) -> PyResult<DataSource<'py>> {
    // A resolved path takes precedence over inspecting the object itself.
    if let Some(path_str) = resolved_path {
        let raw_path = path_str.to_str()?;
        return Ok(DataSource::File(Path::new(raw_path)));
    }

    // Without a path, only in-memory bytes are acceptable.
    if !original_obj.is_instance_of::<PyBytes>() {
        let message = format!(
            "data source should can be only Path, bytes or str, was {}: {}",
            original_obj,
            original_obj.get_type()
        );
        return Err(pyo3::exceptions::PyValueError::new_err(message));
    }

    let raw_bytes = original_obj.downcast::<PyBytes>()?;
    Ok(DataSource::Data(raw_bytes.as_bytes()))
}
Loading

0 comments on commit ff2c2d2

Please sign in to comment.