Skip to content

Commit

Permalink
Merge pull request #35 from rskmoi/v0.2.2
Browse files: browse the repository at this point in the history
V0.2.2
  • Loading branch information
rskmoi authored Jul 17, 2023
2 parents 4bfd380 + ececbe9 commit cdcc6ed
Show file tree
Hide file tree
Showing 37 changed files with 534 additions and 388 deletions.
29 changes: 18 additions & 11 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,38 @@ name: Python package
on: [push]

jobs:
build:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Lint with mypy, ruff and black
run: |
bash scripts/lint.sh
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: [ '3.8', '3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
pip install -r requirements.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest tests
Expand Down
3 changes: 0 additions & 3 deletions MANIFEST.in

This file was deleted.

24 changes: 13 additions & 11 deletions namedivider/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from .name_divider import NameDivider
from .divider.divided_name import DividedName
from .divider.basic_name_divider import BasicNameDivider
from .divider.config import BasicNameDividerConfig, NameDividerVersions
from .divider.divided_name import DividedName
from .divider.gbdt_name_divider import GBDTNameDivider
from .feature.kanji import KanjiStatistics
from .divider.config import NameDividerVersions, BasicNameDividerConfig
from .name_divider import NameDivider
from .version import __version__

__all__ = ["NameDivider",
"BasicNameDivider",
"GBDTNameDivider",
"DividedName",
"KanjiStatistics",
"NameDividerVersions",
"BasicNameDividerConfig",
"__version__"]
__all__ = [
"NameDivider",
"BasicNameDivider",
"GBDTNameDivider",
"DividedName",
"KanjiStatistics",
"NameDividerVersions",
"BasicNameDividerConfig",
"__version__",
]
66 changes: 35 additions & 31 deletions namedivider/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from pathlib import Path

import typer
from tqdm import tqdm
from pathlib import Path

from namedivider.divider.basic_name_divider import BasicNameDivider
from namedivider.divider.config import BasicNameDividerConfig, GBDTNameDividerConfig
from namedivider.divider.gbdt_name_divider import GBDTNameDivider
from namedivider.divider.name_divider_base import _NameDivider
from namedivider.divider.basic_name_divider import BasicNameDivider, BasicNameDividerConfig
from namedivider.divider.gbdt_name_divider import GBDTNameDivider, GBDTNameDividerConfig

CURRENT_DIR = Path(__file__).resolve().parent

Expand All @@ -12,19 +15,21 @@

def get_divider(mode: str, separator: str) -> _NameDivider:
if mode == "basic":
config = BasicNameDividerConfig(separator=separator)
return BasicNameDivider(config=config)
basic_config = BasicNameDividerConfig(separator=separator)
return BasicNameDivider(config=basic_config)
elif mode == "gbdt":
config = GBDTNameDividerConfig(separator=separator)
return GBDTNameDivider(config=config)
gbdt_config = GBDTNameDividerConfig(separator=separator)
return GBDTNameDivider(config=gbdt_config)
else:
raise ValueError(f"Mode must be in [basic, gbdt], but got {mode}")


@app.command()
def name(undivided_name: str = typer.Argument(..., help="Undivided name"),
separator: str = typer.Option(" ", "--separator", "-s", help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt.")):
def name(
undivided_name: str = typer.Argument(..., help="Undivided name"),
separator: str = typer.Option(" ", "--separator", "-s", help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt."),
) -> None:
"""
Divides an undivided name.
:param undivided_name: Undivided name
Expand All @@ -35,14 +40,14 @@ def name(undivided_name: str = typer.Argument(..., help="Undivided name"),


@app.command()
def file(undivided_name_text: Path = typer.Argument(...,
help="File path of text file",
exists=True,
dir_okay=False,
readable=True),
separator: str = typer.Option(" ", "--separator", "-s", help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt."),
encoding: str = typer.Option("utf-8", "--encoding", "-e", help="Encoding of text file")):
def file(
undivided_name_text: Path = typer.Argument(
..., help="File path of text file", exists=True, dir_okay=False, readable=True
),
separator: str = typer.Option(" ", "--separator", "-s", help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt."),
encoding: str = typer.Option("utf-8", "--encoding", "-e", help="Encoding of text file"),
) -> None:
"""
Divides names in text file.
The text file must have one name per line.
Expand Down Expand Up @@ -76,15 +81,14 @@ def file(undivided_name_text: Path = typer.Argument(...,


@app.command()
def accuracy(divided_name_text: Path = typer.Argument(...,
help="File path of text file",
exists=True,
dir_okay=False,
readable=True),
separator: str = typer.Option(" ", "--separator", "-s",
help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt."),
encoding: str = typer.Option("utf-8", "--encoding", "-e", help="Encoding of text file")):
def accuracy(
divided_name_text: Path = typer.Argument(
..., help="File path of text file", exists=True, dir_okay=False, readable=True
),
separator: str = typer.Option(" ", "--separator", "-s", help="Separator between family name and given name"),
mode: str = typer.Option("basic", "--mode", "-m", help="Divider Mode. You can choice basic or gbdt."),
encoding: str = typer.Option("utf-8", "--encoding", "-e", help="Encoding of text file"),
) -> None:
"""
Check the accuracy of this tool.
The text file must have one name per line, and each name must be divided by the separator.
Expand All @@ -109,13 +113,13 @@ def accuracy(divided_name_text: Path = typer.Argument(...,
"""
divider = get_divider(mode=mode, separator=separator)
with open(divided_name_text, "rb") as f:
divided_name_text = f.read().decode(encoding).strip().split("\n")
divided_names = f.read().decode(encoding).strip().split("\n")
is_correct_list = []
wrong_list = []
for _divided_name in tqdm(divided_name_text):
for _divided_name in tqdm(divided_names):
_undivided_name = _divided_name.replace(separator, "")
_divided_name_pred = str(divider.divide_name(_undivided_name))
is_correct = (_divided_name == _divided_name_pred)
is_correct = _divided_name == _divided_name_pred
is_correct_list.append(is_correct)
if not is_correct:
wrong_list.append(f"True: {_divided_name}, Pred: {_divided_name_pred}")
Expand All @@ -124,5 +128,5 @@ def accuracy(divided_name_text: Path = typer.Argument(...,
print("\n".join(wrong_list))


if __name__ == '__main__':
if __name__ == "__main__":
app()
18 changes: 11 additions & 7 deletions namedivider/divided_name.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import warnings
from typing import Dict
from dataclasses import dataclass, asdict
from dataclasses import asdict, dataclass
from typing import Any, Dict


@dataclass(frozen=True)
Expand All @@ -13,13 +13,17 @@ class DividedName:
:param score: Confidence level, from 0 to 1
:param algorithm: The name of dividing algorithm
"""
warnings.warn("namedivider.divided_name.DividedName is deprecated in 0.2 and will be removed in 0.4. "
"Use namedivider.divider.divided_name.DividedName if you want to use DividedName class.",
category=FutureWarning)

warnings.warn(
"namedivider.divided_name.DividedName is deprecated in 0.2 and will be removed in 0.4. "
"Use namedivider.divider.divided_name.DividedName if you want to use DividedName class.",
category=FutureWarning,
stacklevel=1,
)
family: str
given: str
separator: str = " "
score: float = 1.
score: float = 1.0
algorithm: str = ""

def __str__(self) -> str:
Expand All @@ -29,7 +33,7 @@ def __str__(self) -> str:
"""
return f"{self.family}{self.separator}{self.given}"

def to_dict(self) -> Dict:
def to_dict(self) -> Dict[str, Any]:
"""
:return: Dictionary of divided name
:rtype: Dict
Expand Down
8 changes: 5 additions & 3 deletions namedivider/divider/basic_name_divider.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Optional

from namedivider.divider.config import BasicNameDividerConfig
from namedivider.divider.name_divider_base import _NameDivider
from namedivider.feature.kanji import KanjiStatisticsRepository
from namedivider.feature.extractor import SimpleFeatureExtractor
from namedivider.feature.kanji import KanjiStatisticsRepository


class BasicNameDivider(_NameDivider):
Expand All @@ -10,7 +12,7 @@ class BasicNameDivider(_NameDivider):
Prior to v0.1.0, this was provided as a 'NameDivider' class.
"""

def __init__(self, config: BasicNameDividerConfig = None):
def __init__(self, config: Optional[BasicNameDividerConfig] = None):
if config is None:
config = BasicNameDividerConfig()
super().__init__(config=config)
Expand All @@ -32,4 +34,4 @@ def calc_score(self, family: str, given: str) -> float:
return order_score
length_score = (features.family_length_score + features.given_length_score) / len(name)

return (order_score + length_score) / 2.
return (order_score + length_score) / 2.0
29 changes: 14 additions & 15 deletions namedivider/divider/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from typing import Union
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from dataclasses import dataclass
from namedivider.util import \
get_kanji_csv_default_path, \
get_family_name_pkl_default_path, \
get_gbdt_model_v1_default_path
from typing import Union

from namedivider.util import (
get_family_name_pkl_default_path,
get_gbdt_model_v1_default_path,
get_kanji_csv_default_path,
)

KANJI_CSV_DEFAULT_PATH = get_kanji_csv_default_path()
FAMILY_NAME_PKL_DEFAULT_PATH = get_family_name_pkl_default_path()
Expand All @@ -29,6 +31,7 @@ class NameDividerConfigBase:
into orthographic character form(正字体) before processing them.
algorithm_name: Name of algorithm.
"""

separator: str = " "
normalize_name: bool = True
algorithm_name: str = "unknown_algorithm"
Expand All @@ -40,6 +43,7 @@ class BasicNameDividerConfig(NameDividerConfigBase):
path_csv: Path of the file containing the kanji information.
only_order_score_when_4: If True, only order score is used for 4-character names. Not recommended to be True.
"""

path_csv: Union[str, Path] = KANJI_CSV_DEFAULT_PATH
only_order_score_when_4: bool = False
algorithm_name: str = "kanji_feature"
Expand All @@ -56,6 +60,7 @@ class GBDTNameDividerConfig(NameDividerConfigBase):
Path of a file with multiple family names enumerated.
path_model: Path of a GBDT model.
"""

path_csv: Union[str, Path] = KANJI_CSV_DEFAULT_PATH
path_family_names: Union[str, Path] = FAMILY_NAME_PKL_DEFAULT_PATH
path_model: Union[str, Path] = GBDT_MODEL_V1_DEFAULT_PATH
Expand All @@ -65,17 +70,11 @@ class GBDTNameDividerConfig(NameDividerConfigBase):
def get_config_from_version(version: NameDividerVersions) -> NameDividerConfigBase:
if version == NameDividerVersions.BASIC_NAME_DIVIDER_V1:
return BasicNameDividerConfig(
separator=" ",
normalize_name=False,
path_csv=KANJI_CSV_DEFAULT_PATH,
only_order_score_when_4=True
separator=" ", normalize_name=False, path_csv=KANJI_CSV_DEFAULT_PATH, only_order_score_when_4=True
)
elif version == NameDividerVersions.BASIC_NAME_DIVIDER_V2:
return BasicNameDividerConfig(
separator=" ",
normalize_name=True,
path_csv=KANJI_CSV_DEFAULT_PATH,
only_order_score_when_4=False
separator=" ", normalize_name=True, path_csv=KANJI_CSV_DEFAULT_PATH, only_order_score_when_4=False
)
elif version == NameDividerVersions.BASIC_NAME_DIVIDER_LATEST:
return BasicNameDividerConfig()
Expand All @@ -85,7 +84,7 @@ def get_config_from_version(version: NameDividerVersions) -> NameDividerConfigBa
normalize_name=True,
path_csv=KANJI_CSV_DEFAULT_PATH,
path_family_names=FAMILY_NAME_PKL_DEFAULT_PATH,
path_model=GBDT_MODEL_V1_DEFAULT_PATH
path_model=GBDT_MODEL_V1_DEFAULT_PATH,
)
elif version == NameDividerVersions.GBDT_NAME_DIVIDER_LATEST:
return GBDTNameDividerConfig()
Expand Down
9 changes: 5 additions & 4 deletions namedivider/divider/divided_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict
from dataclasses import dataclass, asdict
from dataclasses import asdict, dataclass
from typing import Any, Dict


@dataclass(frozen=True)
Expand All @@ -12,10 +12,11 @@ class DividedName:
:param score: Confidence level, from 0 to 1
:param algorithm: The name of dividing algorithm
"""

family: str
given: str
separator: str = " "
score: float = 1.
score: float = 1.0
algorithm: str = ""

def __str__(self) -> str:
Expand All @@ -25,7 +26,7 @@ def __str__(self) -> str:
"""
return f"{self.family}{self.separator}{self.given}"

def to_dict(self) -> Dict:
def to_dict(self) -> Dict[str, Any]:
"""
:return: Dictionary of divided name
:rtype: Dict
Expand Down
Loading

0 comments on commit cdcc6ed

Please sign in to comment.