-
Notifications
You must be signed in to change notification settings - Fork 206
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
8,200 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,306 @@ | ||
""" | ||
Clean and validate a DataFrame column containing language data.
""" | ||
|
||
# pylint: disable=too-many-arguments, global-statement | ||
|
||
from os import path | ||
from typing import Any, Union, Tuple, Optional | ||
|
||
import dask | ||
import dask.dataframe as dd | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from ..progress_bar import ProgressBar | ||
from .utils import NULL_VALUES, to_dask | ||
|
||
# Path of the bundled language table, resolved relative to this module's directory.
DEFAULT_LANGUAGE_DATA_FILE = path.join(path.dirname(path.abspath(__file__)), "language_data.csv")

# Knowledge base currently in use; every column is read as a string.
# _load_kb() rebinds this global each time a public function is called.
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)
|
||
|
||
def clean_language(
    df: Union[pd.DataFrame, dd.DataFrame],
    col: str,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    output_format: str = "name",
    kb_path: str = "default",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean language type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    col
        The name of the column containing data of language type.
    input_format
        The ISO 639 input format of the language.
            - 'auto': infer the input format
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    output_format
        The desired ISO 639 format of the language.
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        (default: 'name')
    kb_path
        The path of a user-specified knowledge base (a CSV file with
        "name", "alpha-2" and "alpha-3" columns), or 'default' to use
        the bundled one.

        (default: 'default')
    inplace
        If True, delete the column containing the data that was cleaned.
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input.
            - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    The computed DataFrame with an added ``{col}_clean`` column (and without
    ``col`` when ``inplace`` is True).

    Raises
    ------
    ValueError
        If ``output_format``, ``input_format`` or ``errors`` is not one of
        the documented values, or if ``errors='raise'`` and a value cannot
        be parsed.
    KeyError
        If the knowledge base lacks one of the required columns.

    Examples
    --------
    Clean a column of language data.

    >>> df = pd.DataFrame({'language': ['eng', 'zh', 'Japanese']})
    >>> clean_language(df, 'language')
       language language_clean
    0       eng        English
    1        zh        Chinese
    2  Japanese       Japanese
    """
    # Validate every argument before touching the knowledge base so that a
    # bad call fails fast, without file I/O and without replacing DATA.
    valid_output_formats = {"name", "alpha-2", "alpha-3"}
    if output_format not in valid_output_formats:
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "name", '
            '"alpha-2" or "alpha-3"'
        )
    # Previously an unknown `errors` value silently behaved like "coerce".
    if errors not in {"coerce", "ignore", "raise"}:
        raise ValueError(
            f'errors {errors} is invalid, it needs to be "coerce", "ignore" or "raise"'
        )
    input_formats = _convert_format_to_tuple(input_format)

    # load knowledge base
    _load_kb(kb_path)

    # convert to dask
    if isinstance(df, pd.DataFrame):
        df = to_dask(df)

    # Each partition is mapped to a plain list of cleaned values.
    df[f"{col}_clean"] = df[col].map_partitions(
        lambda srs: [_format_language(x, input_formats, output_format, errors) for x in srs],
        meta=object,
    )

    with ProgressBar(minimum=0, disable=not progress):
        df = dask.compute(df)[0]

    if inplace:
        df = df.drop(columns=col)

    return df
|
||
|
||
def validate_language(
    x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    col: Optional[str] = None,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    kb_path: str = "default",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate language type data. For each cell, return True or False.

    Parameters
    ----------
    x
        Language data to be validated. It could be a single string, or
        a pandas or Dask DataFrame (with the parameter `col` to specify
        the column containing language data), or a pandas or Dask Series.
    col
        The name of the column to be validated.
        If x is not a pandas or Dask DataFrame, it is ignored.
        If x is a pandas or Dask DataFrame but `col` is not specified,
        then every column of the dataframe is validated.

        (default: None)
    input_format
        The ISO 639 input format of the language.
            - 'auto': infer the input format
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to validate a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    kb_path
        The path of a user-specified knowledge base, or 'default' to use
        the bundled one.

        (default: "default")

    Returns
    -------
    A bool for a string input, a pandas Series for a Series or a single
    DataFrame column, and a pandas DataFrame when a whole DataFrame is
    validated.

    Raises
    ------
    TypeError
        If x is not a str, Series or DataFrame.
    """
    # load knowledge base
    _load_kb(kb_path)

    formats = _convert_format_to_tuple(input_format)

    def _validate_dask_series(srs: dd.Series, name: Any) -> pd.Series:
        """Check every value of a Dask Series and wrap the flags in a pandas Series."""
        flags = srs.map_partitions(
            lambda part: [_check_language(item, formats, False) for item in part],
            meta=bool,
        )
        # NOTE(review): indexing the computed result with [0][0] assumes all flags
        # come back as one list - confirm this holds for multi-partition input.
        return pd.Series(dask.compute(flags)[0][0], name=name)

    if isinstance(x, str):
        return _check_language(x, formats, False)

    if isinstance(x, pd.Series):
        return x.apply(_check_language, args=(formats, False))

    if isinstance(x, dd.Series):
        return _validate_dask_series(x, x.name)

    # pandas DataFrames go through the same Dask path as Dask DataFrames
    frame = to_dask(x) if isinstance(x, pd.DataFrame) else x
    if not isinstance(frame, dd.DataFrame):
        raise TypeError("must be str, Series or DataFrame")

    if col is not None:
        return _validate_dask_series(frame[col], col)

    # no column given: validate the whole dataframe, one column at a time
    report = pd.DataFrame()
    for column in frame.columns:
        report[column] = _validate_dask_series(frame[column], column)
    return report
|
||
|
||
def _format_language(
    val: str, input_formats: Tuple[str, ...], output_format: str, errors: str
) -> Any:
    """
    Convert one language value to ``output_format``.

    Null values become NaN. A value that cannot be matched, or whose
    knowledge-base row has no entry for ``output_format``, is handled
    according to ``errors``: 'raise' raises ValueError, 'ignore' returns the
    input unchanged, and anything else yields NaN.
    """
    row, status = _check_language(val, input_formats, True)

    if status == "null":
        return np.nan

    # Only look the row up when the value was actually matched.
    converted = None if status == "unknown" else DATA.loc[row, output_format]

    if converted is None or pd.isna(converted):
        # Either the value is unknown, or the language has no entry
        # in the requested output format.
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        if errors == "ignore":
            return val
        return np.nan

    if output_format == "name":
        return converted.title()
    return converted
|
||
|
||
def _check_language(val: str, input_formats: Tuple[str, ...], clean: bool) -> Any:
    """
    Find the index of the given language string in the DATA dataframe.

    Parameters
    ----------
    val
        Value containing the language to look up.
    input_formats
        Tuple of DATA column names to search, tried in the given order.
    clean
        If True, a tuple (index, status) is returned. There are 3 statuses:
            - "null": val is a null value (index is None).
            - "unknown": val could not be matched (index is None).
            - "success": val was matched; index is the first matching row.

        If False, the function returns True/False to be used by the
        validate function.
    """
    if val in NULL_VALUES:
        if clean:
            return None, "null"
        return False

    # The comparison is case-insensitive; surrounding whitespace of val is ignored.
    needle = str(val).lower().strip()

    for fmt in input_formats:
        hits = DATA.loc[DATA[fmt].str.lower() == needle].index
        if hits.empty:
            continue
        # first matching row wins
        return (hits[0], "success") if clean else True

    if clean:
        return None, "unknown"
    return False
|
||
|
||
def _load_kb(kb_path: str) -> None:
    """
    Load the knowledge base from ``kb_path`` into the module-level ``DATA``.

    Parameters
    ----------
    kb_path
        'default' to load the bundled language table, otherwise the path of a
        CSV file. All columns are read as strings.

    Raises
    ------
    KeyError
        If the loaded table lacks a "name", "alpha-2" or "alpha-3" column.
        In that case ``DATA`` is left untouched.
    """
    global DATA
    source = DEFAULT_LANGUAGE_DATA_FILE if kb_path == "default" else kb_path
    candidate = pd.read_csv(source, dtype=str)

    # Check the schema on a local frame first, so that a malformed knowledge
    # base never replaces a valid DATA.
    required_columns = ("name", "alpha-2", "alpha-3")
    if any(fmt not in candidate.columns for fmt in required_columns):
        raise KeyError(
            "knowledge base does not follow the format, "
            'it needs to contain "name", "alpha-2", and "alpha-3"'
        )

    DATA = candidate
|
||
|
||
def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
    """
    Normalise ``input_format`` into the formats to try.

    A single string is treated as a one-element tuple. If "auto" appears
    anywhere, all three concrete formats are returned. A ValueError is raised
    for any entry that is not "auto", "name", "alpha-2" or "alpha-3".
    """
    all_formats = ("name", "alpha-2", "alpha-3")
    requested = (input_format,) if isinstance(input_format, str) else input_format

    accepted = {"auto", *all_formats}
    for candidate in requested:
        if candidate not in accepted:
            raise ValueError(
                f'input_format {candidate} is invalid, it needs to be one of "auto", '
                '"name", "alpha-2" or "alpha-3"'
            )

    # "auto" expands to every supported format
    return all_formats if "auto" in requested else requested
Oops, something went wrong.