Commit

add clean_language function

NoirTree committed Jul 10, 2021
1 parent c735cd9 commit b5af897
Showing 2 changed files with 8,200 additions and 0 deletions.
306 changes: 306 additions & 0 deletions dataprep/clean/clean_language.py
@@ -0,0 +1,306 @@
"""
Clean and validate a DataFrame column containing language.
"""

# pylint: disable=too-many-arguments, global-statement

from os import path
from typing import Any, Union, Tuple, Optional

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask

DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv")

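# module-level knowledge base; _load_kb() replaces it when a custom kb_path is given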
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)


def clean_language(
df: Union[pd.DataFrame, dd.DataFrame],
col: str,
input_format: Union[str, Tuple[str, ...]] = "auto",
output_format: str = "name",
kb_path: str = "default",
inplace: bool = False,
errors: str = "coerce",
progress: bool = True,
) -> pd.DataFrame:
"""
Clean language type data in a DataFrame column.
Parameters
----------
df
A pandas or Dask DataFrame containing the data to be cleaned.
col
The name of the column containing data of language type.
input_format
The ISO 639 input format of the language.
- 'auto': infer the input format
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
Can also be a tuple containing any combination of input formats;
for example, to clean a column containing both names and alpha-2
codes, set input_format to ('name', 'alpha-2').
(default: 'auto')
output_format
The desired ISO 639 format of the language.
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
(default: 'name')
kb_path
The path of the user-specified knowledge base.
At this stage, it should be a file in the user's local directory
that follows the format we propose.
(default: 'default')
inplace
If True, delete the column containing the data that was cleaned.
Otherwise, keep the original column.
(default: False)
errors
How to handle parsing errors.
- 'coerce': invalid parsing will be set to NaN.
- 'ignore': invalid parsing will return the input.
- 'raise': invalid parsing will raise an exception.
(default: 'coerce')
progress
If True, display a progress bar.
(default: True)
Examples
--------
Clean a column of language data.
>>> df = pd.DataFrame({'language': ['eng', 'zh', 'Japanese']})
>>> clean_language(df, 'language')
language language_clean
0 eng English
1 zh Chinese
2 Japanese Japanese
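Convert the same column to alpha-2 codes (an illustrative sketch; the codes
shown assume the default knowledge base).
>>> clean_language(df, 'language', output_format='alpha-2')
language language_clean
0 eng en
1 zh zh
2 Japanese ja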
"""
# load knowledge base
_load_kb(kb_path)

valid_output_formats = {"name", "alpha-2", "alpha-3"}
if output_format not in valid_output_formats:
raise ValueError(
f'output_format {output_format} is invalid, it needs to be "name", '
'"alpha-2" or "alpha-3"'
)
input_formats = _convert_format_to_tuple(input_format)

# convert to dask
if isinstance(df, pd.DataFrame):
df = to_dask(df)

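# apply the cleaning function to each partition; every value is reformatted independently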
df[f"{col}_clean"] = df[col].map_partitions(
lambda srs: [_format_language(x, input_formats, output_format, errors) for x in srs],
meta=object,
)

with ProgressBar(minimum=0, disable=not progress):
df = dask.compute(df)[0]

if inplace:
df = df.drop(columns=col)

return df


def validate_language(
x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
col: Optional[str] = None,
input_format: Union[str, Tuple[str, ...]] = "auto",
kb_path: str = "default",
) -> Union[bool, pd.Series, pd.DataFrame]:
"""
Validate language type data in a DataFrame column. For each cell, return True or False.
Parameters
----------
x
Language data to be validated. It could be a single string, or
a pandas or Dask DataFrame (with the parameter `col` to specify
the column containing language data), or a pandas or Dask Series.
col
The name of the column to be validated.
If x is not a pandas or Dask DataFrame, this parameter is ignored.
If x is a pandas or Dask DataFrame but `col` is not specified,
then the whole dataframe will be validated.
(default: None)
input_format
The ISO 639 input format of the language.
- 'auto': infer the input format
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
Can also be a tuple containing any combination of input formats;
for example, to validate a column containing both names and alpha-2
codes, set input_format to ('name', 'alpha-2').
(default: 'auto')
kb_path
The path of the user-specified knowledge base.
At this stage, it should be a file in the user's local directory
that follows the format we propose.
(default: "default")
"""
# load knowledge base
_load_kb(kb_path)

input_formats = _convert_format_to_tuple(input_format)

if isinstance(x, str):
return _check_language(x, input_formats, False)

if isinstance(x, pd.Series):
return x.apply(_check_language, args=(input_formats, False))

if isinstance(x, dd.Series):
res = x.map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs], meta=bool
)
return pd.Series(dask.compute(res)[0][0], name=x.name) # extract twice to get a list

if isinstance(x, pd.DataFrame):
x = to_dask(x)

if isinstance(x, dd.DataFrame):
if col is not None:
res = x[col].map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs], meta=bool
)
return pd.Series(dask.compute(res)[0][0], name=col)
else:
# validate the whole dataframe and return a pd.DataFrame
res_df = pd.DataFrame()
for col_t in x.columns:
res = x[col_t].map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs],
meta=bool,
)
res_df[col_t] = pd.Series(dask.compute(res)[0][0], name=col_t)
return res_df
else:
raise TypeError("must be str, Series or DataFrame")


def _format_language(
val: str, input_formats: Tuple[str, ...], output_format: str, errors: str
) -> Any:
"""
Reformat a language string with proper output format.
"""
result_index, status = _check_language(val, input_formats, True)

if status == "null":
return np.nan
if status == "unknown":
if errors == "raise":
raise ValueError(f"unable to parse value {val}")
return val if errors == "ignore" else np.nan

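# look up the desired output representation of the matched language in the knowledge base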
result = DATA.loc[result_index, output_format]
if pd.isna(result):
# the language doesn't have the required output format
if errors == "raise":
raise ValueError(f"unable to parse value {val}")
return val if errors == "ignore" else np.nan

return result.title() if output_format == "name" else result


def _check_language(val: str, input_formats: Tuple[str, ...], clean: bool) -> Any:
"""
Find the index of the given language string in the DATA dataframe.
Parameters
----------
val
String containing the language value to be cleaned.
input_formats
Tuple containing potential ISO 639 input formats of the language.
clean
If True, a tuple (index, status) is returned. There are 3 possible statuses:
- "null": val is a null value.
- "unknown": val could not be parsed.
- "success": a successful parse of the value.
If False, the function returns True/False to be used by the validate function.
"""
if val in NULL_VALUES:
return (None, "null") if clean else False

val = str(val).lower().strip()

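# try each allowed input format until the value matches a row in the knowledge base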
for fmt in input_formats:
try:
ind = DATA.loc[DATA[fmt].str.lower() == val].index[0]
except IndexError:
continue
else:
return (ind, "success") if clean else True

return (None, "unknown") if clean else False


def _load_kb(
kb_path: str,
# encode: Optional[str] = None
) -> Any:
"""
Load knowledge base from a specified path.
"""
global DATA
if kb_path == "default":
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)
else:
DATA = pd.read_csv(kb_path, dtype=str)
# check whether the format of the knowledge base is valid
valid_formats = {"name", "alpha-2", "alpha-3"}
for fmt in valid_formats:
if fmt not in DATA.columns:
raise KeyError(
"knowledge base does not follow the format, "
'it needs to contain "name", "alpha-2", and "alpha-3"'
)


def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
"""
Converts a string input format to a tuple of allowed input formats and
raises an error if an input format is not valid.
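For example, 'alpha-2' becomes ('alpha-2',) and 'auto' expands to
('name', 'alpha-2', 'alpha-3').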
"""
if isinstance(input_format, str):
if input_format == "auto":
return ("name", "alpha-2", "alpha-3")
else:
input_format = (input_format,)

valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"}
for fmt in input_format:
if fmt not in valid_input_formats:
raise ValueError(
f'input_format {fmt} is invalid, it needs to be one of "auto", '
'"name", "alpha-2" or "alpha-3"'
)
if "auto" in input_format:
return ("name", "alpha-2", "alpha-3")

return input_format