Commit

add clean_language function

NoirTree committed Jul 10, 2021
1 parent c735cd9 commit b5af897
Showing 2 changed files with 8,200 additions and 0 deletions.
306 changes: 306 additions & 0 deletions dataprep/clean/clean_language.py
@@ -0,0 +1,306 @@
"""
Clean and validate a DataFrame column containing language.
"""

# pylint: disable=too-many-arguments, global-statement

from os import path
from typing import Any, Union, Tuple, Optional

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask

DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv")

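# module-level knowledge base; _load_kb() replaces it when a custom kb_path is given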
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)


def clean_language(
df: Union[pd.DataFrame, dd.DataFrame],
col: str,
input_format: Union[str, Tuple[str, ...]] = "auto",
output_format: str = "name",
kb_path: str = "default",
inplace: bool = False,
errors: str = "coerce",
progress: bool = True,
) -> pd.DataFrame:
"""
Clean language type data in a DataFrame column.
Parameters
----------
df
A pandas or Dask DataFrame containing the data to be cleaned.
col
The name of the column containing data of language type.
input_format
The ISO 639 input format of the language.
- 'auto': infer the input format
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
Can also be a tuple containing any combination of input formats;
for example, to clean a column containing both names and alpha-2
codes, set input_format to ('name', 'alpha-2').
(default: 'auto')
output_format
The desired ISO 639 format of the language.
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
(default: 'name')
kb_path
The path of the user-specified knowledge base.
At this stage, it should be a file in the user's local directory
that follows the format we propose.
(default: 'default')
inplace
If True, delete the column containing the data that was cleaned.
Otherwise, keep the original column.
(default: False)
errors
How to handle parsing errors.
- 'coerce': invalid parsing will be set to NaN.
- 'ignore': invalid parsing will return the input.
- 'raise': invalid parsing will raise an exception.
(default: 'coerce')
progress
If True, display a progress bar.
(default: True)
Examples
--------
Clean a column of language data.
>>> df = pd.DataFrame({'language': ['eng', 'zh', 'Japanese']})
>>> clean_language(df, 'language')
language language_clean
0 eng English
1 zh Chinese
2 Japanese Japanese
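Convert the same column to alpha-2 codes (an illustrative sketch; the codes
shown assume the default knowledge base).
>>> clean_language(df, 'language', output_format='alpha-2')
language language_clean
0 eng en
1 zh zh
2 Japanese ja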
"""
# load knowledge base
_load_kb(kb_path)

valid_output_formats = {"name", "alpha-2", "alpha-3"}
if output_format not in valid_output_formats:
raise ValueError(
f'output_format {output_format} is invalid, it needs to be "name", '
'"alpha-2" or "alpha-3"'
)
input_formats = _convert_format_to_tuple(input_format)

# convert to dask
if isinstance(df, pd.DataFrame):
df = to_dask(df)

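# apply the cleaning function to each partition; every value is reformatted independently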
df[f"{col}_clean"] = df[col].map_partitions(
lambda srs: [_format_language(x, input_formats, output_format, errors) for x in srs],
meta=object,
)

with ProgressBar(minimum=0, disable=not progress):
df = dask.compute(df)[0]

if inplace:
df = df.drop(columns=col)

return df


def validate_language(
x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
col: Optional[str] = None,
input_format: Union[str, Tuple[str, ...]] = "auto",
kb_path: str = "default",
) -> Union[bool, pd.Series, pd.DataFrame]:
"""
Validate language type data in a DataFrame column. For each cell, return True or False.
Parameters
----------
x
Language data to be validated. It could be a single string, or
a pandas or Dask DataFrame (with the parameter `col` to specify
the column containing language data), or a pandas or Dask Series.
col
The name of the column to be validated.
If x is not a pandas or Dask DataFrame, this parameter is ignored.
If x is a pandas or Dask DataFrame but `col` is not specified,
then the whole dataframe will be validated.
(default: None)
input_format
The ISO 639 input format of the language.
- 'auto': infer the input format
- 'name': language name ('English')
- 'alpha-2': alpha-2 code ('en')
- 'alpha-3': alpha-3 code ('eng')
Can also be a tuple containing any combination of input formats;
for example, to validate a column containing both names and alpha-2
codes, set input_format to ('name', 'alpha-2').
(default: 'auto')
kb_path
The path of the user-specified knowledge base.
At this stage, it should be a file in the user's local directory
that follows the format we propose.
(default: "default")
"""
# load knowledge base
_load_kb(kb_path)

input_formats = _convert_format_to_tuple(input_format)

if isinstance(x, str):
return _check_language(x, input_formats, False)

if isinstance(x, pd.Series):
return x.apply(_check_language, args=(input_formats, False))

if isinstance(x, dd.Series):
res = x.map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs], meta=bool
)
return pd.Series(dask.compute(res)[0][0], name=x.name) # extract twice to get a list

if isinstance(x, pd.DataFrame):
x = to_dask(x)

if isinstance(x, dd.DataFrame):
if col is not None:
res = x[col].map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs], meta=bool
)
return pd.Series(dask.compute(res)[0][0], name=col)
else:
# validate the whole dataframe and return a pd.DataFrame
res_df = pd.DataFrame()
for col_t in x.columns:
res = x[col_t].map_partitions(
lambda srs: [_check_language(val, input_formats, False) for val in srs],
meta=bool,
)
res_df[col_t] = pd.Series(dask.compute(res)[0][0], name=col_t)
return res_df
else:
raise TypeError("must be str, Series or DataFrame")


def _format_language(
val: str, input_formats: Tuple[str, ...], output_format: str, errors: str
) -> Any:
"""
Reformat a language string with proper output format.
"""
result_index, status = _check_language(val, input_formats, True)

if status == "null":
return np.nan
if status == "unknown":
if errors == "raise":
raise ValueError(f"unable to parse value {val}")
return val if errors == "ignore" else np.nan

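# look up the desired output representation of the matched language in the knowledge base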
result = DATA.loc[result_index, output_format]
if pd.isna(result):
# the language doesn't have the required output format
if errors == "raise":
raise ValueError(f"unable to parse value {val}")
return val if errors == "ignore" else np.nan

return result.title() if output_format == "name" else result


def _check_language(val: str, input_formats: Tuple[str, ...], clean: bool) -> Any:
"""
Find the index of the given language string in the DATA dataframe.
Parameters
----------
val
String containing the language value to be cleaned.
input_formats
Tuple containing potential ISO 639 input formats of the language.
clean
If True, a tuple (index, status) is returned. There are 3 possible statuses:
- "null": val is a null value.
- "unknown": val could not be parsed.
- "success": a successful parse of the value.
If False, the function returns True/False to be used by the validate function.
"""
if val in NULL_VALUES:
return (None, "null") if clean else False

val = str(val).lower().strip()

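# try each allowed input format until the value matches a row in the knowledge base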
for fmt in input_formats:
try:
ind = DATA.loc[DATA[fmt].str.lower() == val].index[0]
except IndexError:
continue
else:
return (ind, "success") if clean else True

return (None, "unknown") if clean else False


def _load_kb(
kb_path: str,
# encode: Optional[str] = None
) -> Any:
"""
Load knowledge base from a specified path.
"""
global DATA
if kb_path == "default":
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)
else:
DATA = pd.read_csv(kb_path, dtype=str)
# check whether the format of the knowledge base is valid
valid_formats = {"name", "alpha-2", "alpha-3"}
for fmt in valid_formats:
if fmt not in DATA.columns:
raise KeyError(
"knowledge base does not follow the format, "
'it needs to contain "name", "alpha-2", and "alpha-3"'
)


def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
"""
Converts a string input format to a tuple of allowed input formats and
raises an error if an input format is not valid.
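For example, 'alpha-2' becomes ('alpha-2',) and 'auto' expands to
('name', 'alpha-2', 'alpha-3').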
"""
if isinstance(input_format, str):
if input_format == "auto":
return ("name", "alpha-2", "alpha-3")
else:
input_format = (input_format,)

valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"}
for fmt in input_format:
if fmt not in valid_input_formats:
raise ValueError(
f'input_format {fmt} is invalid, it needs to be one of "auto", '
'"name", "alpha-2" or "alpha-3"'
)
if "auto" in input_format:
return ("name", "alpha-2", "alpha-3")

return input_format