From 23e7f359c6f23ae0aac7c5ab877cfa50d9e9ccd5 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Sun, 10 Nov 2019 21:51:15 -0600 Subject: [PATCH] Now can opt to calculate advanced stats --- optimus/dataframe/extension.py | 16 +- optimus/helpers/constants.py | 2 + optimus/profiler/profiler.py | 485 ++++++++++++++++----------------- 3 files changed, 245 insertions(+), 258 deletions(-) diff --git a/optimus/dataframe/extension.py b/optimus/dataframe/extension.py index 5991c046..ba0ca663 100644 --- a/optimus/dataframe/extension.py +++ b/optimus/dataframe/extension.py @@ -462,7 +462,7 @@ def reset(self): @add_method(DataFrame) -def send(self, name=None, infer=True, mismatch=None, stats=True): +def send(self, name=None, infer=True, mismatch=None, stats=True, advanced_stats=True): """ Profile and send the data to the queue :param self: @@ -476,12 +476,14 @@ def send(self, name=None, infer=True, mismatch=None, stats=True): if name is not None: df.set_name(name) - columns, output = Profiler.instance.dataset(df, columns="*", buckets=35, infer=infer, relative_error=RELATIVE_ERROR, - approx_count=True, - sample=10000, - stats=stats, - format="json", - mismatch=mismatch) + output = Profiler.instance.dataset(df, columns="*", buckets=35, infer=infer, relative_error=RELATIVE_ERROR, + approx_count=True, + sample=10000, + stats=stats, + format="json", + mismatch=mismatch, + advanced_stats=advanced_stats + ) if Comm.instance: Comm.instance.send(output) diff --git a/optimus/helpers/constants.py b/optimus/helpers/constants.py index f7f41ecd..92d147e1 100644 --- a/optimus/helpers/constants.py +++ b/optimus/helpers/constants.py @@ -75,6 +75,8 @@ # Profiler PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"} +PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric", + "date": "date", "array": "array", "binary": "binary", "null": "null"} SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", 
"int"], "decimal": ["float", "double"], "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary", diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 57ed3709..416e7fa6 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -13,7 +13,7 @@ from optimus.helpers.check import is_column_a, is_dict, is_list_of_str from optimus.helpers.columns import parse_columns from optimus.helpers.columns_expression import zeros_agg, count_na_agg, hist_agg, percentile_agg, count_uniques_agg -from optimus.helpers.constants import RELATIVE_ERROR, Actions, PYSPARK_NUMERIC_TYPES +from optimus.helpers.constants import RELATIVE_ERROR, Actions, PYSPARK_NUMERIC_TYPES, PYTHON_TO_PROFILER from optimus.helpers.decorators import time_it from optimus.helpers.functions import absolute_path from optimus.helpers.json import json_converter @@ -54,61 +54,9 @@ def __init__(self, output_path=None): self.output_columns = {} - def _count_data_types(self, df, columns, infer=False, mismatch=None): - """ - Count the number of int, float, string, date and booleans and output the count in json format - :param df: Dataframe to be processed - :param columns: Columns to be processed - :param infer: infer the column datatype - - :return: json - """ - - columns = parse_columns(df, columns) - - count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch) - count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type) - # Info from all the columns - type_details = {} - - for col_name in columns: - - """ - Function for determine if register value is float or int or string. 
- :param col_name: - :return: - """ - # Not count mismatch - if "mismatch" in count_by_data_type_no_mismatch[col_name]: - count_by_data_type_no_mismatch[col_name].pop("mismatch") - - # Get the greatest count by column data type - greatest_data_type_count = max(count_by_data_type_no_mismatch[col_name], - key=count_by_data_type_no_mismatch[col_name].get) - if greatest_data_type_count == "string" or greatest_data_type_count == "boolean": - cat = "categorical" - elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal": - cat = "numeric" - elif greatest_data_type_count == "date": - cat = "date" - elif greatest_data_type_count == "array": - cat = "array" - elif greatest_data_type_count == "binary": - cat = "binary" - elif greatest_data_type_count == "null": - cat = "null" - else: - cat = None - - assign(type_details, col_name + ".dtype", greatest_data_type_count, dict) - assign(type_details, col_name + ".type", cat, dict) - assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict) - # print(type_details) - return type_details - @time_it def run(self, df, columns="*", buckets=MAX_BUCKETS, infer=False, relative_error=RELATIVE_ERROR, approx_count=True, - mismatch=None): + mismatch=None, advanced_stats=True): """ Return dataframe statistical information in HTML Format :param df: Dataframe to be analyzed @@ -118,12 +66,13 @@ def run(self, df, columns="*", buckets=MAX_BUCKETS, infer=False, relative_error= :param relative_error: Relative Error for quantile discretizer calculation :param approx_count: Use approx_count_distinct or countDistinct :param mismatch: + :param advanced_stats: :return: """ columns = parse_columns(df, columns) - columns, output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict", - mismatch=mismatch) + output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict", + mismatch=mismatch, advanced_stats=advanced_stats) # Load jinja template_loader = 
jinja2.FileSystemLoader(searchpath=absolute_path("/profiler/templates/out")) @@ -218,13 +167,138 @@ def to_file(self, path=None, output="html"): def to_json(self, df, columns="*", buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True, sample=10000, stats=True, mismatch=None): - columns, output = self.dataset(df, columns=columns, buckets=buckets, infer=infer, relative_error=relative_error, - approx_count=approx_count, - sample=sample, stats=stats, format="json", mismatch=mismatch) - return output + return self.dataset(df, columns=columns, buckets=buckets, infer=infer, relative_error=relative_error, + approx_count=approx_count, + sample=sample, stats=stats, format="json", mismatch=mismatch) + + def cols_needs_profiling(self, df, columns): + """ + Calculate the columns that needs to be profiled. + :return: + """ + # Metadata + # If not empty the profiler already run. + # So process the dataframe's metadata to be sure which columns need to be profiled + + actions = df.get_meta("transformations.actions") + are_actions = actions is not None and len(actions) > 0 + + # Process actions to check if any column must be processed + if self.is_cached(): + if are_actions: + + drop = ["drop"] + + def match_actions_names(_actions): + """ + Get a list of columns which have been applied and specific action. + :param _actions: + :return: + """ + + _actions_json = df.get_meta("transformations.actions") + + modified = [] + for action in _actions: + if _actions_json.get(action): + # Check if was renamed + col = _actions_json.get(action) + if len(match_renames(col)) == 0: + _result = col + else: + _result = match_renames(col) + modified = modified + _result + + return modified + + def match_renames(_col_names): + """ + Get a list fo columns and return the renamed version. 
+ :param _col_names: + :return: + """ + _renamed_columns = [] + _actions = df.get_meta("transformations.actions") + _rename = _actions.get("rename") + + def get_name(_col_name): + c = _rename.get(_col_name) + # The column has not been renamed. Get the actual column name + if c is None: + c = _col_name + return c + + if _rename: + # if a list + if is_list_of_str(_col_names): + for _col_name in _col_names: + # The column name has been changed. Get the new name + _renamed_columns.append(get_name(_col_name)) + # if a dict + if is_dict(_col_names): + for _col1, _col2 in _col_names.items(): + _renamed_columns.append({get_name(_col1): get_name(_col2)}) + + else: + _renamed_columns = _col_names + return _renamed_columns + + # New columns + new_columns = [] + + current_col_names = df.cols.names() + renamed_cols = match_renames(df.get_meta("transformations.columns")) + for current_col_name in current_col_names: + if current_col_name not in renamed_cols: + new_columns.append(current_col_name) + + # Rename keys to match new names + profiler_columns = self.output_columns["columns"] + actions = df.get_meta("transformations.actions") + rename = actions.get("rename") + if rename: + for k, v in actions["rename"].items(): + profiler_columns[v] = profiler_columns.pop(k) + profiler_columns[v]["name"] = v + + # Drop Keys + for col_names in match_actions_names(drop): + profiler_columns.pop(col_names) + + # Copy Keys + copy_columns = df.get_meta("transformations.actions.copy") + if copy_columns is not None: + for source, target in copy_columns.items(): + profiler_columns[target] = profiler_columns[source].copy() + profiler_columns[target]["name"] = target + # Check if a new column is a copied column + new_columns = list(set(new_columns) - set(copy_columns.values())) + + # Actions applied to current columns + + modified_columns = match_actions_names(Actions.list()) + calculate_columns = modified_columns + new_columns + + # Remove duplicated. 
+ calculate_columns = list(set(calculate_columns)) + + elif not are_actions: + calculate_columns = None + # elif not is_cached: + else: + calculate_columns = columns + + return calculate_columns + + def is_cached(self): + """ + + :return: + """ + return len(self.output_columns) > 0 def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True, - sample=10000, stats=True, format="dict", mismatch=None): + sample=10000, stats=True, format="dict", mismatch=None, advanced_stats=False): """ Return the profiling data in json format :param df: Dataframe to be processed @@ -237,136 +311,24 @@ def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELAT :param stats: calculate stats, if not only data table returned :param format: dict or json :param mismatch: - :return: json file + :return: dict or json """ - output_columns = self.output_columns + cols_to_profile = self.cols_needs_profiling(df, columns) - # Metadata - # If not empty the profiler already run. - # So process the dataframe's metadata to be sure which columns need to be profiled - is_cached = len(self.output_columns) > 0 - actions = df.get_meta("transformations.actions") - are_actions = actions is not None and len(actions) > 0 - - # Process actions to check if any column must be processed - if is_cached and are_actions: - - drop = ["drop"] - - def match_actions_names(_actions): - """ - Get a list of columns which have been applied and specific action. - :param _actions: - :return: - """ - - _actions_json = df.get_meta("transformations.actions") - - modified = [] - for action in _actions: - if _actions_json.get(action): - # Check if was renamed - col = _actions_json.get(action) - if len(match_renames(col)) == 0: - _result = col - else: - _result = match_renames(col) - modified = modified + _result - - return modified - - def match_renames(_col_names): - """ - Get a list fo columns and return the renamed version. 
- :param _col_names: - :return: - """ - _renamed_columns = [] - _actions = df.get_meta("transformations.actions") - _rename = _actions.get("rename") - - def get_name(_col_name): - c = _rename.get(_col_name) - # The column has not been rename. Get the actual column name - if c is None: - c = _col_name - return c - - if _rename: - # if a list - if is_list_of_str(_col_names): - for _col_name in _col_names: - # The column name has been changed. Get the new name - _renamed_columns.append(get_name(_col_name)) - # if a dict - if is_dict(_col_names): - for _col1, _col2 in _col_names.items(): - _renamed_columns.append({get_name(_col1): get_name(_col2)}) - - else: - _renamed_columns = _col_names - return _renamed_columns - - # New columns - new_columns = [] - - current_col_names = df.cols.names() - renamed_cols = match_renames(df.get_meta("transformations.columns")) - for current_col_name in current_col_names: - if current_col_name not in renamed_cols: - new_columns.append(current_col_name) - - # Rename keys to match new names - profiler_columns = self.output_columns["columns"] - actions = df.get_meta("transformations.actions") - rename = actions.get("rename") - if rename: - for k, v in actions["rename"].items(): - profiler_columns[v] = profiler_columns.pop(k) - profiler_columns[v]["name"] = v - - # Drop Keys - for col_names in match_actions_names(drop): - profiler_columns.pop(col_names) - - # Copy Keys - copy_columns = df.get_meta("transformations.actions.copy") - if copy_columns is not None: - for source, target in copy_columns.items(): - profiler_columns[target] = profiler_columns[source].copy() - profiler_columns[target]["name"] = target - # Check is a new column is a copied column - new_columns = list(set(new_columns) - set(copy_columns.values())) - - # Actions applied to current columns - - modified_columns = match_actions_names(Actions.list()) - calculate_columns = modified_columns + new_columns - - # Remove duplicated. 
- calculate_columns = list(set(calculate_columns)) - - elif is_cached and not are_actions: - calculate_columns = None - # elif not is_cached: - else: - calculate_columns = columns - - # print ("calculate_columns",calculate_columns) # Get the stats for all the columns if stats is True: # Are there column to process? - if calculate_columns or not is_cached: + if cols_to_profile or not self.is_cached(): rows_count = df.count() self.rows_count = rows_count self.cols_count = cols_count = len(df.columns) - output_columns = self.columns_stats(df, calculate_columns, buckets, infer, relative_error, approx_count, - mismatch) + output_columns = self.columns_stats(df, cols_to_profile, buckets, infer, relative_error, approx_count, + mismatch, advanced_stats) # Update last profiling info # Merge old and current profiling - if is_cached: + if self.is_cached(): output_columns["columns"].update(self.output_columns["columns"]) assign(output_columns, "name", df.get_name(), dict) @@ -402,26 +364,27 @@ def get_name(_col_name): df = df.set_meta(value={}) df = df.columns_meta(df.cols.names()) - col_names = output_columns["columns"].keys() + # col_names = output_columns["columns"].keys() if format == "json": result = json.dumps(output_columns, ignore_nan=True, default=json_converter) else: result = output_columns self.output_columns = output_columns - df = df.set_meta("transformations.actions", {}) + df.set_meta("transformations.actions", {}) - return col_names, result + return result def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True, - mismatch=None): + mismatch=None, advanced_stats=True): """ Return statistical information about a specific column in json format :param df: Dataframe to be processed :param columns: Columns that you want to profile :param buckets: Create buckets divided by range. Each bin is equal. 
:param infer: try to infer the column dataType - :param relative_error: relative error when the percentile is calculated. 0 is more precision/slow 1 less precision/faster + :param relative_error: relative error when the percentile is calculated. + 0 more precision/slow 1 less precision/faster :param approx_count: Use the function approx_count_distinct or countDistinct. approx_count_distinct is faster :param mismatch: :return: json object @@ -434,7 +397,26 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL logger.print("Processing Stats For columns...") # Get columns data types. This is necessary to make the pertinent histogram calculations. - type_details = self._count_data_types(df, columns, infer, mismatch) + count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch) + + count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type) + + # Info from all the columns + type_details = {} + + for col_name in columns: + # Not count mismatch + if "mismatch" in count_by_data_type_no_mismatch[col_name]: + count_by_data_type_no_mismatch[col_name].pop("mismatch") + + # Get the greatest count by column data type + greatest_data_type_count = max(count_by_data_type_no_mismatch[col_name], + key=count_by_data_type_no_mismatch[col_name].get) + cat = PYTHON_TO_PROFILER.get(greatest_data_type_count) + + assign(type_details, col_name + ".dtype", greatest_data_type_count, dict) + assign(type_details, col_name + ".type", cat, dict) + assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict) # Count the categorical, numerical, boolean and date columns count_types = {} @@ -446,23 +428,16 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL count_types[name] = 1 # List the data types this data set have - total = 0 - dtypes = [] - for key, value in count_types.items(): - if value > 0: - dtypes.append(key) - total = total + 1 - - count_types = 
fill_missing_col_types(count_types) + dtypes = [key for key, value in count_types.items() if value > 0] columns_info = {} - columns_info["count_types"] = count_types - columns_info["total_count_dtypes"] = total + columns_info["count_types"] = fill_missing_col_types(count_types) + columns_info["total_count_dtypes"] = len(dtypes) columns_info["dtypes_list"] = dtypes columns_info["columns"] = type_details # Aggregation - stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count) + stats = self.columns_agg(df, columns, buckets, relative_error, approx_count, advanced_stats) # Calculate Frequency logger.print("Processing Frequency ...") @@ -471,7 +446,6 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL if df_freq is not None: freq = df_freq.cols.frequency("*", buckets, True, self.rows_count) - # Calculate percentage for col_name in columns: col_info = {} assign(col_info, "stats", stats[col_name], dict) @@ -480,7 +454,6 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL if col_name in freq: assign(col_info, "frequency", freq[col_name]) - col_info["stats"].update(self.extra_columns_stats(df, col_name, stats)) assign(col_info, "name", col_name) assign(col_info, "column_dtype", columns_info["columns"][col_name]['dtype']) assign(col_info, "dtypes_stats", columns_info["columns"][col_name]['stats']) @@ -491,46 +464,50 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL return columns_info - @staticmethod - def columns_agg(df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True): + def columns_agg(self, df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True, advanced_stats=True): columns = parse_columns(df, columns) n = BATCH_SIZE list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)] - # we have problems sending +100 columns at the same time. 
Process in batch + # we have problems sending +100 columns at the same time. Processing in batch result = {} + for i, cols in enumerate(list_columns): logger.print("Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols)) + # Count uniques is necessary for calculate the histogram buckets funcs = [count_uniques_agg] exprs = df.cols.create_exprs(cols, funcs, approx_count) - # TODO: in basic calculations funcs = [F.min, F.max] - funcs = [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, zeros_agg] + funcs = [F.min, F.max] exprs.extend(df.cols.create_exprs(cols, funcs)) - # TODO: None in basic calculation - funcs = [percentile_agg] - exprs.extend(df.cols.create_exprs(cols, funcs, df, [0.05, 0.25, 0.5, 0.75, 0.95], - relative_error)) - funcs = [count_na_agg] exprs.extend(df.cols.create_exprs(cols, funcs, df)) + + if advanced_stats is True: + funcs = [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, zeros_agg] + exprs.extend(df.cols.create_exprs(cols, funcs)) + + # TODO: None in basic calculation + funcs = [percentile_agg] + exprs.extend(df.cols.create_exprs(cols, funcs, df, [0.05, 0.25, 0.5, 0.75, 0.95], + relative_error)) + result.update(df.cols.exec_agg(exprs)) - exprs = [] n = BATCH_SIZE result_hist = {} list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)] + for i, cols in enumerate(list_columns): logger.print( "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".format(BATCH_NUMBER=i, COLUMNS=cols)) funcs = [hist_agg] - # min_max = None for col_name in cols: - # Only process histogram id numeric. For toher data types using frequency + # Only process histogram for numeric columns. 
For other data types using frequency if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES): min_max = {"min": result[col_name]["min"], "max": result[col_name]["max"]} buckets = result[col_name]["count_uniques"] - 1 @@ -539,7 +516,6 @@ def columns_agg(df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_c elif buckets == 0: buckets = 1 exprs.extend(df.cols.create_exprs(col_name, funcs, df, buckets, min_max)) - agg_result = df.cols.exec_agg(exprs) if agg_result is not None: result_hist.update(agg_result) @@ -548,56 +524,63 @@ def columns_agg(df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_c for col_name in result: if col_name in result_hist: result[col_name].update(result_hist[col_name]) - return result - def extra_columns_stats(self, df, col_name, stats): - """ - Specific Stats for numeric columns - :param df: - :param col_name: - :param stats: - :return: - """ + def extra_columns_stats(df, col_name, stats): + """ + Specific Stats for numeric columns + :param df: + :param col_name: + :param stats: + :return: + """ - col_info = {} + col_info = {} - max_value = stats[col_name]["max"] - min_value = stats[col_name]["min"] + max_value = stats[col_name]["max"] + min_value = stats[col_name]["min"] - if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES): - stddev = stats[col_name]['stddev'] - mean = stats[col_name]['mean'] + if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES): + stddev = stats[col_name]['stddev'] + mean = stats[col_name]['mean'] - quantile = stats[col_name]["percentile"] - if max_value is not None and min_value is not None: - col_info['range'] = max_value - min_value - else: - col_info['range'] = None + quantile = stats[col_name]["percentile"] + if max_value is not None and min_value is not None: + col_info['range'] = max_value - min_value + else: + col_info['range'] = None - col_info['median'] = quantile["0.5"] + col_info['median'] = quantile["0.5"] - q1 = quantile["0.25"] - q3 = quantile["0.75"] + q1 = quantile["0.25"] + q3 = 
quantile["0.75"] - if q1 is not None and q3 is not None: - col_info['interquartile_range'] = q3 - q1 - else: - col_info['interquartile_range'] = None + if q1 is not None and q3 is not None: + col_info['interquartile_range'] = q3 - q1 + else: + col_info['interquartile_range'] = None - if mean != 0 and mean is not None: - col_info['coef_variation'] = round((stddev / mean), 5) - else: - col_info['coef_variation'] = None + if mean != 0 and mean is not None: + col_info['coef_variation'] = round((stddev / mean), 5) + else: + col_info['coef_variation'] = None + + mad = df.cols.mad(col_name) + if mad is not None: + col_info['mad'] = round(df.cols.mad(col_name), 5) + else: + col_info['mad'] = None + + col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2) + col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2) + return col_info + + if advanced_stats is True: + for col_name in columns: + result.update(extra_columns_stats(df, col_name, result)) + + return result - mad = df.cols.mad(col_name) - if mad is not None: - col_info['mad'] = round(df.cols.mad(col_name), 5) - else: - col_info['mad'] = None - col_info['p_count_na'] = round((stats[col_name]['count_na'] * 100) / self.rows_count, 2) - col_info['p_count_uniques'] = round((stats[col_name]['count_uniques'] * 100) / self.rows_count, 2) - return col_info @staticmethod def missing_values(df, columns):