Skip to content

Commit

Permalink
Fix count_by_dtypes
Browse files Browse the repository at this point in the history
  • Loading branch information
argenisleon committed Dec 2, 2019
1 parent dde5b7f commit 5da97c2
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
4 changes: 1 addition & 3 deletions optimus/dataframe/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
compress_list, compress_dict
from optimus.helpers.raiseit import RaiseIt
from optimus.ml.encoding import string_to_index as ml_string_to_index
from optimus.profiler.functions import fill_missing_var_types
from optimus.profiler.functions import fill_missing_var_types, parse_profiler_dtypes

ENGINE = "spark"
# Because of the monkey patching and the need to name a function `set`, we need to rename the standard Python set.
Expand Down Expand Up @@ -1809,10 +1809,8 @@ def count_by_dtypes(columns, infer=False, str_funcs=None, int_funcs=None):

# Process mismatch
for col_name, result_dtypes in result.items():
result[col_name]["mismatch"] = 0
for result_dtype, count in result_dtypes.items():
if is_tuple(count):
result[col_name]["mismatch"] = result[col_name]["mismatch"] + count[1]
result[col_name][result_dtype] = count[0]

if infer is True:
Expand Down
18 changes: 17 additions & 1 deletion optimus/profiler/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,26 @@
import math

from optimus.helpers.constants import ProfilerDataTypes, PROFILER_COLUMN_TYPES, \
CONFIDENCE_LEVEL_CONSTANT
CONFIDENCE_LEVEL_CONSTANT, SPARK_DTYPES_TO_PROFILER
from optimus.helpers.json import json_converter


def parse_profiler_dtypes(col_data_type):
    """
    Parse Spark data type counts to profiler data type counts

    :param col_data_type: Dict with the column name as key and, as value, a dict
    of data type name -> count for that column.
    :return: Dict with the column name as key and, as value, a dict of profiler
    data type -> count. Every column always has the "null" and "missing" keys,
    which stay at 0 when no input data type maps onto them.
    """

    columns = {}
    for col_name, data_type_count in col_data_type.items():
        profiler_counts = {"null": 0, "missing": 0}
        for data_type, count in data_type_count.items():
            for profiler_data_type, spark_data_types in SPARK_DTYPES_TO_PROFILER.items():
                if data_type in spark_data_types:
                    # More than one input data type may fall under the same profiler
                    # data type, so add to the running total instead of overwriting
                    # the count stored by a previous match.
                    previous = profiler_counts.get(profiler_data_type, 0)
                    profiler_counts[profiler_data_type] = previous + count
        columns[col_name] = profiler_counts
    return columns


def fill_missing_var_types(var_types, dtypes):
"""
Fill missing data types with 0
Expand Down

0 comments on commit 5da97c2

Please sign in to comment.