Fix count_by_dtypes

eapframework · Dec 2, 2019 · 5da97c2 · 5da97c2
1 parent dde5b7f
commit 5da97c2
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 4 deletions.
diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py
@@ -43,7 +43,7 @@
     compress_list, compress_dict
 from optimus.helpers.raiseit import RaiseIt
 from optimus.ml.encoding import string_to_index as ml_string_to_index
-from optimus.profiler.functions import fill_missing_var_types
+from optimus.profiler.functions import fill_missing_var_types, parse_profiler_dtypes
 
 ENGINE = "spark"
 # Because the monkey patching and the need to call set a function we need to rename the standard python set.
@@ -1809,10 +1809,8 @@ def count_by_dtypes(columns, infer=False, str_funcs=None, int_funcs=None):
 
         # Process mismatch
         for col_name, result_dtypes in result.items():
-            result[col_name]["mismatch"] = 0
             for result_dtype, count in result_dtypes.items():
                 if is_tuple(count):
-                    result[col_name]["mismatch"] = result[col_name]["mismatch"] + count[1]
                     result[col_name][result_dtype] = count[0]
 
         if infer is True:

diff --git a/optimus/profiler/functions.py b/optimus/profiler/functions.py
@@ -3,10 +3,26 @@
 import math
 
 from optimus.helpers.constants import ProfilerDataTypes, PROFILER_COLUMN_TYPES, \
-    CONFIDENCE_LEVEL_CONSTANT
+    CONFIDENCE_LEVEL_CONSTANT, SPARK_DTYPES_TO_PROFILER
 from optimus.helpers.json import json_converter
 
 
+def parse_profiler_dtypes(col_data_type):
+    """
+    Parse Spark data type counts into profiler data type counts, column by column.
+    :param col_data_type: dict of column name -> dict of data type -> count
+    :return: dict of column name -> dict of profiler data type -> count; "null" and "missing" start at 0
+    """
+    columns = {}
+    for col_name, data_type_count in col_data_type.items():
+        columns[col_name] = {data_type: 0 for data_type in ["null", "missing"]}
+        for data_type, count in data_type_count.items():
+            for profiler_data_type, spark_data_type in SPARK_DTYPES_TO_PROFILER.items():
+                if data_type in SPARK_DTYPES_TO_PROFILER[profiler_data_type]:  # data types matching no entry are left out
+                    columns[col_name][profiler_data_type] = count  # NOTE(review): assigns, does not add; a later match for the same profiler type replaces the earlier count - confirm intended
+    return columns
+
+
 def fill_missing_var_types(var_types, dtypes):
     """
     Fill missing data types with 0