diff --git a/optimus/dataframe/columns.py b/optimus/dataframe/columns.py index 06e015fd..9119fc75 100644 --- a/optimus/dataframe/columns.py +++ b/optimus/dataframe/columns.py @@ -37,6 +37,7 @@ from optimus.helpers.parser import compress_list, compress_dict, parse_python_dtypes, parse_col_names_funcs_to_keys from optimus.helpers.raiseit import RaiseIt from optimus.ml.encoding import string_to_index as ml_string_to_index +from optimus.ml.encoding import index_to_string as ml_index_to_string from optimus.profiler.functions import fill_missing_var_types, parse_profiler_dtypes from optimus import ROOT_DIR @@ -2092,6 +2093,21 @@ def string_to_index(input_cols=None, output_cols=None, columns=None): return df + @add_attr(cols) + def index_to_string(input_cols=None, output_cols=None, columns=None): + """ + Encodes a string column of labels to a column of label indices + :param input_cols: + :param output_cols: + :param columns: + :return: + """ + df = self + + df = ml_index_to_string(df, input_cols, output_cols, columns) + + return df + @add_attr(cols) def bucketizer(input_cols, splits, output_cols=None): """ diff --git a/optimus/helpers/constants.py b/optimus/helpers/constants.py index cbedab98..8a692ada 100644 --- a/optimus/helpers/constants.py +++ b/optimus/helpers/constants.py @@ -41,6 +41,7 @@ class Actions(Enum): VALUES_TO_COLS = "values_to_cols" SET = "set" STRING_TO_INDEX = "string_to_index" + INDEX_TO_STRING = "index_to_string" MIN_MAX_SCALER = "min_max_scaler" MAX_ABS_SCALER = "max_abs_scaler" # ROWS diff --git a/optimus/ml/encoding.py b/optimus/ml/encoding.py index 1218a625..ae257b22 100644 --- a/optimus/ml/encoding.py +++ b/optimus/ml/encoding.py @@ -1,10 +1,10 @@ from pyspark.ml import feature, Pipeline -from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder, VectorAssembler, Normalizer +from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer, IndexToString -from optimus.infer import is_, is_str, is_dataframe from optimus.helpers.columns import parse_columns, name_col, get_output_cols from optimus.helpers.constants import Actions from optimus.helpers.raiseit import RaiseIt +from optimus.infer import is_, is_str, is_dataframe def n_gram(df, input_col, n=2): @@ -36,6 +36,7 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs): :param df: Dataframe to be transformed :param input_cols: Columns to be indexed. :param output_cols:Column where the ouput is going to be saved + :param columns: :return: Dataframe with indexed columns. """ df_actual = df @@ -43,7 +44,7 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs): if columns is None: input_cols = parse_columns(df, input_cols) if output_cols is None: - output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols] + output_cols = [name_col(input_col, "string_to_index") for input_col in input_cols] output_cols = get_output_cols(input_cols, output_cols) else: input_cols, output_cols = zip(*columns) @@ -59,27 +60,34 @@ def string_to_index(df, input_cols, output_cols=None, columns=None, **kargs): return df -def index_to_string(df, input_cols, output_col=None, **kargs): +def index_to_string(df, input_cols, output_cols=None, columns=None, **kargs): """ Maps a column of indices back to a new column of corresponding string values. The index-string mapping is either from the ML attributes of the input column, or from user-supplied labels (which take precedence over ML attributes). :param df: Dataframe to be transformed. :param input_cols: Columns to be indexed. - :param output_col: Column where the output is going to be saved. + :param output_cols: Column where the output is going to be saved. + :param columns: :return: Dataframe with indexed columns. """ + df_actual = df - input_cols = parse_columns(df, input_cols) - if output_col is None: - output_col = name_col(input_cols, "index_to_string") - - indexers = [IndexToString(inputCol=column, outputCol=output_col, **kargs) for column in - list(set(input_cols))] + if columns is None: + input_cols = parse_columns(df, input_cols) + if output_cols is None: + output_cols = [name_col(input_col, "index_to_string") for input_col in input_cols] + output_cols = get_output_cols(input_cols, output_cols) + else: + input_cols, output_cols = zip(*columns) + indexers = [IndexToString(inputCol=input_col, outputCol=output_col, **kargs) for input_col, output_col + in zip(list(set(input_cols)), list(set(output_cols)))] pipeline = Pipeline(stages=indexers) df = pipeline.fit(df).transform(df) + df = df.preserve_meta(df_actual, Actions.INDEX_TO_STRING.value, output_cols) + return df diff --git a/tests/creator/creator.ipynb b/tests/creator/creator.ipynb index 3358caa9..8d06dd73 100644 --- a/tests/creator/creator.ipynb +++ b/tests/creator/creator.ipynb @@ -9,18 +9,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -28,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": { "inputHidden": false, "outputHidden": false @@ -41,9 +32,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:\\Users\\argenisleon\\Documents\\Optimus\\tests\\creator\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " You are using PySparkling of version 2.4.10, but your PySpark is of\n", + " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n" + ] + } + ], "source": [ "from optimus import Optimus\n", "from optimus.helpers.test import Test" @@ -51,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -79,20 +87,7 @@ " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", " /_/ \n", " \n", - "INFO:optimus:Transform and Roll out...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "C:/Users/argenisleon/Documents/Optimus/optimus/../parse/infer.py\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "INFO:optimus:Transform and Roll out...\n", "INFO:optimus:Optimus successfully imported. Have fun :).\n", "INFO:optimus:Config.ini not found\n" ] @@ -104,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": { "lines_to_next_cell": 2 }, @@ -1280,7 +1275,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -2319,7 +2314,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -2328,7 +2323,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -2341,7 +2336,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -2363,4989 +2358,975 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(source_df_string_to_index, \"cols.string_to_index\", None, \"df\", None, \"rank\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(source_df_string_to_index, \"cols.values_to_cols\", None, \"df\", None, \"rank\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(source_df_string_to_index, \"cols.values_to_cols\", \"all_columns\", \"df\", None, [\"names\",\"height(ft)\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove\", None, \"df\", None, string_col, \"i\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove\", \"list\", \"df\", string_col, [\"a\",\"i\",\"Es\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove\", \"list_output\", \"df\", string_col, [\"a\",\"i\",\"Es\"], output_cols=string_col+\"_new\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.min\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.min\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.max\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.max\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.range\", None, \"json\",None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.range\", \"all_columns\", \"json\",None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.median\", None, \"json\", None,numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.median\", \"all_columns\", \"json\", None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.percentile\", None, \"json\", None, numeric_col, [0.05, 0.25], 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.percentile\", \"all_columns\", \"json\", None, \"*\", [0.05, 0.25], 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MAD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mad\", None, \"json\", None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mad\", \"all_columns\", \"json\", None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.std\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.std\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.kurt\", None, \"json\", None, numeric_col)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.kurt\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mean\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mean\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.skewness\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.skewness\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sum\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sum\", \"all_columns\", \"json\", None,\"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.variance\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.variance\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "source_df.select(F.abs(F.col(\"age\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.abs\", None, \"df\", None,\"weight(t)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.abs\", \"all_columns\", \"json\", None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "source_df.select(F.abs(\"weight(t)\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 30, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_cols_mode() test function...\n" + "Creating test_cols_string_to_index() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_cols_mode()\n" + "INFO:optimus:test_cols_string_to_index()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "None\n" - ] - } - ], - "source": [ - "t.create(None, \"cols.mode\", None, \"json\", None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_mode_all_columns() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_mode_all_columns()\n" + "[\"'rank'\"]\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'names': None}, {'height(ft)': None}, {'function': None}, {'rank': [8, 7, 10]}, {'age': 5000000}, {'weight(t)': None}, {'japanese name': None}, {'last position seen': None}, {'date arrival': '1980/04/10'}, {'last date seen': None}, {'attributes': None}, {'Date Type': None}, {'timestamp': datetime.datetime(2014, 6, 24, 0, 0)}, {'Cybertronian': True}, {'function(binary)': None}, {'NullType': None}]\n", - "Wall time: 40.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "t.create(None, \"cols.mode\", \"all_columns\", \"json\", None,\"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_cols.py\n", - "Done\n" - ] - } - ], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count\", None, \"json\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Count na" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_na\", None, \"json\", None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_na\", \"all_columns\", \"json\",None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.cols.names(\"rank\",[\"str\",\"int\",\"float\"],True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_zeros\", None, \"json\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_zeros\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.cols.names()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Value counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.value_counts\", None, \"json\", None, numeric_col)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.value_counts\", \"all_columns\", \"json\", None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_uniques\", None, \"json\", None, numeric_col)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.count_uniques\", \"all_columns\", \"json\",None, \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unique\", None, \"json\", None,numeric_col)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unique\", \"all_columns\", \"json\", None,\"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.add\", None, \"df\", [numeric_col, numeric_col_B])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.add\", \"all_columns\", \"df\", \"*\")," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sub\", None, \"df\", [numeric_col, numeric_col_B])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sub\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mul\", None, \"df\", [numeric_col, numeric_col_B])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.mul\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.div\", None, \"df\", [numeric_col, numeric_col_B])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.div\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.z_score\", None, \"df\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.z_score\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.iqr\", None, \"json\", None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.iqr\", \"all_columns\", \"json\",None, \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.lower\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.lower\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.upper\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.upper\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.trim\", None, \"df\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.trim\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.reverse\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.reverse\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_accents\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_accents\", \"all_columns\", \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_special_chars\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_special_chars\", \"all_columns\",\"df\", None, \"*\")\n", - "t.run()\n", - "# t.create(None, \"cols.value_counts\", None, \"json\", None, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.cols.remove_special_chars(\"*\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_white_spaces\", None, \"df\", string_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.remove_white_spaces\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.date_transform\", None, \"df\", date_col, \"yyyy/MM/dd\", \"dd-MM-YYYY\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.date_transform\", \"all_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\", \"dd-MM-YYYY\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "# t.create(None, \"cols.years_between\", None, \"df\", date_col, \"yyyy/MM/dd\")\n", - "t.delete(None, \"cols.years_between\", None, \"df\", date_col, \"yyyy/MM/dd\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "# t.create(None, \"cols.years_between\", \"multiple_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\")\n", - "t.delete(None, \"cols.years_between\", \"multiple_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.impute\", None, \"df\", numeric_col_B)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_impute_all_columns() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_impute_all_columns()\n", - "INFO:optimus:Using 'column_exp' to process column 'names' with function func_col_exp\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
names
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
height(ft)
\n", - "
2 (smallint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
rank
\n", - "
4 (tinyint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
age
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
weight(t)
\n", - "
6 (float)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
japanese name
\n", - "
7 (array<string>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last position seen
\n", - "
8 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
date arrival
\n", - "
9 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last date seen
\n", - "
10 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
attributes
\n", - "
11 (array<float>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Date Type
\n", - "
12 (date)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
timestamp
\n", - "
13 (timestamp)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Cybertronian
\n", - "
14 (boolean)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function(binary)
\n", - "
15 (binary)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
NullType
\n", - "
16 (null)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Optim'us\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " -28\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Leader\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 4.300000190734863\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Inochi',⋅'Convoy']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 19.442735,-99.201111\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2016/09/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [8.53439998626709,⋅4300.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2016-09-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Leader')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bumbl#ebéé⋅⋅\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 17\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Espionage\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2.0\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Bumble',⋅'Goldback']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10.642707,-71.612534\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2015/08/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [5.334000110626221,⋅2000.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2015-08-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Espionage')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ironhide&\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 26\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Security\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 4.0\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Roadbuster']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 37.789563,-122.400356\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014/07/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [7.924799919128418,⋅4000.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Security')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Jazz\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 13\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " First⋅Lieutenant\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1.7999999523162842\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Meister']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 33.670666,-117.841553\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2013/06/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [3.962399959564209,⋅1800.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2013-06-24\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'First⋅Lieutenant')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Megatron\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5.699999809265137\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Megatron']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2012/05/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [None,⋅5700.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2012-05-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'None')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Metroplex_)^$\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 300\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Battle⋅Station\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Metroflex']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [91.44000244140625,⋅None]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011-04-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Battle⋅Station')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 8.41 s\n" - ] - } - ], - "source": [ - "%%time\n", - "t.create(None, \"cols.impute\", \"all_columns\",\"df\", None ,\"names\",\"categorical\")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_cols.py\n", - "Done\n" - ] - } - ], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hist" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.hist\", None, \"json\", None, [\"height(ft)\",numeric_col_B], 4)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None,\"cols.hist\",\"all_columns\",\"json\",None, \"Date Type\",4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.frequency\", None, \"dict\", None, numeric_col_B, 4)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.frequency\", \"all_columns\", \"dict\", None, \"*\", 4)\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "t.create(None, \"cols.schema_dtype\", None, \"json\", numeric_col_B)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Problems with casting\n", - "# t.delete(None, \"cols.schema_dtype\", \"all_columns\", \"json\", \"*\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.dtypes\", None, \"json\", None, numeric_col_B)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.dtypes\", \"all_columns\", \"json\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_select_by_dtypes_str() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_select_by_dtypes_str()\n", - "INFO:optimus:`height(ft)`,`rank`,`age`,`weight(t)`,`japanese name`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not str\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 5 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
names
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function
\n", - "
2 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last position seen
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
date arrival
\n", - "
4 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last date seen
\n", - "
5 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Optim'us\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Leader\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 19.442735,-99.201111\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2016/09/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bumbl#ebéé⋅⋅\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Espionage\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10.642707,-71.612534\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2015/08/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ironhide&\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Security\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 37.789563,-122.400356\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014/07/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Jazz\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " First⋅Lieutenant\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 33.670666,-117.841553\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2013/06/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Megatron\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2012/05/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Metroplex_)^$\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Battle⋅Station\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 5 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "t.create(None, \"cols.select_by_dtypes\", \"str\", \"df\", None, \"str\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.select_by_dtypes\", \"int\", \"df\", \"int\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.select_by_dtypes\", \"float\", \"df\", \"float\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.select_by_dtypes\", \"array\", \"df\", \"array\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.names\", None, \"json\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.qcut\", None, \"df\", numeric_col_B, 4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.qcut\", \"all_columns\", \"df\", \"*\", 4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.clip\", None, \"df\", numeric_col_B, 3, 5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.clip\", \"all_columns\", \"df\", \"*\", 3, 5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.replace\", \"full\", \"df\", None,string_col,[\"First Lieutenant\",\"Battle\"], \"Match\", search_by=\"full\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.replace\", \"words\", \"df\", None,string_col,[\"Security\", \"Leader\"], \"Match\", search_by=\"words\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.replace\", \"chars\", \"df\", None,string_col,[\"F\", \"E\"], \"Match\", search_by=\"chars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.replace\", \"numeric\", \"df\", None,\"age\",5000000, 5, search_by=\"numeric\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "# Assert is failing I can see why\n", - "t.create(None, \"cols.replace\", \"all_columns\", \"df\", None,\"*\", [\"Jazz\", \"Leader\"], \"Match\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Its necesary to save the function \n", - "t.delete(None, \"cols.apply_expr\", None, \"df\", numeric_col_B, func)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Its necesary to save the function \n", - "t.delete(None, \"cols.apply_expr\", \"all_columns\", \"df\", [numeric_col_B,numeric_col_C], func)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.append\", \"number\", \"df\", new_col, 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_col = op.create.df(\n", - " [\n", - " (\"new_col\", \"str\", True),\n", - " \n", - "\n", - " ],[\n", - " (\"q\"),(\"w\"), (\"e\"), (\"r\"),\n", - "\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.append\", \"dataframes\", \"df\", None, df_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "#t.create(None, \"cols.append\", \"advance\", \"df\", [(\"new_col_4\", \"test\"),\n", - " # (\"new_col_5\", df[numeric_col_B] * 2),\n", - " # (\"new_col_6\", [1, 2, 3])\n", - " # ])," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.rename\", None, \"df\", numeric_col_B, numeric_col_B + \"(old)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.rename\", \"list\", \"df\", [numeric_col, numeric_col + \"(tons)\", numeric_col_B, numeric_col_B + \"(old)\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.rename\", \"function\", \"df\", str.upper)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.drop\", None, \"df\", numeric_col_B)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.cast\", None, \"df\", string_col, \"string\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.cast\", \"all_columns\", \"df\", \"*\", \"string\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Problems with precision\n", - "t.delete(None, \"cols.cast\", \"vector\", \"df\", array_col, Vectors)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.keep\", None, \"df\", numeric_col_B)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.move\", \"after\", \"df\", numeric_col_B, \"after\", array_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.move\", \"before\", \"df\", numeric_col_B, \"before\", array_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.move\", \"beginning\", \"df\", numeric_col_B, \"beginning\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.move\", \"end\", \"df\", numeric_col_B, \"end\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.select\", None, \"df\", 0, numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.select\", \"regex\", \"df\", \"n.*\", regex=True)," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sort\", None, \"df\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sort\", \"desc\", \"df\", None,\"desc\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.sort\", \"asc\", \"df\", None, \"asc\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.fill_na\", None, \"df\", numeric_col, \"1\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.fill_na\", \"array\", \"df\", None, \"japanese name\", [\"1\",\"2\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.fill_na\", \"bool\", \"df\", None, \"Cybertronian\", False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": true - } - }, - "outputs": [], - "source": [ - "t.create(None, \"cols.fill_na\", \"all_columns\", \"df\", [\"names\",\"height(ft)\", \"function\", \"rank\", \"age\"], \"2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Nest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.nest\", None, \"df\", None, [numeric_col, numeric_col_B], separator=\" \",output_col=new_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# t.create(None, \"cols.nest\", \"mix\", \"df\", [F.col(numeric_col_C), F.col(numeric_col_B)], \"E\", separator=\"--\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_na = source_df.cols.drop(\"NullType\").rows.drop_na(\"*\")\n", - "\n", - "t.create(df_na, \"cols.nest\", \"vector_all_columns\", \"df\", None,[numeric_col_C, numeric_col_B], shape=\"vector\", output_col=new_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(df_na, \"cols.nest\", \"vector\", \"df\", None, [numeric_col_C, numeric_col_B], shape=\"vector\",output_col=new_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.nest\", \"array\", \"df\", None, [numeric_col, numeric_col_B,numeric_col_C], shape=\"array\", output_col=new_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_count_by_dtypes() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_count_by_dtypes()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'names': {'null': 1, 'missing': 0, 'string': 6}, 'height(ft)': {'null': 2, 'missing': 0, 'int': 5}, 'function': {'null': 1, 'missing': 0, 'string': 6}, 'rank': {'null': 1, 'missing': 0, 'int': 6}, 'age': {'null': 1, 'missing': 0, 'int': 6}, 'weight(t)': {'null': 2, 'missing': 0, 'decimal': 5}, 'japanese name': {'null': 1, 'missing': 0, 'array': 6}, 'last position seen': {'null': 3, 'missing': 0, 'string': 4}, 'date arrival': {'null': 1, 'missing': 0, 'string': 6}, 'last date seen': {'null': 1, 'missing': 0, 'string': 6}, 'attributes': {'null': 1, 'missing': 0, 'array': 6}, 'Date Type': {'null': 1, 'missing': 0, 'date': 6}, 'timestamp': {'null': 1, 'missing': 0, 'date': 6}, 'Cybertronian': {'null': 1, 'missing': 0, 'boolean': 6}, 'function(binary)': {'null': 1, 'missing': 0, 'binary': 6}, 'NullType': {'null': 7, 'missing': 0}}\n" - ] - } - ], - "source": [ - "t.create(None, \"cols.count_by_dtypes\", None, \"dict\", None, \"*\", infer=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_count_by_dtypes_infer() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_count_by_dtypes_infer()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'names': {'string': 6, 'null': 1, 'int': 0, 'decimal': 0, 'boolean': 0, 'date': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1, 'int': 0, 'decimal': 0, 'boolean': 0, 'date': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'array': 3, 'date': 1, 'null': 3, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'date arrival': {'date': 6, 'null': 1, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'last date seen': {'date': 6, 'null': 1, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}\n" - ] - } - ], - "source": [ - "t.create(None, \"cols.count_by_dtypes\", \"infer\", \"dict\", None, \"*\", infer=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_cols.py\n", - "Done\n" - ] - } - ], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "dtypes_df = op.create.df(\n", - " [\n", - " (\"col 1\", \"str\", True),\n", - " (\"col 2\", \"str\", True),\n", - " (\"col 3\", \"int\", True),\n", - " \n", - " ],\n", - " [\n", - " (\"male\",\"male\",1),\n", - " (\"optimus\",\"bumblebee\",1),\n", - " (\"3\",\"4.1\",1),\n", - " (\"true\",\"False\",1),\n", - " (\"[1,2,3,4]\",\"(1,2,3,4)\",1),\n", - " (\"{1,2,3,4}\",\"{'key1' :1 , 'key2':2}\",1),\n", - " (\"1.1.1.1\",\"123.123.123.123\",1),\n", - " (\"http://hi-optimuse.com\",\"https://hi-bumblebee.com\",1),\n", - " (\"optimus@cybertron.com\",\"bumblebee@cybertron.com\",1),\n", - " (\"5123456789123456\",\"373655783158306\",1),\n", - " (\"11529\",\"30345\",1),\n", - " (\"04/10/1980\",\"04/10/1980\",1),\n", - " (\"null\",\"Null\",1),\n", - " (\"\",\"\",1),\n", - " (None,None,1) \n", - " \n", - " ], infer_schema=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(dtypes_df, \"cols.count_by_dtypes\", \"infer\", \"dict\", None, \"*\", infer=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(source_df, \"cols.count_by_dtypes\", None, \"dict\", None, \"*\", infer=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "from datetime import date, datetime\n", - "\n", - "from pyspark.sql.types import *\n", - "\n", - "from optimus import Optimus\n", - "\n", - "mismatch_df = op.create.df(\n", - " [\n", - " (\"names\", \"str\", True),\n", - " (\"height(ft)\", \"int\", True),\n", - " (\"function\", \"str\", True),\n", - " (\"rank\", \"int\", True),\n", - " (\"age\", \"int\", True),\n", - " (\"weight(t)\", \"float\", True),\n", - " (\"japanese name\", ArrayType(StringType()), True),\n", - " (\"last position seen\", \"str\", True),\n", - " (\"date arrival\", \"str\", True),\n", - " (\"last date seen\", \"str\", True),\n", - " (\"attributes\", ArrayType(FloatType()), True),\n", - " (\"DateType\", DateType()),\n", - " (\"Timestamp\", TimestampType()),\n", - " (\"Cybertronian\", \"bool\", True),\n", - " (\"function(binary)\", \"binary\", False),\n", - " (\"NullType\", \"null\", True),\n", - "\n", - " ],\n", - " [\n", - " (\"31/12/2019\", 28, \"1978-12-20\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", - " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", - " None),\n", - " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", - " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", - " None),\n", - " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", - " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", - " None),\n", - " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", - " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", - " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", - " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", - " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", - " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", - " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", - " (\"1\", 2, \"3\", 4, 5, 6.0, [\"7\"], 8, \"1980/04/10\", \"2011/04/10\",\n", - " [11.0], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"15\", \"utf-8\"), None)\n", - " ], infer_schema=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "mismatch = {\"names\":\"dd/mm/yyyy\",\"height(ft)\":r'^([0-2][0-9]|(3)[0-1])(\\/)(((0)[0-9])|((1)[0-2]))(\\/)\\d{4}$',\"function\":\"yyyy-mm-dd\"}" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "m = {\"names\":\"int\"}" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
names
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
height(ft)
\n", - "
2 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
rank
\n", - "
4 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
age
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
weight(t)
\n", - "
6 (float)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
japanese name
\n", - "
7 (array<string>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last position seen
\n", - "
8 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
date arrival
\n", - "
9 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last date seen
\n", - "
10 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
attributes
\n", - "
11 (array<float>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
DateType
\n", - "
12 (date)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Timestamp
\n", - "
13 (timestamp)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Cybertronian
\n", - "
14 (boolean)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function(binary)
\n", - "
15 (binary)
\n", - "
\n", - " \n", - " not nullable\n", - " \n", - "
\n", - "
\n", - "
NullType
\n", - "
16 (null)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 31/12/2019\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 28\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1978-12-20\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 4.300000190734863\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Inochi',⋅'Convoy']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 19.442735,-99.201111\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2016/09/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [8.53439998626709,⋅4300.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2016-09-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Leader')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bumbl#ebéé⋅⋅\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 17\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Espionage\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2.0\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Bumble',⋅'Goldback']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10.642707,-71.612534\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2015/08/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [5.334000110626221,⋅2000.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2015-08-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Espionage')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ironhide&\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 26\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Security\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 4.0\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Roadbuster']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 37.789563,-122.400356\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014/07/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [7.924799919128418,⋅4000.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Security')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Jazz\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 13\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " First⋅Lieutenant\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1.7999999523162842\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Meister']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 33.670666,-117.841553\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2013/06/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [3.962399959564209,⋅1800.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2013-06-24\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'First⋅Lieutenant')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Megatron\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5.699999809265137\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Megatron']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2012/05/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [None,⋅5700.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2012-05-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'None')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Metroplex_)^$\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 300\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " Battle⋅Station\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5000000\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['Metroflex']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [91.44000244140625,⋅None]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011-04-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'Battle⋅Station')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 3\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 4\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 6.0\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ['7']\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1980/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011/04/10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " [11.0]\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2011-04-10\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'15')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "mismatch_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_count_mismatch() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_count_mismatch()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['{\"names\": \"int\"}']\n", - "{'names': {'mismatch': 6, 'int': 1, 'null': 0, 'missing': 0}}\n" - ] - } - ], - "source": [ - "t.create(mismatch_df, \"cols.count_mismatch\", None, \"dict\", None, {\"names\":\"int\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_cols.py\n", - "Done\n" - ] - } - ], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Unnest String" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_cols_unnest_string_multi_index() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_cols_unnest_string_multi_index()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'date arrival'\", \"'/'\"]\n" - ] - }, - { - "ename": "ValueError", - "evalue": "'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"cols.unnest\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"string_multi_index\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate_col\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"/\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplits\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\test.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, obj, method, suffix, output, additional_method, *args, **kwargs)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[0mdf_func\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_func\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 218\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 219\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 220\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[1;31m# Additional Methods\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 50\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlog_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36munnest\u001b[1;34m(input_cols, separator, splits, index, output_cols)\u001b[0m\n\u001b[0;32m 1571\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1572\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1573\u001b[1;33m \u001b[0minput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1574\u001b[0m \u001b[0moutput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_output_cols\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1575\u001b[0m \u001b[0mfinal_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mparse_columns\u001b[1;34m(df, cols_args, get_args, is_regex, filter_by_column_dtypes, accepts_missing_cols, invert)\u001b[0m\n\u001b[0;32m 140\u001b[0m \u001b[1;31m# Check for missing columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccepts_missing_cols\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 142\u001b[1;33m \u001b[0mcheck_for_missing_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 143\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 144\u001b[0m \u001b[1;31m# Filter by column data type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mcheck_for_missing_columns\u001b[1;34m(df, col_names)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 243\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 244\u001b[1;33m \u001b[0mRaiseIt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 245\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 246\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\raiseit.py\u001b[0m in \u001b[0;36mvalue_error\u001b[1;34m(var, data_values)\u001b[0m\n\u001b[0;32m 76\u001b[0m type=divisor.join(map(\n\u001b[0;32m 77\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"'\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"'\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 78\u001b[1;33m data_values)), var_type=one_list_to_val(var)))\n\u001b[0m\u001b[0;32m 79\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mValueError\u001b[0m: 'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'" - ] - } - ], - "source": [ - "t.create(None, \"cols.unnest\", \"string_multi_index\", \"df\", None, date_col, \"/\", splits=3, index=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_multi_index\", \"df\", None, date_col, \"/\", splits=3, index=[1,2])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_infer_split\", \"df\", None, date_col, \"/\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_no_index\", \"df\", None, date_col, \"/\", splits=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_output_columns\", \"df\", None, date_col, \"/\", splits=3, output_cols= [(\"year\", \"month\",\"day\")])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"array_index\", \"df\", None, array_col, index=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"array_multi_index\", \"df\", None, array_col, index=[1,2])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_multi_colum_multi_index_multi_output\", \"df\", None, [\"date arrival\",\"last date seen\"], \"/\", index=[(1,2),(1,2)], output_cols=[(\"year1\",\"month1\"),(\"year2\",\"month2\")])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"string_multi_colum_multi_output\", \"df\", None, [\"date arrival\",\"last date seen\"], \"/\", output_cols=[(\"year1\",\"month1\"),(\"year2\",\"month2\")])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"array\", \"df\", array_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.unnest\", \"array_all_columns\", \"df\", array_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.is_na\", \"all_columns\", \"df\", \"*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"cols.is_na\", None, \"df\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql.types import *\n", - "from optimus import Optimus\n", - "from optimus.helpers.json import json_enconding\n", - "from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\n", - "import numpy as np\n", - "nan = np.nan\n", - "import datetime" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_end_of_cell_marker": 2 - }, - "outputs": [], - "source": [ - "actual_df =op.load.json('https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json')\n", - "expected_df = op.create.df([('billingId', LongType(), True),('birth', StringType(), True),('dummyCol', StringType(), True),('firstName', StringType(), True),('id', LongType(), True),('lastName', StringType(), True),('price', LongType(), True),('product', StringType(), True)], [(123, '1980/07/07', 'never', 'Luis', 1, 'Alvarez$$%!', 10, 'Cake')])\n", - "\n", - "# assert (expected_df.collect() == actual_df.collect())\n", - "\n", - "from deepdiff import DeepDiff # For Deep Difference of 2 objects\n", - "\n", - "actual_df.table()\n", - "expected_df.table()\n", - "\n", - "# source_df.table()\n", - "# print(actual_df.to_json())\n", - "# print(expected_df.to_json())\n", - "a1 = actual_df.to_json()\n", - "e1 = expected_df.to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ddiff = DeepDiff(a1, e1, ignore_order=False)\n", - "print(ddiff)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Rows Test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t = Test(op,df, \"df_rows\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\",\n", - " \"from optimus.functions import abstract_udf as audf\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rows = [\n", - " (\"Optim'us\", 28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", - " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", - " None)\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t = Test(op, source_df, \"op_io\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\"],path = \"op_io\", final_path=\"..\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(op, \"load.csv\", \"local_csv\", \"df\", \"../../examples/data/foo.csv\")\n", - "t.create(op, \"load.json\", \"local_json\", \"df\", \"../../examples/data/foo.json\")\n", - "t.create(op, \"load.parquet\", \"local_parquet\", \"df\", \"../../examples/data/foo.parquet\")\n", - "t.create(op, \"load.csv\", \"remote_csv\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(op, \"load.json\", \"remote_json\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(op, \"load.parquet\", \"remote_parquet\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from optimus.profiler.profiler import Profiler\n", - "p = Profiler()\n", - "\n", - "print(p.run(source_df1, \"japanese name\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# df_string = source_df.cols.cast(\"*\",\"str\")\n", - "t.create(source_df, \"save.csv\", None, None, \"test.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 15 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank***STRING_TO_INDEX
\n", + "
15 (double)
\n", + "
\n", + " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 0.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 0.0\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 15 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "t.create(None, \"save.json\", None, None, \"test.json\")" + "t.create(source_df_string_to_index, \"cols.string_to_index\", None, \"df\", None, \"rank\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"save.parquet\", None, None, \"test.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "lines_to_next_cell": 2 - }, - "outputs": [], - "source": [ - "source_df.table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Ouliers" + "source_df_index_to_string = source_df_string_to_index.cols.string_to_index(\"rank\")" ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -7356,7 +3337,7 @@ "\n", "\n", "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", + "
Viewing 6 of 6 rows / 15 columns
\n", "
1 partition(s)
\n", "\n", "\n", @@ -7504,21 +3485,11 @@ " \n", " \n", " \n", - " \n", - " \n", @@ -7643,17 +3614,9 @@ " \n", " \n", " \n", - " \n", - " \n", @@ -7761,31 +3724,23 @@ " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", @@ -7907,17 +3862,9 @@ " \n", " \n", " \n", - " \n", - " \n", @@ -8039,17 +3986,9 @@ " \n", " \n", " \n", - " \n", - " \n", @@ -8171,17 +4110,9 @@ " \n", " \n", " \n", - " \n", - " \n", @@ -8303,149 +4234,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", @@ -8456,7 +4247,7 @@ "
\n", - "
function(binary)
\n", - "
15 (binary)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
NullType
\n", - "
16 (null)
\n", + "
rank***STRING_TO_INDEX
\n", + "
15 (double)
\n", "
\n", " \n", - " nullable\n", + " not nullable\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'Leader')\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 2.0\n", " \n", "
\n", "
\n", "
\n", " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " bytearray(b'Espionage')\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 1.0\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'Security')\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 1.0\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'First⋅Lieutenant')\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 0.0\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'None')\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 2.0\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'Battle⋅Station')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 0.0\n", " \n", "
\n", "
\n", "\n", "\n", - "
Viewing 7 of 7 rows / 16 columns
\n", + "
Viewing 6 of 6 rows / 15 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ @@ -8468,95 +4259,1068 @@ } ], "source": [ - "import pandas as pd\n", - "from pyspark.sql.types import *\n", - "from datetime import date, datetime\n", - "\n", - "\n", - "cols = [\n", - " (\"names\", \"str\"),\n", - " (\"height(ft)\", ShortType()),\n", - " (\"function\", \"str\"),\n", - " (\"rank\", ByteType()),\n", - " (\"age\", \"int\"),\n", - " (\"weight(t)\", \"float\"),\n", - " \"japanese name\",\n", - " \"last position seen\",\n", - " \"date arrival\",\n", - " \"last date seen\",\n", - " (\"attributes\", ArrayType(FloatType())),\n", - " (\"Date Type\", DateType()),\n", - " (\"timestamp\", TimestampType()),\n", - " (\"Cybertronian\", BooleanType()),\n", - " (\"function(binary)\", BinaryType()),\n", - " (\"NullType\", NullType())\n", - "\n", - " ]\n", - "\n", - "rows = [\n", - " (\"Optim'us\", -28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", - " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", - " None),\n", - " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", - " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", - " None),\n", - " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", - " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", - " None),\n", - " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", - " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", - " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", - " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", - " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", - " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", - " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", - " (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n", - "\n", - " ]\n", - "source_df = op.create.df(cols ,rows)\n", + "source_df_index_to_string.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_index_to_string() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_index_to_string()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'rank'\"]\n", + "[IndexToString_47a68629355b65a5ee55]\n" + ] + }, + { + "ename": "Py4JJavaError", + "evalue": "An error occurred while calling o225.transform.\n: java.lang.ClassCastException: org.apache.spark.ml.attribute.UnresolvedAttribute$ cannot be cast to org.apache.spark.ml.attribute.NominalAttribute\r\n\tat org.apache.spark.ml.feature.IndexToString.transform(StringIndexer.scala:394)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\r\n\tat java.lang.reflect.Method.invoke(Method.java:498)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msource_df_index_to_string\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"cols.index_to_string\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"rank\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\test.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, obj, method, suffix, output, additional_method, *args, **kwargs)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[0mdf_func\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_func\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 218\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 219\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 220\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[1;31m# Additional Methods\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 50\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlog_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36mindex_to_string\u001b[1;34m(input_cols, output_cols, columns)\u001b[0m\n\u001b[0;32m 2105\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2106\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2107\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mml_index_to_string\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2108\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2109\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\ml\\encoding.py\u001b[0m in \u001b[0;36mindex_to_string\u001b[1;34m(df, input_cols, output_cols, columns, **kargs)\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexers\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 87\u001b[0m \u001b[0mpipeline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPipeline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstages\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mindexers\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 88\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 89\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 90\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpreserve_meta\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_actual\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mActions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mINDEX_TO_STRING\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mtransform\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 171\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 172\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 173\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 174\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Params must be a param map but got %s.\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\ml\\pipeline.py\u001b[0m in \u001b[0;36m_transform\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 260\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 261\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mt\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstages\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 262\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 263\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 264\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\ml\\base.py\u001b[0m in \u001b[0;36mtransform\u001b[1;34m(self, dataset, params)\u001b[0m\n\u001b[0;32m 171\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 172\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 173\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 174\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Params must be a param map but got %s.\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\ml\\wrapper.py\u001b[0m in \u001b[0;36m_transform\u001b[1;34m(self, dataset)\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_transfer_params_to_java\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_java_obj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msql_ctx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 306\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 307\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\py4j\\java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1257\u001b[1;33m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m 1258\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1259\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pyspark\\sql\\utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 64\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\py4j\\protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 326\u001b[0m raise Py4JJavaError(\n\u001b[0;32m 327\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 328\u001b[1;33m format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m 329\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 330\u001b[0m raise Py4JError(\n", + "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o225.transform.\n: java.lang.ClassCastException: org.apache.spark.ml.attribute.UnresolvedAttribute$ cannot be cast to org.apache.spark.ml.attribute.NominalAttribute\r\n\tat org.apache.spark.ml.feature.IndexToString.transform(StringIndexer.scala:394)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\r\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\r\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\r\n\tat java.lang.reflect.Method.invoke(Method.java:498)\r\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\r\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\r\n\tat py4j.Gateway.invoke(Gateway.java:282)\r\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\r\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\r\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\r\n\tat java.lang.Thread.run(Thread.java:748)\r\n" + ] + } + ], + "source": [ + "t.create(source_df_index_to_string, \"cols.index_to_string\", None, \"df\", None, \"rank\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(source_df_string_to_index, \"cols.values_to_cols\", None, \"df\", None, \"rank\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(source_df_string_to_index, \"cols.values_to_cols\", \"all_columns\", \"df\", None, [\"names\",\"height(ft)\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove\", None, \"df\", None, string_col, \"i\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove\", \"list\", \"df\", string_col, [\"a\",\"i\",\"Es\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove\", \"list_output\", \"df\", string_col, [\"a\",\"i\",\"Es\"], output_cols=string_col+\"_new\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.min\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.min\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.max\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.max\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.range\", None, \"json\",None, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.range\", \"all_columns\", \"json\",None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.median\", None, \"json\", None,numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.median\", \"all_columns\", \"json\", None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.percentile\", None, \"json\", None, numeric_col, [0.05, 0.25], 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.percentile\", \"all_columns\", \"json\", None, \"*\", [0.05, 0.25], 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MAD" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mad\", None, \"json\", None, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mad\", \"all_columns\", \"json\", None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.std\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.std\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.kurt\", None, \"json\", None, numeric_col)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.kurt\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mean\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mean\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.skewness\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.skewness\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sum\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sum\", \"all_columns\", \"json\", None,\"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.variance\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.variance\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "source_df.select(F.abs(F.col(\"age\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.abs\", None, \"df\", None,\"weight(t)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.abs\", \"all_columns\", \"json\", None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "source_df.select(F.abs(\"weight(t)\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_mode() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_mode()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "t.create(None, \"cols.mode\", None, \"json\", None, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_mode_all_columns() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_mode_all_columns()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'names': None}, {'height(ft)': None}, {'function': None}, {'rank': [8, 7, 10]}, {'age': 5000000}, {'weight(t)': None}, {'japanese name': None}, {'last position seen': None}, {'date arrival': '1980/04/10'}, {'last date seen': None}, {'attributes': None}, {'Date Type': None}, {'timestamp': datetime.datetime(2014, 6, 24, 0, 0)}, {'Cybertronian': True}, {'function(binary)': None}, {'NullType': None}]\n", + "Wall time: 40.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "t.create(None, \"cols.mode\", \"all_columns\", \"json\", None,\"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_cols.py\n", + "Done\n" + ] + } + ], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count\", None, \"json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count na" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_na\", None, \"json\", None, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_na\", \"all_columns\", \"json\",None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.cols.names(\"rank\",[\"str\",\"int\",\"float\"],True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_zeros\", None, \"json\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_zeros\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.cols.names()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Value counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.value_counts\", None, \"json\", None, numeric_col)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.value_counts\", \"all_columns\", \"json\", None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_uniques\", None, \"json\", None, numeric_col)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.count_uniques\", \"all_columns\", \"json\",None, \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unique\", None, \"json\", None,numeric_col)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unique\", \"all_columns\", \"json\", None,\"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.add\", None, \"df\", [numeric_col, numeric_col_B])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.add\", \"all_columns\", \"df\", \"*\")," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sub\", None, \"df\", [numeric_col, numeric_col_B])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sub\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mul\", None, \"df\", [numeric_col, numeric_col_B])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.mul\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.div\", None, \"df\", [numeric_col, numeric_col_B])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.div\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.z_score\", None, \"df\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.z_score\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.iqr\", None, \"json\", None, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.iqr\", \"all_columns\", \"json\",None, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.lower\", None, \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.lower\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.upper\", None, \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.upper\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.trim\", None, \"df\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.trim\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.reverse\", None, \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.reverse\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove_accents\", None, \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove_accents\", \"all_columns\", \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "source_df.table()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "t = Test(op, source_df, \"df_outliers\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\"], path = \"df_outliers\", final_path=\"..\")" + "t.create(None, \"cols.remove_special_chars\", None, \"df\", string_col)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "\n", - "def func(col_name, attrs):\n", - " return F.col(col_name) * 2\n", - "\n", - "numeric_col = \"height(ft)\"\n", - "numeric_col_B = \"rank\"\n", - "numeric_col_C = \"rank\"\n", - "string_col = \"function\"\n", - "date_col = \"date arrival\"\n", - "date_col_B = \"last date seen\"\n", - "new_col = \"new col\"\n", - "array_col = \"attributes\"" + "t.create(None, \"cols.remove_special_chars\", \"all_columns\",\"df\", None, \"*\")\n", + "t.run()\n", + "# t.create(None, \"cols.value_counts\", None, \"json\", None, numeric_col)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.cols.remove_special_chars(\"*\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove_white_spaces\", None, \"df\", string_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.remove_white_spaces\", \"all_columns\", \"df\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.date_transform\", None, \"df\", date_col, \"yyyy/MM/dd\", \"dd-MM-YYYY\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.date_transform\", \"all_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\", \"dd-MM-YYYY\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# t.create(None, \"cols.years_between\", None, \"df\", date_col, \"yyyy/MM/dd\")\n", + "t.delete(None, \"cols.years_between\", None, \"df\", date_col, \"yyyy/MM/dd\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# t.create(None, \"cols.years_between\", \"multiple_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\")\n", + "t.delete(None, \"cols.years_between\", \"multiple_columns\", \"df\", [date_col, date_col_B], \"yyyy/MM/dd\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.impute\", None, \"df\", numeric_col_B)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_impute_all_columns() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_impute_all_columns()\n", + "INFO:optimus:Using 'column_exp' to process column 'names' with function func_col_exp\n" + ] + }, { "data": { "text/html": [ @@ -9674,51 +6438,160 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 8.41 s\n" + ] + } + ], + "source": [ + "%%time\n", + "t.create(None, \"cols.impute\", \"all_columns\",\"df\", None ,\"names\",\"categorical\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_cols.py\n", + "Done\n" + ] } ], "source": [ - "source_df.table()" + "t.run()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Tukey" + "## Hist" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.hist\", None, \"json\", None, [\"height(ft)\",numeric_col_B], 4)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None,\"cols.hist\",\"all_columns\",\"json\",None, \"Date Type\",4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.frequency\", None, \"dict\", None, numeric_col_B, 4)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.frequency\", \"all_columns\", \"dict\", None, \"*\", 4)\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "t.create(None, \"cols.schema_dtype\", None, \"json\", numeric_col_B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Problems with casting\n", + "# t.delete(None, \"cols.schema_dtype\", \"all_columns\", \"json\", \"*\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.dtypes\", None, \"json\", None, numeric_col_B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.dtypes\", \"all_columns\", \"json\", \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_outliers_tukey_select() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_tukey_select()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\"]\n" + "Creating test_cols_select_by_dtypes_str() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:percentile() executed in 3.34 sec\n", - "INFO:optimus:percentile() executed in 2.68 sec\n" + "INFO:optimus:test_cols_select_by_dtypes_str()\n", + "INFO:optimus:`height(ft)`,`rank`,`age`,`weight(t)`,`japanese name`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not str\n" ] }, { @@ -9729,7 +6602,7 @@ "\n", "\n", "\n", - "
Viewing 2 of 2 rows / 16 columns
\n", + "
Viewing 7 of 7 rows / 5 columns
\n", "
1 partition(s)
\n", "\n", "\n", @@ -9747,58 +6620,8 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", @@ -9976,57 +6787,53 @@ " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -10036,57 +6843,61 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -10115,42 +6926,38 @@ " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -10169,7 +6976,7 @@ "
\n", - "
height(ft)
\n", - "
2 (smallint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", "
function
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
rank
\n", - "
4 (tinyint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
age
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
weight(t)
\n", - "
6 (float)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
japanese name
\n", - "
7 (array<string>)
\n", + "
2 (string)
\n", "
\n", " \n", " nullable\n", @@ -9808,7 +6631,7 @@ " \n", "
\n", "
last position seen
\n", - "
8 (string)
\n", + "
3 (string)
\n", "
\n", " \n", " nullable\n", @@ -9818,7 +6641,7 @@ " \n", "
\n", "
date arrival
\n", - "
9 (string)
\n", + "
4 (string)
\n", "
\n", " \n", " nullable\n", @@ -9828,17 +6651,7 @@ " \n", "
\n", "
last date seen
\n", - "
10 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
attributes
\n", - "
11 (array<float>)
\n", + "
5 (string)
\n", "
\n", " \n", " nullable\n", @@ -9846,123 +6659,121 @@ "
\n", "
\n", - "
Date Type
\n", - "
12 (date)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
timestamp
\n", - "
13 (timestamp)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " Optim'us\n", + " \n", "
\n", - " \n", + "
\n", - "
Cybertronian
\n", - "
14 (boolean)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " Leader\n", + " \n", "
\n", - " \n", + "
\n", - "
function(binary)
\n", - "
15 (binary)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 19.442735,-99.201111\n", + " \n", "
\n", - " \n", + "
\n", - "
NullType
\n", - "
16 (null)
\n", - "
\n", + "
\n", + "
\n", " \n", - " nullable\n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", " \n", + " 2016/09/10\n", + " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", " \n", - " Optim'us\n", + " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -28\n", + " Espionage\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Leader\n", + " 10.642707,-71.612534\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 10\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5000000\n", + " 2015/08/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 4.300000190734863\n", + " ironhide&\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ['Inochi',⋅'Convoy']\n", + " Security\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 19.442735,-99.201111\n", + " 37.789563,-122.400356\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2016/09/10\n", + " 2014/07/10\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " [8.53439998626709,⋅4300.0]\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " 2016-09-10\n", + " Jazz\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2014-06-24⋅00:00:00\n", + " First⋅Lieutenant\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " True\n", + " 33.670666,-117.841553\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " bytearray(b'Leader')\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 2013/06/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Metroplex_)^$\n", + " Megatron\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 300\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Battle⋅Station\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5000000\n", + " 2012/05/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " Metroplex_)^$\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ['Metroflex']\n", + " Battle⋅Station\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " [91.44000244140625,⋅None]\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " 2011-04-10\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2014-06-24⋅00:00:00\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " True\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " bytearray(b'Battle⋅Station')\n", + " None\n", " \n", "
\n", "
\n", "\n", "\n", - "
Viewing 2 of 2 rows / 16 columns
\n", + "
Viewing 7 of 7 rows / 5 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ @@ -10181,21 +6988,734 @@ } ], "source": [ - "t.create(None, \"outliers.tukey\", None, \"df\",\"select\", numeric_col)" + "t.create(None, \"cols.select_by_dtypes\", \"str\", \"df\", None, \"str\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.select_by_dtypes\", \"int\", \"df\", \"int\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.select_by_dtypes\", \"float\", \"df\", \"float\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.select_by_dtypes\", \"array\", \"df\", \"array\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.names\", None, \"json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.qcut\", None, \"df\", numeric_col_B, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.qcut\", \"all_columns\", \"df\", \"*\", 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.clip\", None, \"df\", numeric_col_B, 3, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.clip\", \"all_columns\", \"df\", \"*\", 3, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.replace\", \"full\", \"df\", None,string_col,[\"First Lieutenant\",\"Battle\"], \"Match\", search_by=\"full\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.replace\", \"words\", \"df\", None,string_col,[\"Security\", \"Leader\"], \"Match\", search_by=\"words\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.replace\", \"chars\", \"df\", None,string_col,[\"F\", \"E\"], \"Match\", search_by=\"chars\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.replace\", \"numeric\", \"df\", None,\"age\",5000000, 5, search_by=\"numeric\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Assert is failing I can see why\n", + "t.create(None, \"cols.replace\", \"all_columns\", \"df\", None,\"*\", [\"Jazz\", \"Leader\"], \"Match\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Its necesary to save the function \n", + "t.delete(None, \"cols.apply_expr\", None, \"df\", numeric_col_B, func)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Its necesary to save the function \n", + "t.delete(None, \"cols.apply_expr\", \"all_columns\", \"df\", [numeric_col_B,numeric_col_C], func)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.append\", \"number\", \"df\", new_col, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_col = op.create.df(\n", + " [\n", + " (\"new_col\", \"str\", True),\n", + " \n", + "\n", + " ],[\n", + " (\"q\"),(\"w\"), (\"e\"), (\"r\"),\n", + "\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.append\", \"dataframes\", \"df\", None, df_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "#t.create(None, \"cols.append\", \"advance\", \"df\", [(\"new_col_4\", \"test\"),\n", + " # (\"new_col_5\", df[numeric_col_B] * 2),\n", + " # (\"new_col_6\", [1, 2, 3])\n", + " # ])," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.rename\", None, \"df\", numeric_col_B, numeric_col_B + \"(old)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.rename\", \"list\", \"df\", [numeric_col, numeric_col + \"(tons)\", numeric_col_B, numeric_col_B + \"(old)\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.rename\", \"function\", \"df\", str.upper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.drop\", None, \"df\", numeric_col_B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.cast\", None, \"df\", string_col, \"string\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.cast\", \"all_columns\", \"df\", \"*\", \"string\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Problems with precision\n", + "t.delete(None, \"cols.cast\", \"vector\", \"df\", array_col, Vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.keep\", None, \"df\", numeric_col_B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.move\", \"after\", \"df\", numeric_col_B, \"after\", array_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.move\", \"before\", \"df\", numeric_col_B, \"before\", array_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.move\", \"beginning\", \"df\", numeric_col_B, \"beginning\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.move\", \"end\", \"df\", numeric_col_B, \"end\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.select\", None, \"df\", 0, numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.select\", \"regex\", \"df\", \"n.*\", regex=True)," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sort\", None, \"df\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sort\", \"desc\", \"df\", None,\"desc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.sort\", \"asc\", \"df\", None, \"asc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.fill_na\", None, \"df\", numeric_col, \"1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.fill_na\", \"array\", \"df\", None, \"japanese name\", [\"1\",\"2\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.fill_na\", \"bool\", \"df\", None, \"Cybertronian\", False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "t.create(None, \"cols.fill_na\", \"all_columns\", \"df\", [\"names\",\"height(ft)\", \"function\", \"rank\", \"age\"], \"2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.nest\", None, \"df\", None, [numeric_col, numeric_col_B], separator=\" \",output_col=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# t.create(None, \"cols.nest\", \"mix\", \"df\", [F.col(numeric_col_C), F.col(numeric_col_B)], \"E\", separator=\"--\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_na = source_df.cols.drop(\"NullType\").rows.drop_na(\"*\")\n", + "\n", + "t.create(df_na, \"cols.nest\", \"vector_all_columns\", \"df\", None,[numeric_col_C, numeric_col_B], shape=\"vector\", output_col=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(df_na, \"cols.nest\", \"vector\", \"df\", None, [numeric_col_C, numeric_col_B], shape=\"vector\",output_col=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.nest\", \"array\", \"df\", None, [numeric_col, numeric_col_B,numeric_col_C], shape=\"array\", output_col=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_count_by_dtypes() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_count_by_dtypes()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'names': {'null': 1, 'missing': 0, 'string': 6}, 'height(ft)': {'null': 2, 'missing': 0, 'int': 5}, 'function': {'null': 1, 'missing': 0, 'string': 6}, 'rank': {'null': 1, 'missing': 0, 'int': 6}, 'age': {'null': 1, 'missing': 0, 'int': 6}, 'weight(t)': {'null': 2, 'missing': 0, 'decimal': 5}, 'japanese name': {'null': 1, 'missing': 0, 'array': 6}, 'last position seen': {'null': 3, 'missing': 0, 'string': 4}, 'date arrival': {'null': 1, 'missing': 0, 'string': 6}, 'last date seen': {'null': 1, 'missing': 0, 'string': 6}, 'attributes': {'null': 1, 'missing': 0, 'array': 6}, 'Date Type': {'null': 1, 'missing': 0, 'date': 6}, 'timestamp': {'null': 1, 'missing': 0, 'date': 6}, 'Cybertronian': {'null': 1, 'missing': 0, 'boolean': 6}, 'function(binary)': {'null': 1, 'missing': 0, 'binary': 6}, 'NullType': {'null': 7, 'missing': 0}}\n" + ] + } + ], + "source": [ + "t.create(None, \"cols.count_by_dtypes\", None, \"dict\", None, \"*\", infer=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_count_by_dtypes_infer() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_count_by_dtypes_infer()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'names': {'string': 6, 'null': 1, 'int': 0, 'decimal': 0, 'boolean': 0, 'date': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1, 'int': 0, 'decimal': 0, 'boolean': 0, 'date': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'array': 3, 'date': 1, 'null': 3, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'date arrival': {'date': 6, 'null': 1, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'last date seen': {'date': 6, 'null': 1, 'int': 0, 'decimal': 0, 'string': 0, 'boolean': 0, 'array': 0, 'object': 0, 'gender': 0, 'ip': 0, 'url': 0, 'email': 0, 'credit_card_number': 0, 'zip_code': 0, 'missing': 0}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}\n" + ] + } + ], + "source": [ + "t.create(None, \"cols.count_by_dtypes\", \"infer\", \"dict\", None, \"*\", infer=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_cols.py\n", + "Done\n" + ] + } + ], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "dtypes_df = op.create.df(\n", + " [\n", + " (\"col 1\", \"str\", True),\n", + " (\"col 2\", \"str\", True),\n", + " (\"col 3\", \"int\", True),\n", + " \n", + " ],\n", + " [\n", + " (\"male\",\"male\",1),\n", + " (\"optimus\",\"bumblebee\",1),\n", + " (\"3\",\"4.1\",1),\n", + " (\"true\",\"False\",1),\n", + " (\"[1,2,3,4]\",\"(1,2,3,4)\",1),\n", + " (\"{1,2,3,4}\",\"{'key1' :1 , 'key2':2}\",1),\n", + " (\"1.1.1.1\",\"123.123.123.123\",1),\n", + " (\"http://hi-optimuse.com\",\"https://hi-bumblebee.com\",1),\n", + " (\"optimus@cybertron.com\",\"bumblebee@cybertron.com\",1),\n", + " (\"5123456789123456\",\"373655783158306\",1),\n", + " (\"11529\",\"30345\",1),\n", + " (\"04/10/1980\",\"04/10/1980\",1),\n", + " (\"null\",\"Null\",1),\n", + " (\"\",\"\",1),\n", + " (None,None,1) \n", + " \n", + " ], infer_schema=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(dtypes_df, \"cols.count_by_dtypes\", \"infer\", \"dict\", None, \"*\", infer=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(source_df, \"cols.count_by_dtypes\", None, \"dict\", None, \"*\", infer=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from datetime import date, datetime\n", + "\n", + "from pyspark.sql.types import *\n", + "\n", + "from optimus import Optimus\n", + "\n", + "mismatch_df = op.create.df(\n", + " [\n", + " (\"names\", \"str\", True),\n", + " (\"height(ft)\", \"int\", True),\n", + " (\"function\", \"str\", True),\n", + " (\"rank\", \"int\", True),\n", + " (\"age\", \"int\", True),\n", + " (\"weight(t)\", \"float\", True),\n", + " (\"japanese name\", ArrayType(StringType()), True),\n", + " (\"last position seen\", \"str\", True),\n", + " (\"date arrival\", \"str\", True),\n", + " (\"last date seen\", \"str\", True),\n", + " (\"attributes\", ArrayType(FloatType()), True),\n", + " (\"DateType\", DateType()),\n", + " (\"Timestamp\", TimestampType()),\n", + " (\"Cybertronian\", \"bool\", True),\n", + " (\"function(binary)\", \"binary\", False),\n", + " (\"NullType\", \"null\", True),\n", + "\n", + " ],\n", + " [\n", + " (\"31/12/2019\", 28, \"1978-12-20\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", + " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", + " None),\n", + " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", + " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", + " None),\n", + " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", + " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", + " None),\n", + " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", + " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", + " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", + " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", + " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", + " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", + " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", + " (\"1\", 2, \"3\", 4, 5, 6.0, [\"7\"], 8, \"1980/04/10\", \"2011/04/10\",\n", + " [11.0], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"15\", \"utf-8\"), None)\n", + " ], infer_schema=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "mismatch = {\"names\":\"dd/mm/yyyy\",\"height(ft)\":r'^([0-2][0-9]|(3)[0-1])(\\/)(((0)[0-9])|((1)[0-2]))(\\/)\\d{4}$',\"function\":\"yyyy-mm-dd\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "m = {\"names\":\"int\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 69, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:percentile() executed in 2.62 sec\n" - ] - }, { "data": { "text/html": [ @@ -10204,7 +7724,7 @@ "\n", "\n", "\n", - "
Viewing 3 of 3 rows / 16 columns
\n", + "
Viewing 7 of 7 rows / 16 columns
\n", "
1 partition(s)
\n", "\n", "\n", @@ -10223,7 +7743,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", @@ -10736,304 +8388,72 @@ " 2013-06-24\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", "
height(ft)
\n", - "
2 (smallint)
\n", + "
2 (int)
\n", "
\n", " \n", " nullable\n", @@ -10243,7 +7763,7 @@ " \n", "
\n", "
rank
\n", - "
4 (tinyint)
\n", + "
4 (int)
\n", "
\n", " \n", " nullable\n", @@ -10322,7 +7842,7 @@ "
\n", - "
Date Type
\n", + "
DateType
\n", "
12 (date)
\n", "
\n", " \n", @@ -10332,7 +7852,7 @@ "
\n", - "
timestamp
\n", + "
Timestamp
\n", "
13 (timestamp)
\n", "
\n", " \n", @@ -10356,25 +7876,157 @@ "
15 (binary)
\n", "
\n", " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", " nullable\n", " \n", "
\n", - "
\n", + "
\n", + " \n", + " 31/12/2019\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1978-12-20\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", - "
NullType
\n", - "
16 (null)
\n", - "
\n", + "
\n", + "
\n", " \n", - " nullable\n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", " \n", + " None\n", + " \n", "
\n", - " \n", + "
\n", - "
\n", - " \n", - " 2014-06-24⋅00:00:00\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " True\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " bytearray(b'First⋅Lieutenant')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 3 of 3 rows / 16 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "source_df.outliers.tukey(numeric_col).drop().table()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_outliers_tukey_drop() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_tukey_drop()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\"]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:percentile() executed in 2.64 sec\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 3 of 3 rows / 16 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11047,25 +8467,25 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11079,25 +8499,25 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11119,9 +8539,9 @@ " \n", " \n", " \n", @@ -11139,33 +8559,33 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11179,25 +8599,25 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11211,25 +8631,25 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11251,9 +8671,9 @@ " \n", " \n", " \n", @@ -11271,65 +8691,65 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11343,25 +8763,25 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -11382,247 +8802,218 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
names
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
height(ft)
\n", - "
2 (smallint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
function
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
rank
\n", - "
4 (tinyint)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
age
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
weight(t)
\n", - "
6 (float)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
japanese name
\n", - "
7 (array<string>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last position seen
\n", - "
8 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
date arrival
\n", - "
9 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
last date seen
\n", - "
10 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
attributes
\n", - "
11 (array<float>)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Date Type
\n", - "
12 (date)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
timestamp
\n", - "
13 (timestamp)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Cybertronian
\n", - "
14 (boolean)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 2014-06-24⋅00:00:00\n", + " \n", "
\n", - " \n", + "
\n", - "
function(binary)
\n", - "
15 (binary)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " True\n", + " \n", "
\n", - " \n", + "
\n", - "
NullType
\n", - "
16 (null)
\n", - "
\n", + "
\n", + "
\n", " \n", - " nullable\n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", " \n", + " None\n", + " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", " \n", - " bumbl#ebéé⋅⋅\n", + " Megatron\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 17\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Espionage\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 7\n", + " 10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2.0\n", + " 5.699999809265137\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ['Bumble',⋅'Goldback']\n", + " ['Megatron']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 10.642707,-71.612534\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2015/08/10\n", + " 2012/05/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " [5.334000110626221,⋅2000.0]\n", + " [None,⋅5700.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2015-08-10\n", + " 2012-05-10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " bytearray(b'Espionage')\n", + " bytearray(b'None')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ironhide&\n", + " Metroplex_)^$\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 26\n", + " 300\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Security\n", + " Battle⋅Station\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 7\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 4.0\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ['Roadbuster']\n", + " ['Metroflex']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 37.789563,-122.400356\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2014/07/10\n", + " 2011/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " [7.924799919128418,⋅4000.0]\n", + " [91.44000244140625,⋅None]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2014-06-24\n", + " 2011-04-10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " bytearray(b'Security')\n", + " bytearray(b'Battle⋅Station')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Jazz\n", + " 1\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 13\n", + " 2\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " First⋅Lieutenant\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 4\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5000000\n", + " 5\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1.7999999523162842\n", + " 6.0\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ['Meister']\n", + " ['7']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 33.670666,-117.841553\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2013/06/10\n", + " 2011/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " [3.962399959564209,⋅1800.0]\n", + " [11.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2013-06-24\n", + " 2011-04-10\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " bytearray(b'First⋅Lieutenant')\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 3 of 3 rows / 16 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"df\",\"drop\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_outliers_tukey_whiskers() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_tukey_whiskers()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\"]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:percentile() executed in 2.65 sec\n", - "INFO:optimus:percentile() executed in 2.64 sec\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'lower_bound': -6.5, 'upper_bound': 45.5, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}\n" - ] + " \n", + "
\n", + " \n", + " bytearray(b'15')\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"whiskers\", numeric_col)" + "mismatch_df.table()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_outliers_tukey_count() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_tukey_count()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\"]\n" + "Creating test_cols_count_mismatch() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:percentile() executed in 2.61 sec\n" + "INFO:optimus:test_cols_count_mismatch()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "2\n" + "['{\"names\": \"int\"}']\n", + "{'names': {'mismatch': 6, 'int': 1, 'null': 0, 'missing': 0}}\n" ] } ], "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"count\", numeric_col)" + "t.create(mismatch_df, \"cols.count_mismatch\", None, \"dict\", None, {\"names\":\"int\"})" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_outliers_tukey_non_outliers_count() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_tukey_non_outliers_count()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\"]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:percentile() executed in 2.59 sec\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3\n" + "Creating file ../test_df_cols.py\n", + "Done\n" ] } ], "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"non_outliers_count\", numeric_col)" + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unnest String" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_outliers_tukey_info() test function...\n" + "Creating test_cols_unnest_string_multi_index() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_outliers_tukey_info()\n" + "INFO:optimus:test_cols_unnest_string_multi_index()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[\"'height(ft)'\"]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:percentile() executed in 2.83 sec\n" + "[\"'date arrival'\", \"'/'\"]\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'count_outliers': 2, 'count_non_outliers': 3, 'lower_bound': -6.5, 'lower_bound_count': 1, 'upper_bound': 45.5, 'upper_bound_count': 1, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}\n" + "ename": "ValueError", + "evalue": "'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"cols.unnest\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"string_multi_index\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate_col\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"/\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplits\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\test.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, obj, method, suffix, output, additional_method, *args, **kwargs)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[0mdf_func\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_func\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 218\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 219\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 220\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[1;31m# Additional Methods\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 50\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlog_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36munnest\u001b[1;34m(input_cols, separator, splits, index, output_cols)\u001b[0m\n\u001b[0;32m 1571\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1572\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1573\u001b[1;33m \u001b[0minput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1574\u001b[0m \u001b[0moutput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_output_cols\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1575\u001b[0m \u001b[0mfinal_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mparse_columns\u001b[1;34m(df, cols_args, get_args, is_regex, filter_by_column_dtypes, accepts_missing_cols, invert)\u001b[0m\n\u001b[0;32m 140\u001b[0m \u001b[1;31m# Check for missing columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccepts_missing_cols\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 142\u001b[1;33m \u001b[0mcheck_for_missing_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 143\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 144\u001b[0m \u001b[1;31m# Filter by column data type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mcheck_for_missing_columns\u001b[1;34m(df, col_names)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 243\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 244\u001b[1;33m \u001b[0mRaiseIt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 245\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 246\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\raiseit.py\u001b[0m in \u001b[0;36mvalue_error\u001b[1;34m(var, data_values)\u001b[0m\n\u001b[0;32m 76\u001b[0m type=divisor.join(map(\n\u001b[0;32m 77\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"'\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"'\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 78\u001b[1;33m data_values)), var_type=one_list_to_val(var)))\n\u001b[0m\u001b[0;32m 79\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: 'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'" ] } ], "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"info\", numeric_col)" + "t.create(None, \"cols.unnest\", \"string_multi_index\", \"df\", None, date_col, \"/\", splits=3, index=2)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_outliers.py\n", - "Done\n" - ] - } - ], + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"string_multi_index\", \"df\", None, date_col, \"/\", splits=3, index=[1,2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"string_infer_split\", \"df\", None, date_col, \"/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"string_no_index\", \"df\", None, date_col, \"/\", splits=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"string_output_columns\", \"df\", None, date_col, \"/\", splits=3, output_cols= [(\"year\", \"month\",\"day\")])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"array_index\", \"df\", None, array_col, index=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"cols.unnest\", \"array_multi_index\", \"df\", None, array_col, index=[1,2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "t.run()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Zscore" + "t.create(None, \"cols.unnest\", \"string_multi_colum_multi_index_multi_output\", \"df\", None, [\"date arrival\",\"last date seen\"], \"/\", index=[(1,2),(1,2)], output_cols=[(\"year1\",\"month1\"),(\"year2\",\"month2\")])" ] }, { @@ -11631,7 +9022,7 @@ "metadata": {}, "outputs": [], "source": [ - "threshold = 0.5" + "t.create(None, \"cols.unnest\", \"string_multi_colum_multi_output\", \"df\", None, [\"date arrival\",\"last date seen\"], \"/\", output_cols=[(\"year1\",\"month1\"),(\"year2\",\"month2\")])" ] }, { @@ -11640,7 +9031,7 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.z_score\", None, \"df\",\"select\", numeric_col, threshold)" + "t.create(None, \"cols.unnest\", \"array\", \"df\", array_col)" ] }, { @@ -11649,7 +9040,7 @@ "metadata": {}, "outputs": [], "source": [ - "source_df.outliers.z_score('height(ft)',0.5).select()" + "t.create(None, \"cols.unnest\", \"array_all_columns\", \"df\", array_col)" ] }, { @@ -11658,7 +9049,7 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.z_score\", None, \"df\",\"drop\", numeric_col, threshold)" + "t.create(None, \"cols.is_na\", \"all_columns\", \"df\", \"*\")" ] }, { @@ -11667,7 +9058,7 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.z_score\", None, \"json\", \"count\", numeric_col, threshold)" + "t.create(None, \"cols.is_na\", None, \"df\", numeric_col)" ] }, { @@ -11676,7 +9067,47 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.z_score\", None, \"json\", \"non_outliers_count\", numeric_col, threshold)" + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "from optimus import Optimus\n", + "from optimus.helpers.json import json_enconding\n", + "from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\n", + "import numpy as np\n", + "nan = np.nan\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_end_of_cell_marker": 2 + }, + "outputs": [], + "source": [ + "actual_df =op.load.json('https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json')\n", + "expected_df = op.create.df([('billingId', LongType(), True),('birth', StringType(), True),('dummyCol', StringType(), True),('firstName', StringType(), True),('id', LongType(), True),('lastName', StringType(), True),('price', LongType(), True),('product', StringType(), True)], [(123, '1980/07/07', 'never', 'Luis', 1, 'Alvarez$$%!', 10, 'Cake')])\n", + "\n", + "# assert (expected_df.collect() == actual_df.collect())\n", + "\n", + "from deepdiff import DeepDiff # For Deep Difference of 2 objects\n", + "\n", + "actual_df.table()\n", + "expected_df.table()\n", + "\n", + "# source_df.table()\n", + "# print(actual_df.to_json())\n", + "# print(expected_df.to_json())\n", + "a1 = actual_df.to_json()\n", + "e1 = expected_df.to_json()" ] }, { @@ -11685,7 +9116,8 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.z_score\", None, \"json\", \"info\", numeric_col, threshold)" + "ddiff = DeepDiff(a1, e1, ignore_order=False)\n", + "print(ddiff)" ] }, { @@ -11693,15 +9125,13 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "t.run()" - ] + "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Modified Zscore" + "# Rows Test" ] }, { @@ -11710,8 +9140,12 @@ "metadata": {}, "outputs": [], "source": [ - "threshold = 0.5\n", - "relative_error = 10000" + "t = Test(op,df, \"df_rows\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\",\n", + " \"from optimus.functions import abstract_udf as audf\"])" ] }, { @@ -11720,7 +9154,11 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.modified_z_score\", None, \"df\",\"select\", numeric_col, threshold, relative_error)" + "rows = [\n", + " (\"Optim'us\", 28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", + " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", + " None)\n", + "]" ] }, { @@ -11729,7 +9167,7 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.modified_z_score\", None, \"df\",\"drop\", numeric_col, threshold, relative_error)" + "df = op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" ] }, { @@ -11738,7 +9176,11 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"count\", numeric_col, threshold, relative_error)" + "t = Test(op, source_df, \"op_io\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\"],path = \"op_io\", final_path=\"..\")" ] }, { @@ -11747,7 +9189,10 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"non_outliers_count\", numeric_col, threshold, relative_error)" + "t.create(op, \"load.csv\", \"local_csv\", \"df\", \"../../examples/data/foo.csv\")\n", + "t.create(op, \"load.json\", \"local_json\", \"df\", \"../../examples/data/foo.json\")\n", + "t.create(op, \"load.parquet\", \"local_parquet\", \"df\", \"../../examples/data/foo.parquet\")\n", + "t.create(op, \"load.csv\", \"remote_csv\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" ] }, { @@ -11756,7 +9201,7 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"info\", numeric_col, threshold, relative_error)" + "t.create(op, \"load.json\", \"remote_json\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json\")" ] }, { @@ -11765,25 +9210,15 @@ "metadata": {}, "outputs": [], "source": [ - "t.run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Mad" + "t.create(op, \"load.parquet\", \"remote_parquet\", \"df\", \"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet\")" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "threshold = 0.5\n", - "relative_error = 10000" - ] + "source": [] }, { "cell_type": "code", @@ -11791,7 +9226,10 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.mad\", None, \"df\",\"select\", numeric_col, threshold, relative_error)" + "from optimus.profiler.profiler import Profiler\n", + "p = Profiler()\n", + "\n", + "print(p.run(source_df1, \"japanese name\"))" ] }, { @@ -11800,7 +9238,8 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.mad\", None, \"df\",\"drop\", numeric_col, threshold, relative_error)" + "# df_string = source_df.cols.cast(\"*\",\"str\")\n", + "t.create(source_df, \"save.csv\", None, None, \"test.csv\")" ] }, { @@ -11809,384 +9248,3526 @@ "metadata": {}, "outputs": [], "source": [ - "t.create(None, \"outliers.mad\", None, \"json\",\"count\", numeric_col, threshold, relative_error)" + "t.create(None, \"save.json\", None, None, \"test.json\")" ] }, { "cell_type": "code", - "execution_count": 93, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_outliers_mad_non_outliers_count() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_mad_non_outliers_count()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\", '0.5', '10000']\n", - "2\n" - ] - } - ], + "outputs": [], "source": [ - "t.create(None, \"outliers.mad\", None, \"json\",\"non_outliers_count\", numeric_col, threshold, relative_error)" + "t.create(None, \"save.parquet\", None, None, \"test.parquet\")" ] }, { "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_outliers_mad_info() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_outliers_mad_info()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'height(ft)'\", '0.5', '10000']\n", - "{'count_outliers': 3, 'count_non_outliers': 2, 'lower_bound': 12.5, 'lower_bound_count': 1, 'upper_bound': 21.5, 'upper_bound_count': 2}\n" - ] - } - ], + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], "source": [ - "t.create(None, \"outliers.mad\", None, \"json\",\"info\", numeric_col, threshold, relative_error)" + "t.run()" ] }, { "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_outliers.py\n", - "Done\n" - ] - } - ], + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], "source": [ - "t.run()" + "source_df.table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Keycolision" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df = op.read.csv(\"../../examples/data/random.csv\",header=True, sep=\";\").limit(10)" + "# Ouliers" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from datetime import date, datetime\n", + "\n", + "\n", + "cols = [\n", + " (\"names\", \"str\"),\n", + " (\"height(ft)\", ShortType()),\n", + " (\"function\", \"str\"),\n", + " (\"rank\", ByteType()),\n", + " (\"age\", \"int\"),\n", + " (\"weight(t)\", \"float\"),\n", + " \"japanese name\",\n", + " \"last position seen\",\n", + " \"date arrival\",\n", + " \"last date seen\",\n", + " (\"attributes\", ArrayType(FloatType())),\n", + " (\"Date Type\", DateType()),\n", + " (\"timestamp\", TimestampType()),\n", + " (\"Cybertronian\", BooleanType()),\n", + " (\"function(binary)\", BinaryType()),\n", + " (\"NullType\", NullType())\n", + "\n", + " ]\n", + "\n", + "rows = [\n", + " (\"Optim'us\", -28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", + " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", + " None),\n", + " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", + " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", + " None),\n", + " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", + " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", + " None),\n", + " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", + " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", + " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", + " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", + " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", + " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", + " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", + " (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n", + "\n", + " ]\n", + "source_df = op.create.df(cols ,rows)\n", "source_df.table()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "t = Test(op, source_df, \"df_keycollision\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + "t = Test(op, source_df, \"df_outliers\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", " \"import numpy as np\",\n", " \"nan = np.nan\",\n", " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\",\n", - " \"from optimus.ml import keycollision as keyCol\"], \n", - " path = \"df_keycollision\", final_path=\"..\")\n", - "\n", - "from optimus.ml import keycollision as keyCol" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.create(keyCol, \"fingerprint\", None, \"df\",None, source_df, \"STATE\")\n", - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.create(keyCol, \"fingerprint_cluster\", None, \"json\", None, source_df, \"STATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.create(keyCol, \"n_gram_fingerprint\", None, \"df\", None, source_df, \"STATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.create(keyCol, \"n_gram_fingerprint_cluster\", None, \"json\", None, source_df, \"STATE\", 2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Distance cluster" + " \"from pyspark.sql import functions as F\"], path = \"df_outliers\", final_path=\"..\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "source_df = op.read.csv(\"../../examples/data/random.csv\",header=True, sep=\";\").limit(1000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t = Test(op, source_df, \"df_distance_cluster\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\",\n", - " \"from optimus.ml import distancecluster as dc\"], path = \"df_distance_cluster\", final_path=\"..\")\n", + "from pyspark.sql import functions as F\n", "\n", - "from optimus.ml import distancecluster as dc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "inputHidden": false, - "outputHidden": false - }, - "outputs": [], - "source": [ - "t.create(dc, \"levenshtein_cluster\", None, 'dict', None, source_df, \"STATE\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_cancer = op.spark.read.csv('../data_cancer.csv', sep=',', header=True, inferSchema=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',\n", - " 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',\n", - " 'fractal_dimension_mean']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_model, rf_model = op.ml.gbt(df_cancer, columns, \"diagnosis\")" + "\n", + "def func(col_name, attrs):\n", + " return F.col(col_name) * 2\n", + "\n", + "numeric_col = \"height(ft)\"\n", + "numeric_col_B = \"rank\"\n", + "numeric_col_C = \"rank\"\n", + "string_col = \"function\"\n", + "date_col = \"date arrival\"\n", + "date_col_B = \"last date seen\"\n", + "new_col = \"new col\"\n", + "array_col = \"attributes\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "df_model.table()" + "source_df.table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Row" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "source_df = op.create.df([\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", True),\n", - " (\"animals\", \"str\", True),\n", - " (\"thing\", StringType(), True),\n", - " (\"second\", \"int\", True),\n", - " (\"filter\", StringType(), True)\n", - "],\n", - " [\n", - " (\" I like fish \", 1, \"dog dog\", \"housé\", 5, \"a\"),\n", - " (\" zombies\", 2, \"cat\", \"tv\", 6, \"b\"),\n", - " (\"simpsons cat lady\", 2, \"frog\", \"table\", 7, \"1\"),\n", - " (None, 3, \"eagle\", \"glass\", 8, \"c\"),\n", - " ])" + "## Tukey" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_select() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_select()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 3.34 sec\n", + "INFO:optimus:percentile() executed in 2.68 sec\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 2 of 2 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 2 of 2 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "from optimus.audf import abstract_udf as audf\n", - "t = Test(op, source_df, \"df_rows\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"from optimus.audf import abstract_udf as audf\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\"], path = \"df_rows\", final_path=\"..\")" + "t.create(None, \"outliers.tukey\", None, \"df\",\"select\", numeric_col)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.62 sec\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 3 of 3 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 3 of 3 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "row =[(\"this is a word\", 2, \"this is an animal\",\n", - " \"this is a thing\", 64, \"this is a filter\",)]" + "source_df.outliers.tukey(numeric_col).drop().table()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_append() test function...\n" + "Creating test_outliers_tukey_drop() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_append()\n" + "INFO:optimus:test_outliers_tukey_drop()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[\"[('this is a word', 2, 'this is an animal', 'this is a thing', 64, 'this is a filter')]\"]\n" + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.64 sec\n" ] }, { @@ -12197,16 +12778,76 @@ "\n", "\n", "\n", - "
Viewing 5 of 5 rows / 6 columns
\n", - "
2 partition(s)
\n", + "
Viewing 3 of 3 rows / 16 columns
\n", + "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", @@ -12324,205 +13085,261 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -12533,8 +13350,8 @@ "
\n", - "
words
\n", - "
1 (string)
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", "
\n", " \n", " nullable\n", @@ -12215,8 +12856,8 @@ "
\n", - "
num
\n", - "
2 (int)
\n", + "
last position seen
\n", + "
8 (string)
\n", "
\n", " \n", " nullable\n", @@ -12225,8 +12866,8 @@ "
\n", - "
animals
\n", - "
3 (string)
\n", + "
date arrival
\n", + "
9 (string)
\n", "
\n", " \n", " nullable\n", @@ -12235,8 +12876,8 @@ "
\n", - "
thing
\n", - "
4 (string)
\n", + "
last date seen
\n", + "
10 (string)
\n", "
\n", " \n", " nullable\n", @@ -12245,8 +12886,8 @@ "
\n", - "
second
\n", - "
5 (int)
\n", + "
attributes
\n", + "
11 (array<float>)
\n", "
\n", " \n", " nullable\n", @@ -12255,8 +12896,48 @@ "
\n", - "
filter
\n", - "
6 (string)
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", "
\n", " \n", " nullable\n", @@ -12272,49 +12953,129 @@ "
\n", - "
\n", + "
\n", " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1\n", + " 17\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " dog⋅dog\n", + " Espionage\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " housé\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " a\n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ⋅⋅⋅⋅zombies\n", + " ironhide&\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2\n", + " 26\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " cat\n", + " Security\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " tv\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 6\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", " \n", - " b\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " simpsons⋅⋅⋅cat⋅lady\n", + " bytearray(b'Security')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " frog\n", + " Jazz\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " table\n", + " 13\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 7\n", + " First⋅Lieutenant\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3\n", + " 1.7999999523162842\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " eagle\n", + " ['Meister']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " glass\n", + " 33.670666,-117.841553\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " c\n", + " 2013/06/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " this⋅is⋅a⋅word\n", + " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2\n", + " 2013-06-24\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " this⋅is⋅an⋅animal\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " this⋅is⋅a⋅thing\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 64\n", + " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " this⋅is⋅a⋅filter\n", + " None\n", " \n", "
\n", "
\n", "\n", "\n", - "
Viewing 5 of 5 rows / 6 columns
\n", - "
2 partition(s)
\n" + "
Viewing 3 of 3 rows / 16 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -12545,56 +13362,801 @@ } ], "source": [ - "t.create(None, \"rows.append\", None, \"df\", None, row)" + "t.create(None, \"outliers.tukey\", None, \"df\",\"drop\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_whiskers() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_whiskers()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.65 sec\n", + "INFO:optimus:percentile() executed in 2.64 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'lower_bound': -6.5, 'upper_bound': 45.5, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"whiskers\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.61 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"count\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_non_outliers_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_non_outliers_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.59 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"non_outliers_count\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_info() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_info()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.83 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'count_outliers': 2, 'count_non_outliers': 3, 'lower_bound': -6.5, 'lower_bound_count': 1, 'upper_bound': 45.5, 'upper_bound_count': 1, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"info\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_outliers.py\n", + "Done\n" + ] + } + ], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zscore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"df\",\"select\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.outliers.z_score('height(ft)',0.5).select()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"df\",\"drop\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"json\", \"count\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"json\", \"non_outliers_count\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"json\", \"info\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modified Zscore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "relative_error = 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.modified_z_score\", None, \"df\",\"select\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.modified_z_score\", None, \"df\",\"drop\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"count\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"non_outliers_count\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.modified_z_score\", None, \"json\",\"info\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mad" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "relative_error = 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.mad\", None, \"df\",\"select\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.mad\", None, \"df\",\"drop\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.mad\", None, \"json\",\"count\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_mad_non_outliers_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_mad_non_outliers_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\", '0.5', '10000']\n", + "2\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.mad\", None, \"json\",\"non_outliers_count\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_mad_info() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_mad_info()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\", '0.5', '10000']\n", + "{'count_outliers': 3, 'count_non_outliers': 2, 'lower_bound': 12.5, 'lower_bound_count': 1, 'upper_bound': 21.5, 'upper_bound_count': 2}\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.mad\", None, \"json\",\"info\", numeric_col, threshold, relative_error)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_outliers.py\n", + "Done\n" + ] + } + ], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keycolision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df = op.read.csv(\"../../examples/data/random.csv\",header=True, sep=\";\").limit(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t = Test(op, source_df, \"df_keycollision\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\",\n", + " \"from optimus.ml import keycollision as keyCol\"], \n", + " path = \"df_keycollision\", final_path=\"..\")\n", + "\n", + "from optimus.ml import keycollision as keyCol" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.create(keyCol, \"fingerprint\", None, \"df\",None, source_df, \"STATE\")\n", + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.create(keyCol, \"fingerprint_cluster\", None, \"json\", None, source_df, \"STATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.create(keyCol, \"n_gram_fingerprint\", None, \"df\", None, source_df, \"STATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.create(keyCol, \"n_gram_fingerprint_cluster\", None, \"json\", None, source_df, \"STATE\", 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distance cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df = op.read.csv(\"../../examples/data/random.csv\",header=True, sep=\";\").limit(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t = Test(op, source_df, \"df_distance_cluster\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\",\n", + " \"from optimus.ml import distancecluster as dc\"], path = \"df_distance_cluster\", final_path=\"..\")\n", + "\n", + "from optimus.ml import distancecluster as dc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "inputHidden": false, + "outputHidden": false + }, + "outputs": [], + "source": [ + "t.create(dc, \"levenshtein_cluster\", None, 'dict', None, source_df, \"STATE\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_cancer = op.spark.read.csv('../data_cancer.csv', sep=',', header=True, inferSchema=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',\n", + " 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',\n", + " 'fractal_dimension_mean']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_model, rf_model = op.ml.gbt(df_cancer, columns, \"diagnosis\")" ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "fil = (source_df[\"num\"] == 1)" + "df_model.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Row" ] }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 2, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "delete() takes from 2 to 3 positional arguments but 7 were given", + "ename": "NameError", + "evalue": "name 'op' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"rows.select\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfil\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m: delete() takes from 2 to 3 positional arguments but 7 were given" + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m source_df = op.create.df([\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"words\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"str\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"num\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"int\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"animals\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"str\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"thing\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mStringType\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'op' is not defined" ] } ], "source": [ - "t.create(None, \"rows.select\", None, \"df\", None, fil)" + "source_df = op.create.df([\n", + " (\"words\", \"str\", True),\n", + " (\"num\", \"int\", True),\n", + " (\"animals\", \"str\", True),\n", + " (\"thing\", StringType(), True),\n", + " (\"second\", \"int\", True),\n", + " (\"filter\", StringType(), True)\n", + "],\n", + " [\n", + " (\" I like fish \", 1, \"dog dog\", \"housé\", 5, \"a\"),\n", + " (\" zombies\", 2, \"cat\", \"tv\", 6, \"b\"),\n", + " (\"simpsons cat lady\", 2, \"frog\", \"table\", 7, \"1\"),\n", + " (None, 3, \"eagle\", \"glass\", 8, \"c\"),\n", + " ])" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'optimus'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0moptimus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maudf\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mabstract_udf\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0maudf\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m t = Test(op, source_df, \"df_rows\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n\u001b[0;32m 3\u001b[0m \u001b[1;34m\"import numpy as np\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;34m\"nan = np.nan\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;34m\"from optimus.audf import abstract_udf as audf\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'optimus'" + ] + } + ], + "source": [ + "from optimus.audf import abstract_udf as audf\n", + "t = Test(op, source_df, \"df_rows\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"from optimus.audf import abstract_udf as audf\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\"], path = \"df_rows\", final_path=\"..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "row =[(\"this is a word\", 2, \"this is an animal\",\n", + " \"this is a thing\", 64, \"this is a filter\",)]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_select_by_dtypes() test function...\n" + "Creating test_rows_append() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_select_by_dtypes()\n" + "INFO:optimus:test_rows_append()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"[('this is a word', 2, 'this is an animal', 'this is a thing', 64, 'this is a filter')]\"]\n" ] }, { @@ -12605,8 +14167,8 @@ "\n", "\n", "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n", + "
Viewing 5 of 5 rows / 6 columns
\n", + "
2 partition(s)
\n", "\n", "\n", " \n", @@ -12680,519 +14242,457 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
\n", - "
\n", + "
\n", " \n", - " simpsons⋅⋅⋅cat⋅lady\n", + " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2\n", + " 1\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " frog\n", + " dog⋅dog\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " table\n", + " housé\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 7\n", + " 5\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1\n", + " a\n", " \n", "
\n", "
\n", - "\n", - "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "t.create(None, \"rows.select_by_dtypes\", None, \"df\", None, \"filter\", \"integer\")" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Column\n" - ] - } - ], - "source": [ - "fil = (source_df[\"num\"] == 2) | (source_df[\"second\"] == 5)\n", - "print(str(fil))\n", - "# type(fil)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_rows_drop() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_rows_drop()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"Column\"]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
\n", - "
words
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " ⋅⋅⋅⋅zombies\n", + " \n", "
\n", - " \n", + "
\n", - "
num
\n", - "
2 (int)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 2\n", + " \n", "
\n", - " \n", + "
\n", - "
animals
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " cat\n", + " \n", "
\n", - " \n", + "
\n", - "
thing
\n", - "
4 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " tv\n", + " \n", "
\n", - " \n", + "
\n", - "
second
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 6\n", + " \n", "
\n", - " \n", + "
\n", - "
filter
\n", - "
6 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " b\n", + " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", " \n", - " None\n", + " simpsons⋅⋅⋅cat⋅lady\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3\n", + " 2\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " eagle\n", + " frog\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " glass\n", + " table\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " c\n", + " 1\n", " \n", "
\n", "
\n", - "\n", - "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "t.create(None, \"rows.drop\", None, \"df\", None, fil)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_rows_drop_by_dtypes() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_rows_drop_by_dtypes()\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 3 of 3 rows / 6 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + "
\n", - "
words
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " None\n", + " \n", "
\n", - " \n", + "
\n", - "
num
\n", - "
2 (int)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 3\n", + " \n", "
\n", - " \n", + "
\n", - "
animals
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " eagle\n", + " \n", "
\n", - " \n", + "
\n", - "
thing
\n", - "
4 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " glass\n", + " \n", "
\n", - " \n", + "
\n", - "
second
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " 8\n", + " \n", "
\n", - " \n", + "
\n", - "
filter
\n", - "
6 (string)
\n", - "
\n", - " \n", - " nullable\n", + "
\n", + "
\n", " \n", + " c\n", + " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " this⋅is⋅a⋅word\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1\n", + " 2\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " dog⋅dog\n", + " this⋅is⋅an⋅animal\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " housé\n", + " this⋅is⋅a⋅thing\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5\n", + " 64\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " a\n", + " this⋅is⋅a⋅filter\n", " \n", "
\n", "
\n", + "\n", + "\n", + "
Viewing 5 of 5 rows / 6 columns
\n", + "
2 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "t.create(None, \"rows.append\", None, \"df\", None, row)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "fil = (source_df[\"num\"] == 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "delete() takes from 2 to 3 positional arguments but 7 were given", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"rows.select\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfil\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m: delete() takes from 2 to 3 positional arguments but 7 were given" + ] + } + ], + "source": [ + "t.create(None, \"rows.select\", None, \"df\", None, fil)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_rows_select_by_dtypes() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_rows_select_by_dtypes()\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 1 of 1 rows / 6 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", " \n", + "\n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -13203,7 +14703,7 @@ "
\n", - "
\n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " ⋅⋅⋅⋅zombies\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " 2\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " cat\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " tv\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
second
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " 6\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " b\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", " \n", - " None\n", + " simpsons⋅⋅⋅cat⋅lady\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3\n", + " 2\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " eagle\n", + " frog\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " glass\n", + " table\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " c\n", + " 1\n", " \n", "
\n", "
\n", "\n", "\n", - "
Viewing 3 of 3 rows / 6 columns
\n", + "
Viewing 1 of 1 rows / 6 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ @@ -13215,45 +14715,59 @@ } ], "source": [ - "t.create(None, \"rows.drop_by_dtypes\", None, \"df\", None, \"filter\", \"integer\")" + "t.create(None, \"rows.select_by_dtypes\", None, \"df\", None, \"filter\", \"integer\")" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 113, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "INFO:optimus:Using 'pandas_udf' to process column 'num' with function func_data_type\n" + "Column\n" ] } ], "source": [ - "def func_data_type(value, attr):\n", - " return value > 1\n", - "a = audf(\"num\", func_data_type, \"boolean\")" + "fil = (source_df[\"num\"] == 2) | (source_df[\"second\"] == 5)\n", + "print(str(fil))\n", + "# type(fil)" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_drop_audf() test function...\n" + "Creating test_rows_drop() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_drop_audf()\n" + "INFO:optimus:test_rows_drop()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"Column\"]\n" ] }, { @@ -13339,49 +14853,49 @@ " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " None\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 1\n", + " 3\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " dog⋅dog\n", + " eagle\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " housé\n", + " glass\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 5\n", + " 8\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " a\n", + " c\n", " \n", "
\n", " \n", @@ -13404,26 +14918,26 @@ } ], "source": [ - "t.create(None, \"rows.drop\", \"audf\", \"df\", None, a)" + "t.create(None, \"rows.drop\", None, \"df\", None, fil)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_sort() test function...\n" + "Creating test_rows_drop_by_dtypes() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_sort()\n" + "INFO:optimus:test_rows_drop_by_dtypes()\n" ] }, { @@ -13434,8 +14948,8 @@ "\n", "\n", "\n", - "
Viewing 4 of 4 rows / 6 columns
\n", - "
4 partition(s)
\n", + "
Viewing 3 of 3 rows / 6 columns
\n", + "
1 partition(s)
\n", "\n", "\n", " \n", @@ -13509,49 +15023,49 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -13613,101 +15127,49 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -13718,8 +15180,8 @@ "
\n", - "
\n", + "
\n", " \n", - " None\n", + " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3\n", + " 1\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " eagle\n", + " dog⋅dog\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " glass\n", + " housé\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 5\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " c\n", + " a\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " simpsons⋅⋅⋅cat⋅lady\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " frog\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " table\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " dog⋅dog\n", + " eagle\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " housé\n", + " glass\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 5\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " a\n", + " c\n", " \n", "
\n", "
\n", "\n", "\n", - "
Viewing 4 of 4 rows / 6 columns
\n", - "
4 partition(s)
\n" + "
Viewing 3 of 3 rows / 6 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -13730,43 +15192,45 @@ } ], "source": [ - "t.create(None, \"rows.sort\", None, \"df\", None, \"num\", \"desc\")" + "t.create(None, \"rows.drop_by_dtypes\", None, \"df\", None, \"filter\", \"integer\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:Using 'pandas_udf' to process column 'num' with function func_data_type\n" + ] + } + ], "source": [ - "t.create(None, \"rows.is_in\", None, \"df\", None, \"num\", 2)" + "def func_data_type(value, attr):\n", + " return value > 1\n", + "a = audf(\"num\", func_data_type, \"boolean\")" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_between() test function...\n" + "Creating test_rows_drop_audf() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_between()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'second'\", '6', '8']\n", - "(6, 8)\n" + "INFO:optimus:test_rows_drop_audf()\n" ] }, { @@ -13852,49 +15316,49 @@ " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " simpsons⋅⋅⋅cat⋅lady\n", + " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 2\n", + " 1\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " frog\n", + " dog⋅dog\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " table\n", + " housé\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 7\n", + " 5\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 1\n", + " a\n", " \n", "
\n", " \n", @@ -13917,34 +15381,26 @@ } ], "source": [ - "t.create(None, \"rows.between\", None, \"df\", None, \"second\", 6, 8)" + "t.create(None, \"rows.drop\", \"audf\", \"df\", None, a)" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_between() test function...\n" + "Creating test_rows_sort() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_between()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"'second'\", '6', '8']\n", - "(6, 8)\n" + "INFO:optimus:test_rows_sort()\n" ] }, { @@ -13955,8 +15411,8 @@ "\n", "\n", "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n", + "
Viewing 4 of 4 rows / 6 columns
\n", + "
4 partition(s)
\n", "\n", "\n", " \n", @@ -14030,6 +15486,110 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " eagle\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " glass\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " c\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ⋅⋅⋅⋅zombies\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " cat\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " tv\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 6\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " b\n", + " \n", + "
\n", + "
\n", "
\n", " \n", " simpsons⋅⋅⋅cat⋅lady\n", @@ -14079,12 +15639,64 @@ " \n", "
\n", + "
\n", + " \n", + " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " dog⋅dog\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " housé\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " a\n", + " \n", + "
\n", + "
\n", "\n", "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n" + "
Viewing 4 of 4 rows / 6 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -14095,34 +15707,42 @@ } ], "source": [ - "t.create(None, \"rows.between\", None, \"df\", None, \"second\", 6, 8)" + "t.create(None, \"rows.sort\", None, \"df\", None, \"num\", \"desc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"rows.is_in\", None, \"df\", None, \"num\", 2)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_between_equal() test function...\n" + "Creating test_rows_between() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_between_equal()\n" + "INFO:optimus:test_rows_between()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[\"'second'\", '6', '8']\n", - "(6, 8)\n" + "Column 6) AND (second < 8))'>\n" ] }, { @@ -14133,7 +15753,7 @@ "\n", "\n", "\n", - "
Viewing 3 of 3 rows / 6 columns
\n", + "
Viewing 1 of 1 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", @@ -14208,58 +15828,6 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", "
\n", - "
\n", - " \n", - " ⋅⋅⋅⋅zombies\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " cat\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " tv\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 6\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " b\n", - " \n", - "
\n", - "
\n", "
\n", " \n", " simpsons⋅⋅⋅cat⋅lady\n", @@ -14309,63 +15877,11 @@ " \n", "
\n", - "
\n", - " \n", - " None\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 3\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " eagle\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " glass\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 8\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " c\n", - " \n", - "
\n", - "
\n", "\n", "\n", - "
Viewing 3 of 3 rows / 6 columns
\n", + "
Viewing 1 of 1 rows / 6 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ @@ -14377,33 +15893,33 @@ } ], "source": [ - "t.create(None, \"rows.between\", \"equal\", \"df\", None, \"second\", 6, 8, equal=True)" + "t.create(None, \"rows.between\", None, \"df\", None, \"second\", 6, 8)" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating test_rows_between_invert_equal() test function...\n" + "Creating test_rows_between_equal() test function...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:optimus:test_rows_between_invert_equal()\n" + "INFO:optimus:test_rows_between_equal()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Column= 8))'>\n" + "Column= 6) AND (second <= 8))'>\n" ] }, { @@ -14489,49 +16005,49 @@ " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", + " ⋅⋅⋅⋅zombies\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 1\n", + " 2\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " dog⋅dog\n", + " cat\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " housé\n", + " tv\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 5\n", + " 6\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " a\n", + " b\n", " \n", "
\n", " \n", @@ -14541,9 +16057,9 @@ " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " ⋅⋅⋅⋅zombies\n", + " simpsons⋅⋅⋅cat⋅lady\n", " \n", "
\n", " \n", @@ -14557,33 +16073,33 @@ " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " cat\n", + " frog\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " tv\n", + " table\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " 6\n", + " 7\n", " \n", "
\n", " \n", " \n", " \n", - "
\n", + "
\n", " \n", - " b\n", + " 1\n", " \n", "
\n", " \n", @@ -14658,12 +16174,12 @@ } ], "source": [ - "t.create(None, \"rows.between\", \"invert_equal\", \"df\", None, \"second\", 6, 8, invert=True, equal=True)" + "t.create(None, \"rows.between\", \"equal\", \"df\", None, \"second\", 6, 8, equal=True)" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -14684,7 +16200,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[\"'second'\", '[(6, 7),(7, 8)]']\n" + "Column= 8))'>\n" ] }, { @@ -14695,7 +16211,7 @@ "\n", "\n", "\n", - "
Viewing 4 of 4 rows / 6 columns
\n", + "
Viewing 3 of 3 rows / 6 columns
\n", "
1 partition(s)
\n", "\n", "\n", @@ -14874,58 +16390,6 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " \n", - " simpsons⋅⋅⋅cat⋅lady\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 2\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " frog\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " table\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 7\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1\n", - " \n", - "
\n", - "
\n", "
\n", " \n", " None\n", @@ -14979,7 +16443,7 @@ "
\n", "\n", "\n", - "
Viewing 4 of 4 rows / 6 columns
\n", + "
Viewing 3 of 3 rows / 6 columns
\n", "
1 partition(s)
\n" ], "text/plain": [ @@ -14991,23 +16455,23 @@ } ], "source": [ - "t.create(None, \"rows.between\", \"invert_equal\", \"df\", None, \"second\", [(6,7),(7,8)], invert=True, equal=True)" + "t.create(None, \"rows.between\", \"invert_equal\", \"df\", None, \"second\", 6, 8, invert=True, equal=True)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating file ../test_df_rows.py\n", - "Done\n" - ] - } - ], + "outputs": [], + "source": [ + "t.create(None, \"rows.between\", \"bounds\", \"df\", None, \"second\", bounds=[(6,7),(7,8)], invert=True, equal=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "t.run()" ] diff --git a/tests/creator/creator.py b/tests/creator/creator.py index 166e13f8..01afd347 100644 --- a/tests/creator/creator.py +++ b/tests/creator/creator.py @@ -190,6 +190,12 @@ def func(col_name, attrs): t.create(source_df_string_to_index, "cols.string_to_index", None, "df", None, "rank") +source_df_index_to_string = source_df_string_to_index.cols.string_to_index("rank") + +source_df_index_to_string.table() + +t.create(source_df_index_to_string, "cols.index_to_string", None, "df", None, "rank") + t.create(source_df_string_to_index, "cols.values_to_cols", None, "df", None, "rank") t.run() @@ -1064,6 +1070,8 @@ def func(col_name, attrs): t.create(None, "rows.select_by_dtypes", None, "df", None, "filter", "integer") + + fil = (source_df["num"] == 2) | (source_df["second"] == 5) print(str(fil)) # type(fil) @@ -1085,13 +1093,11 @@ def func_data_type(value, attr): t.create(None, "rows.between", None, "df", None, "second", 6, 8) -t.create(None, "rows.between", None, "df", None, "second", 6, 8) - t.create(None, "rows.between", "equal", "df", None, "second", 6, 8, equal=True) t.create(None, "rows.between", "invert_equal", "df", None, "second", 6, 8, invert=True, equal=True) -t.create(None, "rows.between", "invert_equal", "df", None, "second", [(6,7),(7,8)], invert=True, equal=True) +t.create(None, "rows.between", "bounds", "df", None, "second", bounds=[(6,7),(7,8)], invert=True, equal=True) t.run() diff --git a/tests/test_df_cols.py b/tests/test_df_cols.py index a3d99afa..a307b134 100644 --- a/tests/test_df_cols.py +++ b/tests/test_df_cols.py @@ -192,6 +192,12 @@ def test_cols_impute_all_columns(): expected_df = op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('function(binary)', BinaryType(), True),('NullType', NullType(), True)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Leader'), None), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Espionage'), None), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Security'), None), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'First Lieutenant'), None), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'None'), None), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Battle Station'), None), ('None', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)]) assert (expected_df.collect() == actual_df.collect()) @staticmethod + def test_cols_index_to_string(): + source_df=op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('rank***STRING_TO_INDEX', DoubleType(), False)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0)]) + actual_df =source_df.cols.index_to_string('rank***STRING_TO_INDEX') + expected_df = op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('rank***STRING_TO_INDEX', DoubleType(), False),('rank***STRING_TO_INDEX***INDEX_TO_STRING', StringType(), True)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0, '10'), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0, '7'), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0, '7'), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0, '8'), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0, '10'), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0, '8')]) + assert (expected_df.collect() == actual_df.collect()) + @staticmethod def test_cols_iqr(): actual_df =source_df.cols.iqr('height(ft)') actual_df =json_enconding(actual_df) @@ -588,7 +594,7 @@ def test_cols_std_all_columns(): def test_cols_string_to_index(): source_df=op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True)]) actual_df =source_df.cols.string_to_index('rank') - expected_df = op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('rank***INDEX_TO_STRING', DoubleType(), False)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0)]) + expected_df = op.create.df([('names', StringType(), True),('height(ft)', ShortType(), True),('function', StringType(), True),('rank', ByteType(), True),('age', IntegerType(), True),('weight(t)', FloatType(), True),('japanese name', ArrayType(StringType(),True), True),('last position seen', StringType(), True),('date arrival', StringType(), True),('last date seen', StringType(), True),('attributes', ArrayType(FloatType(),True), True),('Date Type', DateType(), True),('timestamp', TimestampType(), True),('Cybertronian', BooleanType(), True),('rank***STRING_TO_INDEX', DoubleType(), False)], [("Optim'us", -28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [5.334000110626221, 2000.0], datetime.date(2015, 8, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('ironhide&', 26, 'Security', 7, 5000000, 4.0, ['Roadbuster'], '37.789563,-122.400356', '1980/04/10', '2014/07/10', [7.924799919128418, 4000.0], datetime.date(2014, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 1.0), ('Jazz', 13, 'First Lieutenant', 8, 5000000, 1.7999999523162842, ['Meister'], '33.670666,-117.841553', '1980/04/10', '2013/06/10', [3.962399959564209, 1800.0], datetime.date(2013, 6, 24), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0), ('Megatron', None, 'None', 10, 5000000, 5.699999809265137, ['Megatron'], None, '1980/04/10', '2012/05/10', [None, 5700.0], datetime.date(2012, 5, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 2.0), ('Metroplex_)^$', 300, 'Battle Station', 8, 5000000, None, ['Metroflex'], None, '1980/04/10', '2011/04/10', [91.44000244140625, None], datetime.date(2011, 4, 10), datetime.datetime(2014, 6, 24, 0, 0), True, 0.0)]) assert (expected_df.collect() == actual_df.collect()) @staticmethod def test_cols_sub():