From c44bc3374330edda0df64c6c636256db8619fdb0 Mon Sep 17 00:00:00 2001 From: Argenis Leon Date: Fri, 6 Dec 2019 08:11:10 -0600 Subject: [PATCH] Fix profiler --- optimus/profiler/profiler.py | 2 +- tests/creator/creator-profiler.ipynb | 3125 ++++++++++++++++++++++++- tests/creator/creator-profiler.py | 6 + tests/creator/creator.ipynb | 3156 ++++++++++++++++++++++---- tests/creator/creator.py | 8 +- tests/test_rows.py | 182 -- 6 files changed, 5875 insertions(+), 604 deletions(-) delete mode 100644 tests/test_rows.py diff --git a/optimus/profiler/profiler.py b/optimus/profiler/profiler.py index 7d63e872..37a390d2 100644 --- a/optimus/profiler/profiler.py +++ b/optimus/profiler/profiler.py @@ -396,7 +396,7 @@ def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=REL logger.print("Processing Stats For columns...") # Get columns data types. This is necessary to make the pertinent histogram calculations. - count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch) + count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer) count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type) diff --git a/tests/creator/creator-profiler.ipynb b/tests/creator/creator-profiler.ipynb index 91b94560..492a4690 100644 --- a/tests/creator/creator-profiler.ipynb +++ b/tests/creator/creator-profiler.ipynb @@ -39,15 +39,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", - " from collections import Callable\n", - "../..\\optimus\\helpers\\functions.py:172: DeprecationWarning: invalid escape sequence \\d\n", - " pattern = '\\\"(\\d+\\.\\d+).*\\\"'\n", "\n", " You are using PySparkling of version 2.4.10, but your PySpark is of\n", - " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n", - "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n", - "invalid escape sequence \\d\n" + " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n" ] } ], @@ -72,7 +66,7 @@ "INFO:optimus:HADOOP_HOME=C:\\opt\\hadoop-2.7.7\n", "INFO:optimus:PYSPARK_PYTHON=C:\\Users\\argenisleon\\Anaconda3\\python.exe\n", "INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter\n", - "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n", + "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n", "INFO:optimus:JAVA_HOME=C:\\java\n", "INFO:optimus:Pyarrow Installed\n", "INFO:optimus:-----\n", @@ -86,7 +80,20 @@ " \\____/ .___/\\__/_/_/ /_/ /_/\\__,_/____/ \n", " /_/ \n", " \n", - "INFO:optimus:Transform and Roll out...\n", + "INFO:optimus:Transform and Roll out...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:/Users/argenisleon/Documents/Optimus/optimus/../parse/infer.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "INFO:optimus:Optimus successfully imported. Have fun :).\n", "INFO:optimus:Config.ini not found\n" ] @@ -6092,6 +6099,3106 @@ "json.dumps(\"{'name'=a'a}\")" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.profiler.profiler import Profiler" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:Processing Stats For columns...\n", + "INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']\n", + "INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n", + "INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n", + "INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']\n", + "INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n", + "INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n", + "INFO:optimus:Processing Frequency ...\n", + "INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float\n", + "INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary\n", + "INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to\n", + "INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to\n", + "INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to\n", + "INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "

Overview

\n", + "
\n", + "
\n", + "
\n", + "

Dataset info

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Number of columns16
Number of rows7
Total Missing (%)26
Total size in memory45.3 MB
\n", + "
\n", + "
\n", + "

Column types

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
Categorical0
Numeric0
Date2
Array2
Not available1
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

names

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 5
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 6\n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
Optimus114.29%
bumbl#ebéé 114.29%
ironhide&114.29%
Jazz114.29%
Megatron114.29%
Metroplex_)^$114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

height(ft)

\n", + " numeric\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 5
Unique (%)
Missing2
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " 5\n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 2\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean65.6
Minimum-28
Maximum300
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum-28
5-th percentile-28
Q113
Median17
Q326
95-th percentile300
Maximum300
Range
Interquartile range
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation132.66612
Coef of variation
Kurtosis0.13863
Mean65.6
MAD
Skewness1.4049
Sum328
Variance17600.3
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

function

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 6
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 6\n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
Leader114.29%
Espionage114.29%
Security114.29%
First Lieutenant114.29%
None114.29%
Battle Station114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

rank

\n", + " numeric\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 3
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " 6\n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean8.33333
Minimum7
Maximum10
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum7
5-th percentile7
Q17
Median8
Q310
95-th percentile10
Maximum10
Range
Interquartile range
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation1.36626
Coef of variation
Kurtosis-1.5
Mean8.33333
MAD
Skewness0.3818
Sum50
Variance1.86667
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

age

\n", + " numeric\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 1
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " 6\n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean5000000.0
Minimum5000000
Maximum5000000
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum5000000
5-th percentile5000000
Q15000000
Median5000000
Q35000000
95-th percentile5000000
Maximum5000000
Range
Interquartile range
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation0.0
Coef of variation
Kurtosisnan
Mean5000000.0
MAD
Skewnessnan
Sum30000000
Variance0.0
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

weight(t)

\n", + " numeric\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 5
Unique (%)
Missing2
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " 5\n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 2\n", + "
\n", + " \n", + "
\n", + "

\n", + " Basic Stats\n", + "

\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
Mean3.56
Minimum1.8
Maximum5.7
Zeros(%)0
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "

Quantile statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Minimum1.8
5-th percentile1.7999999523162842
Q12.0
Median4.0
Q34.300000190734863
95-th percentile5.699999809265137
Maximum5.7
Range
Interquartile range
\n", + "
\n", + "
\n", + "

Descriptive statistics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Standard deviation1.64712
Coef of variation
Kurtosis-1.43641
Mean3.56
MAD
Skewness0.06521
Sum17.8
Variance2.713
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

japanese name

\n", + " array\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 6
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
[Inochi, Convoy]114.29%
[Bumble, Goldback]114.29%
[Roadbuster]114.29%
[Meister]114.29%
[Megatron]114.29%
[Metroflex]114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

last position seen

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 4
Unique (%)
Missing3
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 4\n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 3\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
None342.86%
19.442735,-99.201111114.29%
10.642707,-71.612534114.29%
37.789563,-122.400356114.29%
33.670666,-117.841553114.29%
\"Missing\"3%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

date arrival

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 1
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 6\n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
1980/04/10685.71%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

last date seen

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 6
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " 6\n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
2016/09/10114.29%
2015/08/10114.29%
2014/07/10114.29%
2013/06/10114.29%
2012/05/10114.29%
2011/04/10114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

attributes

\n", + " array\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 6
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
[8.5344, 4300.0]114.29%
[5.334, 2000.0]114.29%
[7.9248, 4000.0]114.29%
[3.9624, 1800.0]114.29%
[, 5700.0]114.29%
[91.44,]114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

Date Type

\n", + " date\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 6
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " 6\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
2016-09-10114.29%
2015-08-10114.29%
2014-06-24114.29%
2013-06-24114.29%
2012-05-10114.29%
2011-04-10114.29%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

timestamp

\n", + " date\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 1
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " 6\n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
2014-06-24 00:00:00685.71%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

Cybertronian

\n", + " categorical\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 1
Unique (%)
Missing1
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " 6\n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 1\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
True685.71%
None114.29%
\"Missing\"1%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "

NullType

\n", + " null\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unique 0
Unique (%)
Missing7
Missing (%)
\n", + "
\n", + "

\n", + " Datatypes\n", + "

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "
\n", + " String\n", + " \n", + " \n", + "
\n", + " Integer\n", + " \n", + " \n", + "
\n", + " Decimal\n", + " \n", + " \n", + "
\n", + " Bool\n", + " \n", + " \n", + "
\n", + " Date\n", + " \n", + " \n", + "
\n", + " Missing\n", + " \n", + " 0\n", + "
\n", + " Null\n", + " \n", + " 7\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "

Frequency

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
ValueCountFrequency (%)
None7100.0%
\"Missing\"7%
\n", + " \n", + "
\n", + " \n", + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + "
\n", + "\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:run() executed in 68.73 sec\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "op.profiler.run(source_df, \"*\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'height(ft)': {'range': {'max': 300, 'min': -28}}}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_df.cols.range(\"height(ft)\")" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tests/creator/creator-profiler.py b/tests/creator/creator-profiler.py index f1b0e7b6..4419feec 100644 --- a/tests/creator/creator-profiler.py +++ b/tests/creator/creator-profiler.py @@ -151,4 +151,10 @@ def func(col_name, attrs): import json json.dumps("{'name'=a'a}") +from optimus.profiler.profiler import Profiler + +op.profiler.run(source_df, "*") + +source_df.cols.range("height(ft)") + diff --git a/tests/creator/creator.ipynb b/tests/creator/creator.ipynb index 6e960570..4f4dcc75 100644 --- a/tests/creator/creator.ipynb +++ b/tests/creator/creator.ipynb @@ -2329,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -2342,7 +2342,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -5694,7 +5694,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -5759,7 +5759,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -5768,7 +5768,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 69, "metadata": {}, "outputs": [ { @@ -6896,7 +6896,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 106, "metadata": {}, "outputs": [ { @@ -6917,12 +6917,13 @@ "name": "stdout", "output_type": "stream", "text": [ + "['{\"names\": \"int\"}']\n", "{'names': {'mismatch': 6, 'int': 1, 'null': 0, 'missing': 0}}\n" ] } ], "source": [ - "t.create(mismatch_df, \"cols.count_mismatch\", None, \"dict\", None, m)" + "t.create(mismatch_df, \"cols.count_mismatch\", None, \"dict\", None, {\"names\":\"int\"})" ] }, { @@ -6934,14 +6935,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Creating file ../test_df_cols.py\n", + "Creating file ../test_df_rows.py\n", "Done\n" ] } @@ -6959,9 +6960,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_cols_unnest_string_multi_index() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_cols_unnest_string_multi_index()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'date arrival'\", \"'/'\"]\n" + ] + }, + { + "ename": "ValueError", + "evalue": "'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"cols.unnest\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"string_multi_index\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdate_col\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"/\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplits\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\test.py\u001b[0m in \u001b[0;36mcreate\u001b[1;34m(self, obj, method, suffix, output, additional_method, *args, **kwargs)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[0mdf_func\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_func\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 218\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 219\u001b[1;33m \u001b[0mdf_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 220\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[1;31m# Additional Methods\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 49\u001b[1;33m \u001b[0mf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 50\u001b[0m \u001b[0m_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdefault_timer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlog_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\dataframe\\columns.py\u001b[0m in \u001b[0;36munnest\u001b[1;34m(input_cols, separator, splits, index, output_cols)\u001b[0m\n\u001b[0;32m 1571\u001b[0m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1572\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1573\u001b[1;33m \u001b[0minput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1574\u001b[0m \u001b[0moutput_cols\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_output_cols\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput_cols\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moutput_cols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1575\u001b[0m \u001b[0mfinal_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mparse_columns\u001b[1;34m(df, cols_args, get_args, is_regex, filter_by_column_dtypes, accepts_missing_cols, invert)\u001b[0m\n\u001b[0;32m 140\u001b[0m \u001b[1;31m# Check for missing columns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 141\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0maccepts_missing_cols\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 142\u001b[1;33m \u001b[0mcheck_for_missing_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 143\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 144\u001b[0m \u001b[1;31m# Filter by column data type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\columns.py\u001b[0m in \u001b[0;36mcheck_for_missing_columns\u001b[1;34m(df, col_names)\u001b[0m\n\u001b[0;32m 242\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 243\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 244\u001b[1;33m \u001b[0mRaiseIt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 245\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 246\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Documents\\Optimus\\optimus\\helpers\\raiseit.py\u001b[0m in \u001b[0;36mvalue_error\u001b[1;34m(var, data_values)\u001b[0m\n\u001b[0;32m 76\u001b[0m type=divisor.join(map(\n\u001b[0;32m 77\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m\"'\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\"'\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 78\u001b[1;33m data_values)), var_type=one_list_to_val(var)))\n\u001b[0m\u001b[0;32m 79\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: 'missing_columns' must be 'words', 'num', 'animals', 'thing', 'second', 'filter', received 'date arrival'" + ] + } + ], "source": [ "t.create(None, \"cols.unnest\", \"string_multi_index\", \"df\", None, date_col, \"/\", splits=3, index=2)" ] @@ -7313,234 +7353,2598 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from pyspark.sql.types import *\n", - "from datetime import date, datetime\n", - "\n", - "\n", - "cols = [\n", - " (\"names\", \"str\"),\n", - " (\"height(ft)\", ShortType()),\n", - " (\"function\", \"str\"),\n", - " (\"rank\", ByteType()),\n", - " (\"age\", \"int\"),\n", - " (\"weight(t)\", \"float\"),\n", - " \"japanese name\",\n", - " \"last position seen\",\n", - " \"date arrival\",\n", - " \"last date seen\",\n", - " (\"attributes\", ArrayType(FloatType())),\n", - " (\"Date Type\", DateType()),\n", - " (\"timestamp\", TimestampType()),\n", - " (\"Cybertronian\", BooleanType()),\n", - " (\"function(binary)\", BinaryType()),\n", - " (\"NullType\", NullType())\n", - "\n", - " ]\n", - "\n", - "rows = [\n", - " (\"Optim'us\", -28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", - " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", - " None),\n", - " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", - " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", - " None),\n", - " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", - " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", - " None),\n", - " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", - " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", - " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", - " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", - " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", - " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", - " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", - " (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n", - "\n", - " ]\n", - "source_df = op.create.df(cols ,rows)\n", - "source_df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t = Test(op, source_df, \"df_outliers\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", - " \"import numpy as np\",\n", - " \"nan = np.nan\",\n", - " \"import datetime\",\n", - " \"from pyspark.sql import functions as F\"], path = \"df_outliers\", final_path=\"..\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "\n", - "\n", - "def func(col_name, attrs):\n", - " return F.col(col_name) * 2\n", - "\n", - "numeric_col = \"height(ft)\"\n", - "numeric_col_B = \"rank\"\n", - "numeric_col_C = \"rank\"\n", - "string_col = \"function\"\n", - "date_col = \"date arrival\"\n", - "date_col_B = \"last date seen\"\n", - "new_col = \"new col\"\n", - "array_col = \"attributes\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tukey" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"df\",\"select\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.outliers.tukey(numeric_col).drop()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"df\",\"drop\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"whiskers\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"count\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"non_outliers_count\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.tukey\", None, \"json\", \"info\", numeric_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.run()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Zscore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.z_score\", None, \"df\",\"select\", numeric_col, threshold)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "source_df.outliers.z_score('height(ft)',0.5).select()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.z_score\", None, \"df\",\"drop\", numeric_col, threshold)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.z_score\", None, \"json\", \"count\", numeric_col, threshold)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, - "outputs": [], - "source": [ - "t.create(None, \"outliers.z_score\", None, \"json\", \"non_outliers_count\", numeric_col, threshold)" - ] - }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from pyspark.sql.types import *\n", + "from datetime import date, datetime\n", + "\n", + "\n", + "cols = [\n", + " (\"names\", \"str\"),\n", + " (\"height(ft)\", ShortType()),\n", + " (\"function\", \"str\"),\n", + " (\"rank\", ByteType()),\n", + " (\"age\", \"int\"),\n", + " (\"weight(t)\", \"float\"),\n", + " \"japanese name\",\n", + " \"last position seen\",\n", + " \"date arrival\",\n", + " \"last date seen\",\n", + " (\"attributes\", ArrayType(FloatType())),\n", + " (\"Date Type\", DateType()),\n", + " (\"timestamp\", TimestampType()),\n", + " (\"Cybertronian\", BooleanType()),\n", + " (\"function(binary)\", BinaryType()),\n", + " (\"NullType\", NullType())\n", + "\n", + " ]\n", + "\n", + "rows = [\n", + " (\"Optim'us\", -28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", + " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", + " None),\n", + " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", + " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", + " None),\n", + " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", + " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", + " None),\n", + " (\"Jazz\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", + " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", + " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", + " (\"Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", + " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", + " (\"Metroplex_)^$\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", + " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", + " (None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n", + "\n", + " ]\n", + "source_df = op.create.df(cols ,rows)\n", + "source_df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "t = Test(op, source_df, \"df_outliers\", imports=[\"from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector\",\n", + " \"import numpy as np\",\n", + " \"nan = np.nan\",\n", + " \"import datetime\",\n", + " \"from pyspark.sql import functions as F\"], path = \"df_outliers\", final_path=\"..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "\n", + "def func(col_name, attrs):\n", + " return F.col(col_name) * 2\n", + "\n", + "numeric_col = \"height(ft)\"\n", + "numeric_col_B = \"rank\"\n", + "numeric_col_C = \"rank\"\n", + "string_col = \"function\"\n", + "date_col = \"date arrival\"\n", + "date_col_B = \"last date seen\"\n", + "new_col = \"new col\"\n", + "array_col = \"attributes\"" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Date Type
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optim'us\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " -28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Jazz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "source_df.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tukey" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"df\",\"select\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.outliers.tukey(numeric_col).drop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"df\",\"drop\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"whiskers\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.66 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"count\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_non_outliers_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_non_outliers_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.58 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"non_outliers_count\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_tukey_info() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_tukey_info()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\"]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:percentile() executed in 2.59 sec\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'count_outliers': 2, 'count_non_outliers': 3, 'lower_bound': -6.5, 'lower_bound_count': 1, 'upper_bound': 45.5, 'upper_bound_count': 1, 'q1': 13, 'median': 17, 'q3': 26, 'iqr': 13}\n" + ] + } + ], + "source": [ + "t.create(None, \"outliers.tukey\", None, \"json\", \"info\", numeric_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_outliers.py\n", + "Done\n" + ] + } + ], + "source": [ + "t.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zscore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"df\",\"select\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_df.outliers.z_score('height(ft)',0.5).select()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"df\",\"drop\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"json\", \"count\", numeric_col, threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t.create(None, \"outliers.z_score\", None, \"json\", \"non_outliers_count\", numeric_col, threshold)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7639,7 +10043,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -7676,27 +10080,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 93, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_mad_non_outliers_count() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_mad_non_outliers_count()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\", '0.5', '10000']\n", + "2\n" + ] + } + ], "source": [ "t.create(None, \"outliers.mad\", None, \"json\",\"non_outliers_count\", numeric_col, threshold, relative_error)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 90, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating test_outliers_mad_info() test function...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:optimus:test_outliers_mad_info()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"'height(ft)'\", '0.5', '10000']\n", + "{'count_outliers': 3, 'count_non_outliers': 2, 'lower_bound': 12.5, 'lower_bound_count': 1, 'upper_bound': 21.5, 'upper_bound_count': 2}\n" + ] + } + ], "source": [ "t.create(None, \"outliers.mad\", None, \"json\",\"info\", numeric_col, threshold, relative_error)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 94, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating file ../test_df_outliers.py\n", + "Done\n" + ] + } + ], "source": [ "t.run()" ] @@ -7925,7 +10384,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -7947,7 +10406,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -7962,7 +10421,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -8350,7 +10809,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -8359,168 +10818,19 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 120, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating test_rows_select() test function...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:optimus:test_rows_select()\n" + "ename": "TypeError", + "evalue": "delete() takes from 2 to 3 positional arguments but 7 were given", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"rows.select\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"df\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfil\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m: delete() takes from 2 to 3 positional arguments but 7 were given" ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
words
\n", - "
1 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
num
\n", - "
2 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
animals
\n", - "
3 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
thing
\n", - "
4 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
second
\n", - "
5 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
filter
\n", - "
6 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " ⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 1\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " dog⋅dog\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " housé\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " 5\n", - " \n", - "
\n", - "
\n", - "
\n", - " \n", - " a\n", - " \n", - "
\n", - "
\n", - "\n", - "\n", - "
Viewing 1 of 1 rows / 6 columns
\n", - "
1 partition(s)
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -8699,16 +11009,33 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Column\n" + ] + } + ], "source": [ - "fil = (source_df[\"num\"] == 2) | (source_df[\"second\"] == 5)" + "fil = (source_df[\"num\"] == 2) | (source_df[\"second\"] == 5)\n", + "print(str(fil))\n", + "# type(fil)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 114, "metadata": {}, "outputs": [ { @@ -8725,6 +11052,13 @@ "INFO:optimus:test_rows_drop()\n" ] }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"Column\"]\n" + ] + }, { "data": { "text/html": [ @@ -10415,7 +12749,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 118, "metadata": {}, "outputs": [ { diff --git a/tests/creator/creator.py b/tests/creator/creator.py index 559547d1..b2ec916d 100644 --- a/tests/creator/creator.py +++ b/tests/creator/creator.py @@ -688,7 +688,7 @@ def func(col_name, attrs): mismatch_df.table() -t.create(mismatch_df, "cols.count_mismatch", None, "dict", None, m) +t.create(mismatch_df, "cols.count_mismatch", None, "dict", None, {"names":"int"}) @@ -888,6 +888,8 @@ def func(col_name, attrs): array_col = "attributes" # - +source_df.table() + # ## Tukey t.create(None, "outliers.tukey", None, "df","select", numeric_col) @@ -1064,7 +1066,11 @@ def func(col_name, attrs): t.create(None, "rows.select_by_dtypes", None, "df", None, "filter", "integer") + + fil = (source_df["num"] == 2) | (source_df["second"] == 5) +print(str(fil)) +# type(fil) t.create(None, "rows.drop", None, "df", None, fil) diff --git a/tests/test_rows.py b/tests/test_rows.py deleted file mode 100644 index a445dfec..00000000 --- a/tests/test_rows.py +++ /dev/null @@ -1,182 +0,0 @@ -from optimus import Optimus -from pyspark.sql.types import * -from optimus.audf import abstract_udf as audf - -op = Optimus() - -source_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) -], - [ - (" I like fish ", 1, "dog dog", "housé", 5, "a"), - (" zombies", 2, "cat", "tv", 6, "b"), - ("simpsons cat lady", 2, "frog", "table", 7, "1"), - (None, 3, "eagle", "glass", 8, "c"), - ]) - - -class TestDataFrameRows(object): - @staticmethod - def test_append(): - actual_df = source_df.rows.append([("this is a word", 2, "this is an animal", - "this is a thing", 64, "this is a filter")]) - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (" I like fish ", 1, "dog dog", "housé", 5, "a"), - (" zombies", 2, "cat", "tv", 6, "b"), - ("simpsons cat lady", 2, "frog", "table", 7, "1"), - (None, 3, "eagle", "glass", 8, "c"), - ("this is a word", 2, "this is an animal", - "this is a thing", 64, "this is a filter") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_select(): - actual_df = source_df.rows.select(source_df["num"] == 1) - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (" I like fish ", 1, "dog dog", "housé", 5, "a") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_select_by_dtypes(): - actual_df = source_df.rows.select_by_dtypes("filter", "integer") - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - ("simpsons cat lady", 2, "frog", "table", 7, "1") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_drop_by_dtypes(): - actual_df = source_df.rows.drop_by_dtypes("filter", "integer") - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (" I like fish ", 1, "dog dog", "housé", 5, "a"), - (" zombies", 2, "cat", "tv", 6, "b"), - (None, 3, "eagle", "glass", 8, "c") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_drop(): - actual_df = source_df.rows.drop((source_df["num"] == 2) | (source_df["second"] == 5)) - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (None, 3, "eagle", "glass", 8, "c") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_drop_audf(): - def func_data_type(value, attr): - return value > 1 - - actual_df = source_df.rows.drop(audf("num", func_data_type, "boolean")) - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (" I like fish ", 1, "dog dog", "housé", 5, "a") - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_sort(): - actual_df = source_df.rows.sort("num", "desc") - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (None, 3, "eagle", "glass", 8, "c"), - (" zombies", 2, "cat", "tv", 6, "b"), - ("simpsons cat lady", 2, "frog", "table", 7, "1"), - (" I like fish ", 1, "dog dog", "housé", 5, "a"), - - ]) - - assert (expected_df.collect() == actual_df.collect()) - - @staticmethod - def test_is_in(): - actual_df = source_df.rows.is_in("num", 2) - - expected_df = op.create.df([ - ("words", "str", True), - ("num", "int", True), - ("animals", "str", True), - ("thing", StringType(), True), - ("second", "int", True), - ("filter", StringType(), True) - ], - [ - (" zombies", 2, "cat", "tv", 6, "b"), - ("simpsons cat lady", 2, "frog", "table", 7, "1") - ]) - - assert (expected_df.collect() == actual_df.collect())