Added levenshtein json test

eapframework · Nov 14, 2019 · 1fdb5e4 · 1fdb5e4
1 parent 2d0331e
commit 1fdb5e4
Show file tree

Hide file tree

Showing 4 changed files with 143 additions and 37 deletions.
diff --git a/tests/creator/creator-profiler.ipynb b/tests/creator/creator-profiler.ipynb
@@ -41,10 +41,13 @@
      "text": [
       "C:\\Users\\argenisleon\\Anaconda3\\lib\\site-packages\\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
       "  from collections import Callable\n",
+      "../..\\optimus\\helpers\\functions.py:172: DeprecationWarning: invalid escape sequence \\d\n",
+      "  pattern = '\\\"(\\d+\\.\\d+).*\\\"'\n",
       "\n",
       "    You are using PySparkling of version 2.4.10, but your PySpark is of\n",
       "    version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n",
-      "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n"
+      "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n",
+      "invalid escape sequence \\d\n"
      ]
     }
    ],
@@ -69,7 +72,7 @@
       "INFO:optimus:HADOOP_HOME=C:\\opt\\hadoop-2.7.7\n",
       "INFO:optimus:PYSPARK_PYTHON=C:\\Users\\argenisleon\\Anaconda3\\python.exe\n",
       "INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter\n",
-      "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n",
+      "INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars \"file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --driver-class-path \"C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar\" --conf \"spark.sql.catalogImplementation=hive\" pyspark-shell\n",
       "INFO:optimus:JAVA_HOME=C:\\java\n",
       "INFO:optimus:Pyarrow Installed\n",
       "INFO:optimus:-----\n",
@@ -95,7 +98,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 5,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1294,7 +1297,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -1303,7 +1306,7 @@
        "\"a'a\""
       ]
      },
-     "execution_count": 28,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1316,7 +1319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -1333,7 +1336,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1350,7 +1353,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1372,7 +1375,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -5987,29 +5990,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating test_columns_agg() test function...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:optimus:test_columns_agg()\n",
+      "INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']\n",
+      "INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n",
+      "INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n",
+      "INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']\n",
+      "INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n",
+      "INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'names': {'count_uniques': 5, 'min': 'Jazz', 'max': 'ironhide&', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'height(ft)': {'count_uniques': 5, 'min': -28, 'max': 300, 'count_na': 2, 'stddev': 132.66612, 'kurtosis': 0.13863, 'mean': 65.6, 'skewness': 1.4049, 'sum': 328, 'variance': 17600.3, 'zeros': 0, 'percentile': {'0.75': 26, '0.95': 300, '0.05': -28, '0.25': 13, '0.5': 17}, 'hist': [{'count': 4.0, 'lower': -28.0, 'upper': 54.0}, {'count': 0.0, 'lower': 54.0, 'upper': 136.0}, {'count': 0.0, 'lower': 136.0, 'upper': 218.0}, {'count': 0.0, 'lower': 218.0, 'upper': 300.0}]}, 'function': {'count_uniques': 6, 'min': 'Battle Station', 'max': 'Security', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'rank': {'count_uniques': 3, 'min': 7, 'max': 10, 'count_na': 1, 'stddev': 1.36626, 'kurtosis': -1.5, 'mean': 8.33333, 'skewness': 0.3818, 'sum': 50, 'variance': 1.86667, 'zeros': 0, 'percentile': {'0.75': 10, '0.95': 10, '0.05': 7, '0.25': 7, '0.5': 8}, 'hist': [{'count': 4.0, 'lower': 7.0, 'upper': 8.5}, {'count': 0.0, 'lower': 8.5, 'upper': 10.0}]}, 'age': {'count_uniques': 1, 'min': 5000000, 'max': 5000000, 'count_na': 1, 'stddev': 0.0, 'kurtosis': nan, 'mean': 5000000.0, 'skewness': nan, 'sum': 30000000, 'variance': 0.0, 'zeros': 0, 'percentile': {'0.75': 5000000, '0.95': 5000000, '0.05': 5000000, '0.25': 5000000, '0.5': 5000000}, 'hist': [{'count': 6, 'lower': 5000000, 'upper': 5000001}]}, 'weight(t)': {'count_uniques': 5, 'min': 1.8, 'max': 5.7, 'count_na': 2, 'stddev': 1.64712, 'kurtosis': -1.43641, 'mean': 3.56, 'skewness': 0.06521, 'sum': 17.8, 'variance': 2.713, 'zeros': 0, 'percentile': {'0.75': 4.300000190734863, '0.95': 5.699999809265137, '0.05': 1.7999999523162842, '0.25': 2.0, '0.5': 4.0}, 'hist': [{'count': 1.0, 'lower': 1.8, 'upper': 2.78}, {'count': 0.0, 'lower': 2.78, 'upper': 3.75}, {'count': 2.0, 'lower': 3.75, 'upper': 4.73}, {'count': 1.0, 'lower': 4.73, 'upper': 5.7}]}, 'japanese name': {'count_uniques': 6, 'min': ['Bumble', 'Goldback'], 'max': ['Roadbuster'], 'count_na': 1}, 'last position seen': {'count_uniques': 4, 'min': '10.642707,-71.612534', 'max': '37.789563,-122.400356', 'count_na': 3, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'date arrival': {'count_uniques': 1, 'min': '1980/04/10', 'max': '1980/04/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'last date seen': {'count_uniques': 6, 'min': '2011/04/10', 'max': '2016/09/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'attributes': {'count_uniques': 6, 'min': [None, 5700.0], 'max': [91.44000244140625, None], 'count_na': 1}, 'Date Type': {'count_uniques': 6, 'min': datetime.date(2011, 4, 10), 'max': datetime.date(2016, 9, 10), 'count_na': 1}, 'timestamp': {'count_uniques': 1, 'min': datetime.datetime(2014, 6, 24, 0, 0), 'max': datetime.datetime(2014, 6, 24, 0, 0), 'count_na': 1}, 'Cybertronian': {'count_uniques': 1, 'min': 1, 'max': 1, 'count_na': 1}, 'function(binary)': {'count_uniques': 6, 'min': bytearray(b'Battle Station'), 'max': bytearray(b'Security'), 'count_na': 1}, 'NullType': {'count_uniques': 0, 'min': None, 'max': None, 'count_na': 7}, 'p_count_na': 100.0, 'p_count_uniques': 0.0, 'range': 3.9000000000000004, 'median': 4.0, 'interquartile_range': 2.3000001907348633, 'coef_variation': 0.46267, 'mad': 1.7}\n"
+     ]
+    }
+   ],
    "source": [
     "t.create(p, \"columns_agg\", None, 'json', None, source_df,\"*\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating file ../test_df_profiler.py\n",
+      "Done\n"
+     ]
+    }
+   ],
    "source": [
     "t.run()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 39,
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "outputs": [],
    "source": [
-    "a = \"{'name'=a'a}\"\n"
+    "a = \"{'name'=a'a}\""
    ]
   },
   {

diff --git a/tests/creator/creator.ipynb b/tests/creator/creator.ipynb
@@ -95,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 5,
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -1271,7 +1271,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -102993,7 +102993,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -103002,7 +103002,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {
     "inputHidden": false,
     "outputHidden": false
@@ -103021,7 +103021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {
     "inputHidden": false,
     "outputHidden": false
@@ -103099,17 +103099,17 @@
        "    <tr>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
        "                \n",
-       "                estadodemexico\n",
+       "                distritofederal\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
        "                \n",
-       "                estadodemexico\n",
+       "                distritofederal\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
@@ -103127,17 +103127,17 @@
        "    <tr>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
        "                \n",
-       "                estadodemexico\n",
+       "                distritofederal\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
        "                \n",
-       "                distritofederal\n",
+       "                estadodemexico\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
@@ -103155,17 +103155,17 @@
        "    <tr>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
        "                \n",
-       "                distritofederal\n",
+       "                estadodemexico\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
        "                \n",
-       "                estadodemexico\n",
+       "                distritofederal\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
@@ -103183,17 +103183,17 @@
        "    <tr>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
        "                \n",
-       "                distritofederal\n",
+       "                estadodemexico\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
        "        \n",
        "        <td style=\"border: 0px;\">\n",
-       "            <div style=\"min-height: 14px;\" title=\"distritofederal\">\n",
+       "            <div style=\"min-height: 14px;\" title=\"estadodemexico\">\n",
        "                \n",
-       "                distritofederal\n",
+       "                estadodemexico\n",
        "                 \n",
        "            </div>\n",
        "        </td>\n",
@@ -103227,6 +103227,24 @@
     "t.create(dc, \"levenshtein_matrix\", None, 'df', None, source_df, \"STATE\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating file ../test_df_distance_cluster.py\n",
+      "Done\n"
+     ]
+    }
+   ],
+   "source": [
+    "t.run()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 9,
@@ -103381,7 +103399,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating test_levenshtein_json() test function...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:optimus:test_levenshtein_json()\n",
+      "INFO:optimus:Using 'column_exp' to process column 'STATE***FINGERPRINT' with function _trim\n",
+      "INFO:optimus:Using 'column_exp' to process column 'STATE***FINGERPRINT' with function _lower\n",
+      "INFO:optimus:Using 'pandas_udf' to process column 'STATE***FINGERPRINT' with function multiple_replace\n",
+      "INFO:optimus:Using 'pandas_udf' to process column 'STATE***FINGERPRINT' with function _remove_accents\n",
+      "INFO:optimus:Using 'pandas_udf' to process column 'STATE***FINGERPRINT' with function _split_sort_remove_join\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'Estado de México': ['Distrito Federal'], 'Distrito Federal': ['Estado de México']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "t.create(dc, \"levenshtein_json\", None, 'dict', None, source_df, \"STATE\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {
     "inputHidden": false,
     "lines_to_next_cell": 0,

diff --git a/tests/creator/creator.py b/tests/creator/creator.py
@@ -933,9 +933,15 @@ def func(col_name, attrs):
 
 # + {"outputHidden": false, "inputHidden": false}
 t.create(dc, "levenshtein_matrix", None, 'df', None, source_df, "STATE")
+# -
+
+t.run()
 
 # + {"outputHidden": false, "inputHidden": false}
 t.create(dc, "levenshtein_filter", None, 'df', None, source_df, "STATE")
+# -
+
+t.create(dc, "levenshtein_json", None, 'dict', None, source_df, "STATE")
 
 # + {"outputHidden": false, "inputHidden": false}
 t.run()

diff --git a/tests/test_df_distance_cluster.py b/tests/test_df_distance_cluster.py
@@ -18,6 +18,10 @@ def test_levenshtein_filter():
 		actual_df =dc.levenshtein_filter(source_df,'STATE')
 		expected_df = op.create.df([('STATE_FROM', StringType(), True),('STATE***LEVENSHTEIN_DISTANCE', IntegerType(), True),('STATE_TO', StringType(), True)], [('estadodemexico', 11, 'distritofederal'), ('distritofederal', 11, 'estadodemexico')])
 		assert (expected_df.collect() == actual_df.collect())
+	def test_levenshtein_json(self):
+		actual_df =dc.levenshtein_json(source_df,'STATE')
+		expected_value ={'Estado de México': ['Distrito Federal'], 'Distrito Federal': ['Estado de México']}
+		self.assertDictEqual(deep_sort(expected_value),  deep_sort(actual_df))
 	@staticmethod
 	def test_levenshtein_matrix():
 		actual_df =dc.levenshtein_matrix(source_df,'STATE')