diff --git a/examples/sandbox.ipynb b/examples/sandbox.ipynb index b3567c4c..fcc565ed 100644 --- a/examples/sandbox.ipynb +++ b/examples/sandbox.ipynb @@ -34,8 +34,7 @@ "\n", " You are using PySparkling of version 2.4.10, but your PySpark is of\n", " version 2.3.1. Please make sure Spark and PySparkling versions are compatible. \n", - "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n", - "invalid escape sequence \\d\n" + "`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly\n" ] } ], @@ -75,19 +74,166 @@ "op= Optimus(comm=True)" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "from datetime import date, datetime\n", + "\n", + "cols = [\n", + " (\"names\", \"str\"),\n", + " (\"height(ft)\", ShortType()),\n", + " (\"function\", \"str\"),\n", + " (\"rank\", ByteType()),\n", + " (\"age\", \"int\"),\n", + " (\"weight(t)\", \"float\"),\n", + " \"japanese name\",\n", + " \"last position seen\",\n", + " \"date arrival\",\n", + " \"last date seen\",\n", + " (\"attributes\", ArrayType(FloatType())),\n", + " (\"DateType\", DateType()),\n", + " (\"timestamp\", TimestampType()),\n", + " (\"Cybertronian\", BooleanType()),\n", + " (\"function(binary)\", BinaryType()),\n", + " (\"NullType\", NullType())\n", + "\n", + " ]\n", + "\n", + "rows = [\n", + " (\"Optimus OptimusPrime\", 28, \"Leader\", 10, 5000000, 4.30, [\"Inochi\", \"Convoy\"], \"19.442735,-99.201111\", \"1980/04/10\",\n", + " \"2016/09/10\", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray(\"Leader\", \"utf-8\"),\n", + " None),\n", + " (\"bumbl#ebéé \", 17, \"Espionage\", 7, 5000000, 2.0, [\"Bumble\", \"Goldback\"], \"10.642707,-71.612534\", \"1980/04/10\",\n", + " \"2015/08/10\", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray(\"Espionage\", \"utf-8\"),\n", + " None),\n", + " (\"ironhide&\", 26, \"Security\", 7, 5000000, 4.0, [\"Roadbuster\"], \"37.789563,-122.400356\", \"1980/04/10\",\n", + " \"2014/07/10\", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray(\"Security\", \"utf-8\"),\n", + " None),\n", + " (\"1 Megatron\", 13, \"First Lieutenant\", 8, 5000000, 1.80, [\"Meister\"], \"33.670666,-117.841553\", \"1980/04/10\",\n", + " \"2013/06/10\", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,\n", + " bytearray(\"First Lieutenant\", \"utf-8\"), None),\n", + " (\"1 Megatron\", None, \"None\", 10, 5000000, 5.70, [\"Megatron\"], None, \"1980/04/10\", \"2012/05/10\", [None, 5700.0],\n", + " date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray(\"None\", \"utf-8\"), None),\n", + " (\"megatron 1\", 300, \"Battle Station\", 8, 5000000, None, [\"Metroflex\"], None, \"1980/04/10\", \"2011/04/10\",\n", + " [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray(\"Battle Station\", \"utf-8\"), None),\n", + "\n", + " ]\n", + "df = op.create.df(cols ,rows).cache().repartition(1)" + ] + }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "df = op.load.csv(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/crime.csv\", sep=\",\", header='true', infer_schema='true', charset=\"UTF-8\", null_value=\"None\")" + "df = op.load.csv(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/Meteorite_Landings.csv\", sep=\",\", header='true', infer_schema='true', charset=\"UTF-8\", null_value=\"None\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, + "outputs": [], + "source": [ + "a= {'lower_bound': -285.75000000000006, 'upper_bound': 495.45000000000005, 'q1': 7.2, 'median': 32.6, 'q3': 202.5, 'iqr': 195.3}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-285.75000000000006, 495.45000000000005, 7.2, 32.6, 202.5, 195.3]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(a.values())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], + "source": [ + "if \"q3\" in ['lower_bound', 'upper_bound', 'q1', 'median', 'q3', 'iqr']:\n", + " print(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "tukey() missing 1 required positional argument: 'columns'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutliers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtukey\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwhiskers\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m: tukey() missing 1 required positional argument: 'columns'" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "outlier = df.outliers.tukey(\"mass (g)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"columns\": [{\"title\": \"mass (g)\"}], \"value\": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print(outlier.info())\n", + "outlier.select_lower_bound()" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "metadata": {}, "outputs": [ { "data": { @@ -97,16 +243,16 @@ "\n", "\n", "\n", - "
Viewing 10 of 319,073 rows / 17 columns
\n", - "
8 partition(s)
\n", + "
Viewing 10 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -422,557 +500,529 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -982,277 +1032,427 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + "
\n", - "
INCIDENT_NUMBER
\n", - "
1 (string)
\n", + "
id
\n", + "
1 (int)
\n", "
\n", " \n", " nullable\n", @@ -115,8 +261,8 @@ "
\n", - "
OFFENSE_CODE
\n", - "
2 (int)
\n", + "
firstName
\n", + "
2 (string)
\n", "
\n", " \n", " nullable\n", @@ -125,7 +271,7 @@ "
\n", - "
OFFENSE_CODE_GROUP
\n", + "
lastName
\n", "
3 (string)
\n", "
\n", " \n", @@ -135,8 +281,8 @@ "
\n", - "
OFFENSE_DESCRIPTION
\n", - "
4 (string)
\n", + "
billingId
\n", + "
4 (int)
\n", "
\n", " \n", " nullable\n", @@ -145,7 +291,7 @@ "
\n", - "
DISTRICT
\n", + "
product
\n", "
5 (string)
\n", "
\n", " \n", @@ -155,8 +301,8 @@ "
\n", - "
REPORTING_AREA
\n", - "
6 (string)
\n", + "
price
\n", + "
6 (int)
\n", "
\n", " \n", " nullable\n", @@ -165,7 +311,7 @@ "
\n", - "
SHOOTING
\n", + "
birth
\n", "
7 (string)
\n", "
\n", " \n", @@ -175,88 +321,8 @@ "
\n", - "
OCCURRED_ON_DATE
\n", - "
8 (timestamp)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
YEAR
\n", - "
9 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
MONTH
\n", - "
10 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
DAY_OF_WEEK
\n", - "
11 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
HOUR
\n", - "
12 (int)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
UCR_PART
\n", - "
13 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
STREET
\n", - "
14 (string)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Lat
\n", - "
15 (double)
\n", - "
\n", - " \n", - " nullable\n", - " \n", - "
\n", - "
\n", - "
Long
\n", - "
16 (double)
\n", + "
dummyCol
\n", + "
8 (string)
\n", "
\n", " \n", " nullable\n", @@ -265,8 +331,8 @@ "
\n", - "
Location
\n", - "
17 (string)
\n", + "
product***FINGERPRINT
\n", + "
9 (string)
\n", "
\n", " \n", " nullable\n", @@ -282,137 +348,149 @@ "
\n", - "
\n", + "
\n", " \n", - " I182070945\n", + " 1\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 619\n", + " Luis\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Larceny\n", + " Alvarez$$%!\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " LARCENY⋅ALL⋅OTHERS\n", + " 123\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " D14\n", + " Cake\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 808\n", + " 10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 1980/07/07\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-02⋅13:00:00\n", + " never\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " cake\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " André\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Sunday\n", + " Ampère\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 13\n", + " 423\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅One\n", + " piza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " LINCOLN⋅ST\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.35779134\n", + " 1950/07/08\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.13937053\n", + " gonna\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.35779134,⋅-71.13937053)\n", + " piza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070943\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 1402\n", + " NiELS\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Vandalism\n", + " Böhr//((%%\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " VANDALISM\n", + " 551\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " C11\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 347\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 1990/07/09\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-08-21⋅00:00:00\n", + " give\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 8\n", + " 4\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Tuesday\n", + " PAUL\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 0\n", + " dirac$\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅Two\n", + " 521\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " HECLA⋅ST\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.30682138\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.06030035\n", + " 1954/07/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.30682138,⋅-71.06030035)\n", + " you\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070941\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " 3410\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " Towed\n", + " 5\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " TOWED⋅MOTOR⋅VEHICLE\n", + " Albert\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " D4\n", + " Einstein\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 151\n", + " 634\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅19:27:00\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 1990/07/11\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " up\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " pizza\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " 19\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " Part⋅Three\n", + " 6\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " CAZENOVE⋅ST\n", + " Galileo\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.34658879\n", + " ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.07242943\n", + " 672\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.34658879,⋅-71.07242943)\n", + " arepa\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070940\n", + " 5\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3114\n", + " 1930/08/12\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Investigate⋅Property\n", + " never\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " INVESTIGATE⋅PROPERTY\n", + " arepa\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " D4\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " 272\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " CaRL\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅21:16:00\n", + " Ga%%%uss\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 323\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 21\n", + " 1970/07/13\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅Three\n", + " gonna\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " NEWCOMB⋅ST\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " 42.33418175\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " -71.07866441\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.33418175,⋅-71.07866441)\n", + " David\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070938\n", + " H$$$ilbert\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3114\n", + " 624\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Investigate⋅Property\n", + " taaaccoo\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " INVESTIGATE⋅PROPERTY\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " B3\n", + " 1950/07/14\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 421\n", + " let\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " taaaccoo\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " 2018-09-03⋅21:05:00\n", - " \n", - "
\n", - "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 9\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " Johannes\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " KEPLER\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 21\n", + " 735\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅Three\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " DELHI⋅ST\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.27536542\n", + " 1920/04/22\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.09036101\n", + " you\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.27536542,⋅-71.09036101)\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070936\n", + " 10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3820\n", + " JaMES\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Motor⋅Vehicle⋅Accident⋅Response\n", + " M$$ax%%well\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY\n", + " 875\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " C11\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 398\n", + " 3\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " 1923/03/12\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅21:09:00\n", + " down\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " taco\n", " \n", "
\n", "
\n", - "
\n", - " \n", - " 9\n", - " \n", - "
\n", - "
\n", + "\n", + "\n", + "
Viewing 10 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "keyCol.fingerprint(df,\"product\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 17 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + "\n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1262,137 +1462,137 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1402,137 +1602,137 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1542,149 +1742,5810 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
\n", - "
\n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " Monday\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " 21\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " Part⋅Three\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " TALBOT⋅AVE\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " 42.29019621\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " -71.07159012\n", - " \n", "
\n", - " \n", + "
\n", - "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", " \n", - " (42.29019621,⋅-71.07159012)\n", - " \n", "
\n", - " \n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
DateType
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
names***FINGERPRINT
\n", + "
17 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", - "
\n", + "
\n", " \n", - " I182070933\n", + " Optimus⋅OptimusPrime\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 724\n", + " 28\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Auto⋅Theft\n", + " Leader\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " AUTO⋅THEFT\n", + " 10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " B2\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 330\n", + " 4.300000190734863\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " ['Inochi',⋅'Convoy']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅21:25:00\n", + " 19.442735,-99.201111\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " 2016/09/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " [8.53439998626709,⋅4300.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 21\n", + " 2016-09-10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅One\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " NORMANDY⋅ST\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.30607218\n", + " bytearray(b'Leader')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.0827326\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.30607218,⋅-71.08273260)\n", + " optimusoptimusprime\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070932\n", + " bumbl#ebéé⋅⋅\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3301\n", + " 17\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Verbal⋅Disputes\n", + " Espionage\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " VERBAL⋅DISPUTE\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " B2\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 584\n", + " 2.0\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " ['Bumble',⋅'Goldback']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅20:39:37\n", + " 10.642707,-71.612534\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " 2015/08/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " [5.334000110626221,⋅2000.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 20\n", + " 2015-08-10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅Three\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " LAWN⋅ST\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.32701648\n", + " bytearray(b'Espionage')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.10555088\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.32701648,⋅-71.10555088)\n", + " bumblebee\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070931\n", + " ironhide&\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 301\n", + " 26\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Robbery\n", + " Security\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " ROBBERY⋅-⋅STREET\n", + " 7\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " C6\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 177\n", + " 4.0\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " ['Roadbuster']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅20:48:00\n", + " 37.789563,-122.400356\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " 2014/07/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " [7.924799919128418,⋅4000.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 20\n", + " 2014-06-24\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅One\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " MASSACHUSETTS⋅AVE\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.33152148\n", + " bytearray(b'Security')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.07085307\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.33152148,⋅-71.07085307)\n", + " ironhide\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " I182070929\n", + " 1⋅Megatron\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 3301\n", + " 13\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Verbal⋅Disputes\n", + " First⋅Lieutenant\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " VERBAL⋅DISPUTE\n", + " 8\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " C11\n", + " 5000000\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 364\n", + " 1.7999999523162842\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " None\n", + " ['Meister']\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018-09-03⋅20:38:00\n", + " 33.670666,-117.841553\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 2018\n", + " 1980/04/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 9\n", + " 2013/06/10\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Monday\n", + " [3.962399959564209,⋅1800.0]\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 20\n", + " 2013-06-24\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " Part⋅Three\n", + " 2014-06-24⋅00:00:00\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " LESLIE⋅ST\n", + " True\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " 42.29514664\n", + " bytearray(b'First⋅Lieutenant')\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " -71.05860832\n", + " None\n", " \n", "
\n", "
\n", - "
\n", + "
\n", " \n", - " (42.29514664,⋅-71.05860832)\n", + " 1megatron\n", " \n", "
\n", "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 1⋅Megatron\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 1megatron\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " megatron⋅1\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " 1megatron\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 17 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "keyCol.fingerprint(df,\"names\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"taaaccoo\": {\"similar\": {\"taaaccoo\": 1}, \"count\": 1, \"sum\": 1}, \"piza\": {\"similar\": {\"piza\": 1}, \"count\": 1, \"sum\": 1}, \"hamburguer\": {\"similar\": {\"hamburguer\": 1}, \"count\": 1, \"sum\": 1}, \"taco\": {\"similar\": {\"taco\": 3}, \"count\": 1, \"sum\": 3}, \"pizzza\": {\"similar\": {\"pizzza\": 1}, \"count\": 1, \"sum\": 1}, \"arepa\": {\"similar\": {\"arepa\": 1}, \"count\": 1, \"sum\": 1}, \"pizza\": {\"similar\": {\"pizza\": 4}, \"count\": 1, \"sum\": 4}, \"Rice\": {\"similar\": {\"Rice\": 1}, \"count\": 1, \"sum\": 1}, \"110790\": {\"similar\": {\"110790\": 1}, \"count\": 1, \"sum\": 1}, \"BEER\": {\"similar\": {\"BEER\": 1}, \"count\": 1, \"sum\": 1}, \"Cake\": {\"similar\": {\"Cake\": 1}, \"count\": 1, \"sum\": 1}, \"null\": {\"similar\": {\"null\": 1}, \"count\": 1, \"sum\": 1}, \"pasta\": {\"similar\": {\"pasta\": 2}, \"count\": 1, \"sum\": 2}}'" + ] + }, + "execution_count": 259, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keyCol.fingerprint_cluster(df,\"product\", output=\"json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"arepa\": {\"similar\": {\"arepa\": 1}, \"count\": 1, \"sum\": 1}, \"taaaccoo\": {\"similar\": {\"taaaccoo\": 1}, \"count\": 1, \"sum\": 1}, \"pasta\": {\"similar\": {\"pasta\": 2}, \"count\": 1, \"sum\": 2}, \"pizza\": {\"similar\": {\"pizzza\": 1, \"pizza\": 4}, \"count\": 2, \"sum\": 5}, \"110790\": {\"similar\": {\"110790\": 1}, \"count\": 1, \"sum\": 1}, \"hamburguer\": {\"similar\": {\"hamburguer\": 1}, \"count\": 1, \"sum\": 1}, \"taco\": {\"similar\": {\"taco\": 3}, \"count\": 1, \"sum\": 3}, \"Cake\": {\"similar\": {\"Cake\": 1}, \"count\": 1, \"sum\": 1}, \"Rice\": {\"similar\": {\"Rice\": 1}, \"count\": 1, \"sum\": 1}, \"piza\": {\"similar\": {\"piza\": 1}, \"count\": 1, \"sum\": 1}, \"null\": {\"similar\": {\"null\": 1}, \"count\": 1, \"sum\": 1}, \"BEER\": {\"similar\": {\"BEER\": 1}, \"count\": 1, \"sum\": 1}}'" + ] + }, + "execution_count": 261, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keyCol.n_gram_fingerprint_cluster(df,\"product\", output=\"json\",n_size=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.ml import keycollision as keyCol\n", + "from optimus.ml import distancecluster as dc" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"taaaccoo\": {\"similar\": {\"taco\": 3, \"taaaccoo\": 1}, \"count\": 2, \"sum\": 4}, \"piza\": {\"similar\": {\"pizza\": 4, \"piza\": 1}, \"count\": 2, \"sum\": 5}, \"hamburguer\": {\"similar\": {\"BEER\": 1, \"hamburguer\": 1}, \"count\": 2, \"sum\": 2}, \"taco\": {\"similar\": {\"Cake\": 1, \"Rice\": 1, \"taco\": 3}, \"count\": 3, \"sum\": 5}, \"pizzza\": {\"similar\": {\"pizza\": 4, \"pizzza\": 1}, \"count\": 2, \"sum\": 5}, \"arepa\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"pasta\": 2, \"Cake\": 1, \"Rice\": 1, \"pizza\": 4, \"arepa\": 1}, \"count\": 7, \"sum\": 11}, \"pizza\": {\"similar\": {\"piza\": 1, \"pizzza\": 1, \"pizza\": 4}, \"count\": 3, \"sum\": 6}, \"Rice\": {\"similar\": {\"piza\": 1, \"Cake\": 1, \"taco\": 3, \"Rice\": 1}, \"count\": 4, \"sum\": 6}, \"110790\": {\"similar\": {\"arepa\": 1, \"BEER\": 1, \"piza\": 1, \"pizzza\": 1, \"pasta\": 2, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"pizza\": 4, \"taco\": 3, \"110790\": 1}, \"count\": 11, \"sum\": 17}, \"BEER\": {\"similar\": {\"arepa\": 1, \"piza\": 1, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"taco\": 3, \"BEER\": 1}, \"count\": 7, \"sum\": 9}, \"Cake\": {\"similar\": {\"Rice\": 1, \"taco\": 3, \"Cake\": 1}, \"count\": 3, \"sum\": 5}, \"null\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"Cake\": 1, \"Rice\": 1, \"taco\": 3, \"null\": 1}, \"count\": 6, \"sum\": 8}, \"pasta\": {\"similar\": {\"piza\": 1, \"pizza\": 4, \"pasta\": 2}, \"count\": 3, \"sum\": 7}}'" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dc.levenshtein_cluster(df,\"product\", output=\"json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 4 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
count
\n", + "
1 (string)
\n", + "
\n", + " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
names
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
names***NGRAM
\n", + "
3 (array<string>)
\n", + "
\n", + " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
names***NGRAM_FINGERPRINT
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['bumblebee']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumblebee\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['ironhide']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron2\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['megatron2']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " megatron2\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optimus⋅OptimusPrime\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['optimusoptimusprime']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " optimusoptimusprime\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['megatron1']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " megatron1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " megatron\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 4 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'{\"ironhide&\": {\"similar\": {\"ironhide&\": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron1\": {\"similar\": {\"Megatron1\": 1}, \"count\": 1, \"sum\": 1.0}, \"Optimus OptimusPrime\": {\"similar\": {\"Optimus OptimusPrime\": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron\": {\"similar\": {\"Megatron\": 1}, \"count\": 1, \"sum\": 1.0}, \"bumbl#eb\\\\u00e9\\\\u00e9 \": {\"similar\": {\"bumbl#eb\\\\u00e9\\\\u00e9 \": 1}, \"count\": 1, \"sum\": 1.0}, \"Megatron2\": {\"similar\": {\"Megatron2\": 1}, \"count\": 1, \"sum\": 1.0}}'" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keyCol.n_gram_fingerprint_cluster(df,\"names\", n_size=1,output=\"json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
DateType
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optimus⋅OptimusPrime\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# df = op.load.csv(\"data/foo.csv\", sep=\",\", header='true', infer_schema='true', charset=\"UTF-8\", null_value=\"None\")" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
DateType
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optimus⋅OptimusPrime\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " JaJa⋅JaJaJ\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
names
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
height(ft)
\n", + "
2 (smallint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
rank
\n", + "
4 (tinyint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
age
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
weight(t)
\n", + "
6 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
japanese name
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last position seen
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
date arrival
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
last date seen
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
attributes
\n", + "
11 (array<float>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
DateType
\n", + "
12 (date)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp
\n", + "
13 (timestamp)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
Cybertronian
\n", + "
14 (boolean)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
function(binary)
\n", + "
15 (binary)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
NullType
\n", + "
16 (null)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Optimus⋅OptimusPrime\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 28\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Leader\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.300000190734863\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Inochi',⋅'Convoy']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19.442735,-99.201111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016/09/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [8.53439998626709,⋅4300.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2016-09-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Leader')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bumbl#ebéé⋅⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Espionage\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Bumble',⋅'Goldback']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10.642707,-71.612534\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015/08/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [5.334000110626221,⋅2000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2015-08-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Espionage')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ironhide&\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Security\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4.0\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Roadbuster']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 37.789563,-122.400356\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [7.924799919128418,⋅4000.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Security')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " aaa⋅JaJaJ\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " First⋅Lieutenant\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1.7999999523162842\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Meister']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 33.670666,-117.841553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013/06/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [3.962399959564209,⋅1800.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2013-06-24\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'First⋅Lieutenant')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Megatron\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5.699999809265137\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Megatron']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012/05/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [None,⋅5700.0]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2012-05-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'None')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Metroplex_)^$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 300\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Battle⋅Station\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5000000\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['Metroflex']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011/04/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " [91.44000244140625,⋅None]\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2011-04-10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2014-06-24⋅00:00:00\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " True\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " bytearray(b'Battle⋅Station')\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " None\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 6 of 6 rows / 16 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace(\"names\",[\"JaJa\",\"bbb\"],\"aaa\",search_by=\"words\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Send!\n" + ] + } + ], + "source": [ + "df.send()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Luis\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Alvarez$$%!\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 123\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Cake\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1980/07/07\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " never\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " André\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Ampère\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 423\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " piza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1950/07/08\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " gonna\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " NiELS\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Böhr//((%%\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 551\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1990/07/09\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " give\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " PAUL\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " dirac$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 521\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1954/07/10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " you\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Albert\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Einstein\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 634\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1990/07/11\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " up\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 6\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Galileo\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 672\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " arepa\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1930/08/12\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " never\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 7\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " CaRL\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Ga%%%uss\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 323\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taco\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1970/07/13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " gonna\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " David\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " H$$$ilbert\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 624\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taaaccoo\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1950/07/14\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " let\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 9\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Johannes\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " KEPLER\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 735\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taco\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1920/04/22\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " you\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " JaMES\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " M$$ax%%well\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 875\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taco\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1923/03/12\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " down\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 11\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Isaac\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Newton\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 992\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pasta\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 9\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1999/02/15\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " never⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 12\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Emmy%%\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Nöether$\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 234\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pasta\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 9\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1993/12/08\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " gonna\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 13\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Max!!!\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Planck!!!\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 111\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " hamburguer\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1994/01/04\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " run⋅\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 14\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Fred\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Hoy&&&le\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 553\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizzza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1997/06/27\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " around\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 15\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " (((⋅⋅⋅Heinrich⋅)))))\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Hertz\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 116\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 8\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1956/11/30\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " and\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 16\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " William\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Gilbert###\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 886\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " BEER\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1958/03/26\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " desert\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 17\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Marie\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " CURIE\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 912\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Rice\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 2000/03/22\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " you\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 18\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Arthur\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " COM%%%pton\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 812\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 110790\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 5\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1899/01/01\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " #\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 19\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " JAMES\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Chadwick\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 467\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " null\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 10\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1921/05/03\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " #\n", + " \n", + "
\n", + "
\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.outliers.z_score(\"price\",threshold =1).info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'count_outliers': 0,\n", + " 'count_non_outliers': 19,\n", + " 'lower_bound': -4.5,\n", + " 'lower_bound_count': 0,\n", + " 'upper_bound': 15.5,\n", + " 'upper_bound_count': 0,\n", + " 'iqr1': 3,\n", + " 'iqr3': 8}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.outliers.tukey(\"price\").info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'count_outliers': 9,\n", + " 'count_non_outliers': 19,\n", + " 'lower_bound': 6,\n", + " 'lower_bound_count': 9,\n", + " 'upper_bound': 10,\n", + " 'upper_bound_count': 0}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.outliers.mad(\"price\", threshold =1).info()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.outliers.modified_z_score(\"price\",threshold =1).info()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"taaaccoo\": {\"similar\": {\"taco\": 3, \"taaaccoo\": 1}, \"count\": 2, \"sum\": 4}, \"piza\": {\"similar\": {\"pizza\": 4, \"piza\": 1}, \"count\": 2, \"sum\": 5}, \"hamburguer\": {\"similar\": {\"BEER\": 1, \"hamburguer\": 1}, \"count\": 2, \"sum\": 2}, \"taco\": {\"similar\": {\"Cake\": 1, \"Rice\": 1, \"taco\": 3}, \"count\": 3, \"sum\": 5}, \"pizzza\": {\"similar\": {\"pizza\": 4, \"pizzza\": 1}, \"count\": 2, \"sum\": 5}, \"arepa\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"pasta\": 2, \"Cake\": 1, \"Rice\": 1, \"pizza\": 4, \"arepa\": 1}, \"count\": 7, \"sum\": 11}, \"pizza\": {\"similar\": {\"piza\": 1, \"pizzza\": 1, \"pizza\": 4}, \"count\": 3, \"sum\": 6}, \"Rice\": {\"similar\": {\"piza\": 1, \"Cake\": 1, \"taco\": 3, \"Rice\": 1}, \"count\": 4, \"sum\": 6}, \"110790\": {\"similar\": {\"arepa\": 1, \"BEER\": 1, \"piza\": 1, \"pizzza\": 1, \"pasta\": 2, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"pizza\": 4, \"taco\": 3, \"110790\": 1}, \"count\": 11, \"sum\": 17}, \"BEER\": {\"similar\": {\"arepa\": 1, \"piza\": 1, \"Cake\": 1, \"null\": 1, \"Rice\": 1, \"taco\": 3, \"BEER\": 1}, \"count\": 7, \"sum\": 9}, \"Cake\": {\"similar\": {\"Rice\": 1, \"taco\": 3, \"Cake\": 1}, \"count\": 3, \"sum\": 5}, \"null\": {\"similar\": {\"BEER\": 1, \"piza\": 1, \"Cake\": 1, \"Rice\": 1, \"taco\": 3, \"null\": 1}, \"count\": 6, \"sum\": 8}, \"pasta\": {\"similar\": {\"piza\": 1, \"pizza\": 4, \"pasta\": 2}, \"count\": 3, \"sum\": 7}}\n", + "Wall time: 9.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "from optimus.ml import distancecluster as dc\n", + "print(dc.levenshtein_cluster(df,'product',output=\"json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "from optimus.ml import distancecluster as dc\n", + "from optimus.ml import keycollision as kc\n", + "\n", + "# result = dc.levenshtein_json(df,'product')\n", + "result = kc.fingerprint_cluster(df, \"product\",3)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 10 of 13 rows / 4 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
count
\n", + "
1 (string)
\n", + "
\n", + " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product***NGRAM
\n", + "
3 (array<string>)
\n", + "
\n", + " \n", + " not nullable\n", + " \n", + "
\n", + "
\n", + "
product***NGRAM_FINGERPRINT
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taaaccoo\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['taaaccoo']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taaaccoo\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " piza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['piza']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " piza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " hamburguer\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['hamburguer']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " hamburguer\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 3\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taco\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['taco']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " taco\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " BEER\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['beer']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " beer\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizzza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['pizzza']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizzza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " arepa\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['arepa']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " arepa\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 4\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['pizza']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " pizza\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " Rice\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['rice']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " rice\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 1\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 110790\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " ['110790']\n", + " \n", + "
\n", + "
\n", + "
\n", + " \n", + " 110790\n", + " \n", + "
\n", + "
\n", "\n", "\n", - "
Viewing 10 of 319,073 rows / 17 columns
\n", - "
8 partition(s)
\n" + "
Viewing 10 of 13 rows / 4 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -1695,7 +7556,136 @@ } ], "source": [ - "df.table()" + "result = kc.n_gram_fingerprint_cluster(df, \"product\",3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}\n" + ] + } + ], + "source": [ + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "str" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['taaaccoo', 1]\n", + "['piza', 1]\n", + "['hamburguer', 1]\n", + "['taco', 3]\n", + "['BEER', 1]\n", + "['pizzza', 1]\n", + "['arepa', 1]\n", + "['pizza', 4]\n", + "['Rice', 1]\n", + "['110790', 1]\n", + "['Cake', 1]\n", + "['null', 1]\n", + "['pasta', 2]\n" + ] + } + ], + "source": [ + "kv_dict ={}\n", + "for row in result.collect():\n", + " _row = list(row.asDict().values())\n", + " print(_row)\n", + " kv_dict[_row[0]] = _row[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}\n" + ] + } + ], + "source": [ + "print(kv_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'str' object has no attribute 'cols'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"product***LEVENSHTEIN_DISTANCE\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'cols'" + ] + } + ], + "source": [ + "a.cols.replace(\"product***LEVENSHTEIN_DISTANCE\", 0, None).table()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'str' object has no attribute 'rows'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0ma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwhere\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product_LEVENSHTEIN_1\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m!=\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product_LEVENSHTEIN_2\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m&\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"product***LEVENSHTEIN_DISTANCE\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m==\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'rows'" + ] + } + ], + "source": [ + "a.rows.drop(where=((a[\"product_LEVENSHTEIN_1\"]!=a[\"product_LEVENSHTEIN_2\"])& (a[\"product***LEVENSHTEIN_DISTANCE\"]==0))).table()" ] }, {