diff --git a/data-ingestion-and-preparation/dask-cluster.ipynb b/data-ingestion-and-preparation/dask-cluster.ipynb index 5247c8ed..cd6cd292 100644 --- a/data-ingestion-and-preparation/dask-cluster.ipynb +++ b/data-ingestion-and-preparation/dask-cluster.ipynb @@ -41,7 +41,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Both server & client are aligned (0.6.0rc13).\n" + "Both server & client are aligned (0.7.1).\n" ] } ], @@ -79,7 +79,7 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 84.9M 100 84.9M 0 0 8206k 0 0:00:10 0:00:10 --:--:-- 8639k\n" + "100 84.9M 100 84.9M 0 0 21.7M 0 0:00:03 0:00:03 --:--:-- 21.7M\n" ] } ], @@ -103,26 +103,6 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%nuclio: setting kind to 'job'\n", - "%nuclio: setting spec.image to 'mlrun/ml-models'\n" - ] - } - ], - "source": [ - "# nuclio: start-code\n", - "%nuclio config kind = \"job\"\n", - "%nuclio config spec.image = \"mlrun/ml-models\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, "outputs": [], "source": [ "from mlrun.execution import MLClientCtx\n", @@ -157,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -194,9 +174,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2021-10-12 09:43:54,745 [info] loaded project default from MLRun DB\n" + ] + } + ], "source": [ "import mlrun\n", "artifact_path = mlrun.set_environment(api_path = mlrun.mlconf.dbpath or 'http://mlrun-api:8080',\n", @@ -220,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -236,24 +224,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 2021-02-04 19:07:34,933 [info] using in-cluster config.\n" - ] - } - ], + "outputs": [], "source": [ "dsf = mlrun.new_function(\"dask_init\", kind='dask', image='mlrun/ml-models').apply(mlrun.mount_v3io())" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -266,21 +246,21 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2021-02-04 19:07:35,158 [info] trying dask client at: tcp://mlrun-dask-init-b7725876-7.default-tenant:8786\n", - "> 2021-02-04 19:07:35,234 [info] using remote dask scheduler (mlrun-dask-init-b7725876-7) at: tcp://mlrun-dask-init-b7725876-7.default-tenant:8786\n" + "> 2021-10-12 09:44:06,095 [info] trying dask client at: tcp://mlrun-dask-init-b76f44f5-0.default-tenant:8786\n", + "> 2021-10-12 09:44:06,116 [info] using remote dask scheduler (mlrun-dask-init-b76f44f5-0) at: tcp://mlrun-dask-init-b76f44f5-0.default-tenant:8786\n" ] }, { "data": { "text/html": [ - "dashboard link: default-tenant.app.bsmhzmwkycjf.iguazio-cd2.com:30053" + "dashboard link: default-tenant.app.app-lab-eks-testing.iguazio-cd1.com:30990" ], "text/plain": 
[ "" @@ -297,8 +277,8 @@ "\n", "

[Dask Client / Cluster HTML summary card]
\n", "\n", "\n", "\n", @@ -306,17 +286,17 @@ "\n", "\n", "\n", "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -335,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -360,16 +340,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> 2021-02-04 19:07:35,288 [info] starting run test-dask-test_dask uid=4973d57c968747e491d6fcaa401968e0 DB=http://mlrun-api:8080\n", - "> 2021-02-04 19:07:35,769 [info] Job is running in the background, pod: test-dask-test-dask-4m29c\n", - "> 2021-02-04 19:07:56,270 [info] run executed, status=completed\n", + "> 2021-10-12 09:44:06,154 [info] starting run test-dask-test_dask uid=3d3ec4b558a140b5849700dc5d77c97c DB=http://mlrun-api:8080\n", + "> 2021-10-12 09:44:06,317 [info] Job is running in the background, pod: test-dask-test-dask-w28ff\n", + "> 2021-10-12 09:44:20,436 [info] run executed, status=completed\n", "final state: completed\n" ] }, @@ -544,26 +524,26 @@ " \n", " \n", " default\n", - "
[old run-summary row: UID ...401968e0 | iter 0 | Feb 04 19:07:42 | completed | test-dask-test_dask | v3io_user=admin, kind=job, owner=admin, host=test-dask-test-dask-4m29c | artifacts: dataset | dask_client=tcp://mlrun-dask-init-b7725876-7.default-tenant:8786]
[new run-summary row: UID ...5d77c97c | iter 0 | Oct 12 09:44:11 | completed | test-dask-test_dask | v3io_user=dani, kind=job, owner=dani, host=test-dask-test-dask-w28ff | artifacts: dataset | dask_client=tcp://mlrun-dask-init-b76f44f5-0.default-tenant:8786]
[run-summary pop-up markup ("Title", "×") and table glue]
\n" ], @@ -578,18 +558,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "to track results use .show() or .logs() or in CLI: \n", - "!mlrun get run 4973d57c968747e491d6fcaa401968e0 --project default , !mlrun logs 4973d57c968747e491d6fcaa401968e0 --project default\n", - "> 2021-02-04 19:08:05,372 [info] run executed, status=completed\n" + "\n" + ] + }, + { + "data": { + "text/html": [ + " > to track results use the .show() or .logs() methods or click here to open in UI" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 2021-10-12 09:44:25,557 [info] run executed, status=completed\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } diff --git a/data-ingestion-and-preparation/parquet-to-hive.ipynb b/data-ingestion-and-preparation/parquet-to-hive.ipynb index e9d65f76..500514e0 100644 --- a/data-ingestion-and-preparation/parquet-to-hive.ipynb +++ b/data-ingestion-and-preparation/parquet-to-hive.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,19 +50,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def getCreateTableScript(databaseName, tableName, path, df):\n", - " cols = df.dtypes\n", - " createScript = \"CREATE EXTERNAL TABLE IF NOT EXISTS \" + databaseName + \".\" + tableName + \"(\"\n", + "def getCreateTableScript(databaseName, tableName, path, df, partitions=[]):\n", + " #remove partition columns from the df to avoid repetition exception\n", + " partition_names = map(lambda x: x.split(' ')[0] , partitions )\n", + " ndf = df.drop(*partition_names)\n", + " \n", + " cols = ndf.dtypes\n", + " createScript = \"CREATE EXTERNAL TABLE \" + databaseName + \".\" + tableName + \"(\"\n", " colArray = []\n", " for colName, colType in cols:\n", " colArray.append(colName.replace(\" \", \"_\") + \" \" + colType)\n", - " createColsScript = \", \".join(colArray )\n", - " \n", - " script = createScript + createColsScript + \") STORED AS PARQUET LOCATION '\" + path + \"'\"\n", + " createColsScript = \", \".join(colArray ) + \") \"\n", + " partitionBy = \"\"\n", + " if len(partitions) > 0:\n", + " partitionBy = \"PARTITIONED BY (\" + \", \".join(partitions) + \") \"\n", + " script = createScript + createColsScript + partitionBy + \" STORED AS PARQUET LOCATION '\" + path + \"'\"\n", " print(script)\n", " return script\n", " " @@ -70,15 +76,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#define main function for creating table where arqument 'path' is path to parquet files \n", - "def createTable(databaseName, tableName, path): \n", + "def createTable(databaseName, tableName, path, partitions=[]): \n", " df = spark.read.parquet(path)\n", - " sqlScript = getCreateTableScript(databaseName, tableName, path, df)\n", - " spark.sql(sqlScript)" + " sqlScript = getCreateTableScript(databaseName, tableName, path, df, partitions)\n", + " spark.sql(sqlScript)\n", + " if len(partitions) > 0:\n", + " spark.sql(f'msck repair table {databaseName}.{tableName}')" ] }, { @@ -90,17 +98,9 @@ }, { "cell_type": "code", - 
"execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CREATE EXTERNAL TABLE IF NOT EXISTS default.tab1_single_file(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/userdata1.parquet'\n" - ] - } - ], + "outputs": [], "source": [ "# Set the path where the parquet file is located.\n", "my_parqute_file_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/userdata1.parquet')\n", @@ -108,6 +108,15 @@ "createTable(\"default\",\"tab1_single_file\",my_parqute_file_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%sql select * from hive.default.tab1_single_file limit 10" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -117,345 +126,93 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CREATE EXTERNAL TABLE IF NOT EXISTS default.table_from_dir(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/spark-output/*'\n" - ] - } - ], + "outputs": [], "source": [ "# Set the path where the parquet folder is located.\n", - "folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/spark-output/*')\n", + "folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/spark-output/')\n", "\n", "createTable(\"default\",\"table_from_dir\",folder_path)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%sql select * from hive.default.table_from_dir limit 10" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Multiple files and folders example\n", + "# Partitioned parquet example\n", "\n", - "In this example change the name of the database and path to the folder where all parquet files (or folders with them) are located.
\n", - "This code goes over all files and dirs in the provided path and uses them for creating tables.\n", - "File should be ended with .parquet format\n", - "Directory (in which stored parquet files) should be started with \".\"\n", - "Name of directory or file will be name of table." + "Table partitioning is a common optimization approach used in systems like Hive. In a partitioned table, data are usually stored in different directories, with partitioning column values encoded in the path of each partition directory." ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'v3io://users/adi/examples/multiple-parquet-files'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mfileOrDir\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".parquet\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatabaseName\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".parquet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/v3io/\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"v3io://\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'v3io://users/adi/examples/multiple-parquet-files'" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "databaseName = \"default\"\n", - "filepath = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/multiple-parquet-files')\n", - "\n", - "\n", - "for fileOrDir in os.listdir(filepath):\n", - " if fileOrDir.endswith(\".parquet\") :\n", - " createTable(databaseName, fileOrDir.split(\".parquet\")[0], filepath.replace(\"/v3io/\", \"v3io://\", 1) + \"/\" + fileOrDir)\n", - " elif not fileOrDir.startswith(\".\") :\n", - " createTable(databaseName, fileOrDir, filepath.replace(\"/v3io/\", \"v3io://\", 1) + \"/\" + fileOrDir + \"/*\")\n", - "\n" + "# Set path where parquet folder with parquet partitions are located indside.\n", + "folder_path = 
os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/partitioned_pq')\n", + "#provide list of partitions and their type\n", + "partition_list = [\"gender string\"] \n", + "createTable(\"default\", \"partitioned_table\", folder_path, partition_list)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# Test how it works" + "%sql select * from hive.default.partitioned_table limit 10" ] }, { - "cell_type": "code", - "execution_count": 7, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+\n", - "|databaseName|\n", - "+------------+\n", - "| default|\n", - "+------------+\n", - "\n", - "+--------+----------------+-----------+\n", - "|database| tableName|isTemporary|\n", - "+--------+----------------+-----------+\n", - "| default|tab1_single_file| false|\n", - "| default| table_from_dir| false|\n", - "+--------+----------------+-----------+\n", - "\n" - ] - } - ], "source": [ - "# test how the tables were saved\n", - "#spark.sql(\"drop database test CASCADE\")\n", - "databaseName = \"default\"\n", - "\n", - "spark.sql(\"show databases\").show()\n", - "spark.sql(\"show tables in \" + databaseName).show()" + "# Adding new partitions" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[registration_dttm: timestamp, id: int, first_name: string, last_name: string, email: string, gender: string, ip_address: string, cc: string, country: string, birthdate: string, salary: double, title: string, comments: string]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# test how saving to table works\n", - "tableName = \"table_from_dir\"\n", - "spark.sql(\"select * from \" + databaseName + \".\" + tableName)" + "#Once added new partitions to the table, \n", + "# it is required to run the below command in order for the hive metastore to be aware of the new files.\n", + "spark.sql('msck repair table default.partitioned_table')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Run select via Hive" + "# Browse the Metastore" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
registration_dttmidfirst_namelast_nameemailgenderip_addresscccountrybirthdatesalarytitlecomments
2016-02-03 07:55:29.0001AmandaJordanajordan0@com.comFemale1.197.201.26759521864920116Indonesia3/8/197149756.53Internal Auditor1E+02
2016-02-03 06:47:06.0008HarryHowellhhowell7@eepurl.comMale91.235.51.73Bosnia and Herzegovina3/1/1962186469.43Web Developer IV
2016-02-03 03:52:53.0009JoseFosterjfoster8@yelp.comMale132.31.53.61South Korea3/27/1992231067.84Software Test Engineer I1E+02
2016-02-03 18:29:47.00010EmilyStewartestewart9@opensource.orgFemale143.28.251.2453574254110301671Nigeria1/28/199727234.28Health Coach IV
2016-02-03 00:36:21.0004DeniseRileydriley3@gmpg.orgFemale140.35.109.833576031598965625China4/8/199790263.05Senior Cost Accountant
2016-02-03 05:05:31.0005CarlosBurnscburns4@miitbeian.gov.cn169.113.235.405602256255204850South AfricaNone
2016-02-03 07:22:34.0006KathrynWhitekwhite5@google.comFemale195.131.81.1793583136326049310Indonesia2/25/198369227.11Account Executive
2016-02-03 08:33:08.0007SamuelHolmessholmes6@foxnews.comMale232.234.81.1973582641366974690Portugal12/18/198714247.62Senior Financial Analyst
2016-02-03 17:04:03.0002AlbertFreemanafreeman1@is.gdMale218.111.175.34Canada1/16/1968150280.17Accountant IV
2016-02-03 01:09:31.0003EvelynMorganemorgan2@altervista.orgFemale7.161.136.946767119071901597Russia2/1/1960144972.51Structural Engineer
" - ], - "text/plain": [ - "[('2016-02-03 07:55:29.000', 1, 'Amanda', 'Jordan', 'ajordan0@com.com', 'Female', '1.197.201.2', '6759521864920116', 'Indonesia', '3/8/1971', 49756.53, 'Internal Auditor', '1E+02'),\n", - " ('2016-02-03 06:47:06.000', 8, 'Harry', 'Howell', 'hhowell7@eepurl.com', 'Male', '91.235.51.73', '', 'Bosnia and Herzegovina', '3/1/1962', 186469.43, 'Web Developer IV', ''),\n", - " ('2016-02-03 03:52:53.000', 9, 'Jose', 'Foster', 'jfoster8@yelp.com', 'Male', '132.31.53.61', '', 'South Korea', '3/27/1992', 231067.84, 'Software Test Engineer I', '1E+02'),\n", - " ('2016-02-03 18:29:47.000', 10, 'Emily', 'Stewart', 'estewart9@opensource.org', 'Female', '143.28.251.245', '3574254110301671', 'Nigeria', '1/28/1997', 27234.28, 'Health Coach IV', ''),\n", - " ('2016-02-03 00:36:21.000', 4, 'Denise', 'Riley', 'driley3@gmpg.org', 'Female', '140.35.109.83', '3576031598965625', 'China', '4/8/1997', 90263.05, 'Senior Cost Accountant', ''),\n", - " ('2016-02-03 05:05:31.000', 5, 'Carlos', 'Burns', 'cburns4@miitbeian.gov.cn', '', '169.113.235.40', '5602256255204850', 'South Africa', '', None, '', ''),\n", - " ('2016-02-03 07:22:34.000', 6, 'Kathryn', 'White', 'kwhite5@google.com', 'Female', '195.131.81.179', '3583136326049310', 'Indonesia', '2/25/1983', 69227.11, 'Account Executive', ''),\n", - " ('2016-02-03 08:33:08.000', 7, 'Samuel', 'Holmes', 'sholmes6@foxnews.com', 'Male', '232.234.81.197', '3582641366974690', 'Portugal', '12/18/1987', 14247.62, 'Senior Financial Analyst', ''),\n", - " ('2016-02-03 17:04:03.000', 2, 'Albert', 'Freeman', 'afreeman1@is.gd', 'Male', '218.111.175.34', '', 'Canada', '1/16/1968', 150280.17, 'Accountant IV', ''),\n", - " ('2016-02-03 01:09:31.000', 3, 'Evelyn', 'Morgan', 'emorgan2@altervista.org', 'Female', '7.161.136.94', '6767119071901597', 'Russia', '2/1/1960', 144972.51, 'Structural Engineer', '')]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "%sql select * from hive.default.tab1_single_file limit 10" + "# test how the tables were saved\n", + "#spark.sql(\"drop database test CASCADE\")\n", + "databaseName = \"default\"\n", + "\n", + "spark.sql(\"show databases\").show()\n", + "spark.sql(\"show tables in \" + databaseName).show()" ] }, { @@ -475,45 +232,41 @@ "e.g. select * from table_from_single_file2;" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup\n", + "This will only clean the metastore definitions.\n", + "
The underlying data won't be affected." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"drop table \" + databaseName + \".tab1_single_file\")" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"drop table \" + databaseName + \".table_from_dir\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"drop table \" + databaseName + \".partitioned_table\")" + ] } ], "metadata": { diff --git a/examples/multiple-parquet-files/userdata1.parquet b/examples/multiple-parquet-files/userdata1.parquet deleted file mode 100644 index 2ae23dac..00000000 Binary files a/examples/multiple-parquet-files/userdata1.parquet and /dev/null differ diff --git a/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..80b6ecb9 Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..eb4259d2 Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..3e1558a8 Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..9016f9c9 Binary files /dev/null and b/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..1ba846d4 Binary files /dev/null and b/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet new file mode 100755 index 00000000..10a5eb6d Binary files 
/dev/null and b/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ diff --git a/examples/multiple-parquet-files/userdata2.parquet b/examples/spark-output/userdata2.parquet similarity index 100% rename from examples/multiple-parquet-files/userdata2.parquet rename to examples/spark-output/userdata2.parquet diff --git a/examples/multiple-parquet-files/userdata3.parquet b/examples/spark-output/userdata3.parquet similarity index 100% rename from examples/multiple-parquet-files/userdata3.parquet rename to examples/spark-output/userdata3.parquet diff --git a/igz-tutorials-get.sh b/igz-tutorials-get.sh old mode 100755 new mode 100644 index c41d43a1..8bfb1ec0 --- a/igz-tutorials-get.sh +++ b/igz-tutorials-get.sh @@ -86,9 +86,10 @@ fi if [ -z "${branch}" ]; then platform_version="${IGZ_VERSION%%_*}" echo "Detected platform version: ${platform_version}" - latest_tag=`git ls-remote --tags --refs --sort='v:refname' "${git_url}" "refs/tags/v${platform_version}.*" | tail -n1 | awk '{ print $2}'` + tag_prefix=`echo ${platform_version} | cut -d . -f1-2` + latest_tag=`git ls-remote --tags --refs --sort='v:refname' "${git_url}" "refs/tags/v${tag_prefix}.*" | tail -n1 | awk '{ print $2}'` if [ -z "${latest_tag}" ]; then - error_exit "Couldn't locate a Git tag with prefix 'v${platform_version}.*'. Aborting..." + error_exit "Couldn't locate a Git tag with prefix 'v${tag_prefix}.*'. Aborting..." else # Remove the prefix from the Git tag branch=${latest_tag#refs/tags/} diff --git a/update-demos.sh b/update-demos.sh old mode 100755 new mode 100644 index e1151f9f..6f12f0ab --- a/update-demos.sh +++ b/update-demos.sh @@ -93,7 +93,7 @@ do --mlrun-ver=?*) mlrun_version=${1#*=} # Delete everything up to "=" and assign the remainder. ;; - --umlrun-ver=) # Handle the case of an empty --mlrun-ver= + --mlrun-ver=) # Handle the case of an empty --mlrun-ver= error_usage "$1: Missing MLRun version." ;; --dry-run)