"
@@ -297,8 +277,8 @@
"\n",
"Client\n",
"\n",
" | \n",
"\n",
@@ -306,17 +286,17 @@
"\n",
" - Workers: 2
\n",
" - Cores: 2
\n",
- " - Memory: 8.31 GB
\n",
+ " - Memory: 8.34 GB
\n",
" \n",
" | \n",
"\n",
""
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -335,7 +315,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -360,16 +340,16 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "> 2021-02-04 19:07:35,288 [info] starting run test-dask-test_dask uid=4973d57c968747e491d6fcaa401968e0 DB=http://mlrun-api:8080\n",
- "> 2021-02-04 19:07:35,769 [info] Job is running in the background, pod: test-dask-test-dask-4m29c\n",
- "> 2021-02-04 19:07:56,270 [info] run executed, status=completed\n",
+ "> 2021-10-12 09:44:06,154 [info] starting run test-dask-test_dask uid=3d3ec4b558a140b5849700dc5d77c97c DB=http://mlrun-api:8080\n",
+ "> 2021-10-12 09:44:06,317 [info] Job is running in the background, pod: test-dask-test-dask-w28ff\n",
+ "> 2021-10-12 09:44:20,436 [info] run executed, status=completed\n",
"final state: completed\n"
]
},
@@ -544,26 +524,26 @@
" \n",
" \n",
" default | \n",
- " | \n",
+ " | \n",
" 0 | \n",
- " Feb 04 19:07:42 | \n",
+ " Oct 12 09:44:11 | \n",
" completed | \n",
" test-dask-test_dask | \n",
- " v3io_user=admin kind=job owner=admin host=test-dask-test-dask-4m29c | \n",
- " dataset | \n",
- " dask_client=tcp://mlrun-dask-init-b7725876-7.default-tenant:8786 | \n",
+ " v3io_user=dani kind=job owner=dani host=test-dask-test-dask-w28ff | \n",
+ " dataset | \n",
+ " dask_client=tcp://mlrun-dask-init-b76f44f5-0.default-tenant:8786 | \n",
" | \n",
" | \n",
"
\n",
" \n",
"\n",
"\n",
- " \n",
+ "
\n",
" \n",
- " \n",
+ " \n",
"
\n",
"
\n"
],
@@ -578,18 +558,35 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "to track results use .show() or .logs() or in CLI: \n",
- "!mlrun get run 4973d57c968747e491d6fcaa401968e0 --project default , !mlrun logs 4973d57c968747e491d6fcaa401968e0 --project default\n",
- "> 2021-02-04 19:08:05,372 [info] run executed, status=completed\n"
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ " > to track results use the .show() or .logs() methods or click here to open in UI"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "> 2021-10-12 09:44:25,557 [info] run executed, status=completed\n"
]
},
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 15,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/data-ingestion-and-preparation/parquet-to-hive.ipynb b/data-ingestion-and-preparation/parquet-to-hive.ipynb
index e9d65f76..500514e0 100644
--- a/data-ingestion-and-preparation/parquet-to-hive.ipynb
+++ b/data-ingestion-and-preparation/parquet-to-hive.ipynb
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -33,7 +33,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -50,19 +50,25 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "def getCreateTableScript(databaseName, tableName, path, df):\n",
- " cols = df.dtypes\n",
- " createScript = \"CREATE EXTERNAL TABLE IF NOT EXISTS \" + databaseName + \".\" + tableName + \"(\"\n",
+ "def getCreateTableScript(databaseName, tableName, path, df, partitions=[]):\n",
+    "    # Remove the partition columns from the df to avoid a duplicate-column exception\n",
+    "    partition_names = map(lambda x: x.split(' ')[0], partitions)\n",
+ " ndf = df.drop(*partition_names)\n",
+ " \n",
+ " cols = ndf.dtypes\n",
+ " createScript = \"CREATE EXTERNAL TABLE \" + databaseName + \".\" + tableName + \"(\"\n",
" colArray = []\n",
" for colName, colType in cols:\n",
" colArray.append(colName.replace(\" \", \"_\") + \" \" + colType)\n",
- " createColsScript = \", \".join(colArray )\n",
- " \n",
- " script = createScript + createColsScript + \") STORED AS PARQUET LOCATION '\" + path + \"'\"\n",
+    "    createColsScript = \", \".join(colArray) + \") \"\n",
+    "    partitionBy = \"\"\n",
+    "    if len(partitions) > 0:\n",
+    "        partitionBy = \"PARTITIONED BY (\" + \", \".join(partitions) + \") \"\n",
+    "    script = createScript + createColsScript + partitionBy + \"STORED AS PARQUET LOCATION '\" + path + \"'\"\n",
" print(script)\n",
" return script\n",
" "
@@ -70,15 +76,17 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#define main function for creating table where arqument 'path' is path to parquet files \n",
- "def createTable(databaseName, tableName, path): \n",
+ "def createTable(databaseName, tableName, path, partitions=[]): \n",
" df = spark.read.parquet(path)\n",
- " sqlScript = getCreateTableScript(databaseName, tableName, path, df)\n",
- " spark.sql(sqlScript)"
+ " sqlScript = getCreateTableScript(databaseName, tableName, path, df, partitions)\n",
+ " spark.sql(sqlScript)\n",
+ " if len(partitions) > 0:\n",
+ " spark.sql(f'msck repair table {databaseName}.{tableName}')"
]
},
{
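To see what the `msck repair table` step contributes, a hedged sketch, assuming an active `spark` session and a table created over a gender-partitioned layout (table name as in the cells below):

```python
# Expected layout under the table LOCATION for a gender-partitioned table:
#   <location>/gender=Female/part-*.snappy.parquet
#   <location>/gender=Male/part-*.snappy.parquet
#
# MSCK REPAIR TABLE walks this layout and registers every directory it finds
# as a partition; until then, a freshly created partitioned table has no
# partitions and queries against it return no rows.
spark.sql("msck repair table default.partitioned_table")
spark.sql("show partitions default.partitioned_table").show()
# Expect one row per directory, e.g. gender=Female and gender=Male.
```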
@@ -90,17 +98,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CREATE EXTERNAL TABLE IF NOT EXISTS default.tab1_single_file(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/userdata1.parquet'\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Set the path where the parquet file is located.\n",
"my_parqute_file_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/userdata1.parquet')\n",
@@ -108,6 +108,15 @@
"createTable(\"default\",\"tab1_single_file\",my_parqute_file_path)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%sql select * from hive.default.tab1_single_file limit 10"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -117,345 +126,93 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CREATE EXTERNAL TABLE IF NOT EXISTS default.table_from_dir(registration_dttm timestamp, id int, first_name string, last_name string, email string, gender string, ip_address string, cc string, country string, birthdate string, salary double, title string, comments string) STORED AS PARQUET LOCATION 'v3io://users/adi/examples/spark-output/*'\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Set the path where the parquet folder is located.\n",
- "folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/spark-output/*')\n",
+ "folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/spark-output/')\n",
"\n",
"createTable(\"default\",\"table_from_dir\",folder_path)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%sql select * from hive.default.table_from_dir limit 10"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Multiple files and folders example\n",
+ "# Partitioned parquet example\n",
"\n",
-    "In this example change the name of the database and path to the folder where all parquet files (or folders with them) are located.\n",
- "This code goes over all files and dirs in the provided path and uses them for creating tables.\n",
- "File should be ended with .parquet format\n",
- "Directory (in which stored parquet files) should be started with \".\"\n",
- "Name of directory or file will be name of table."
+ "Table partitioning is a common optimization approach used in systems like Hive. In a partitioned table, data are usually stored in different directories, with partitioning column values encoded in the path of each partition directory."
]
},
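For context, a short sketch of how a layout like `examples/partitioned_pq` can be produced with Spark's own writer (the session setup, sample rows, and output path are illustrative; in the platform notebook `spark` already exists):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("partition-demo").enableHiveSupport().getOrCreate()
df = spark.createDataFrame(
    [(1, "Amanda", "Female"), (2, "Albert", "Male"), (3, "Evelyn", "Female")],
    ["id", "first_name", "gender"])

# partitionBy encodes each distinct gender value into the directory path:
#   .../partitioned_pq/gender=Female/part-*.parquet
#   .../partitioned_pq/gender=Male/part-*.parquet
df.write.mode("overwrite").partitionBy("gender").parquet(
    "v3io://users/someuser/examples/partitioned_pq")
```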
{
"cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": true,
- "jupyter": {
- "outputs_hidden": true
- }
- },
- "outputs": [
- {
- "ename": "FileNotFoundError",
- "evalue": "[Errno 2] No such file or directory: 'v3io://users/adi/examples/multiple-parquet-files'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mfileOrDir\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".parquet\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mcreateTable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatabaseName\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".parquet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/v3io/\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"v3io://\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfileOrDir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'v3io://users/adi/examples/multiple-parquet-files'"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
- "databaseName = \"default\"\n",
- "filepath = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/multiple-parquet-files')\n",
- "\n",
- "\n",
- "for fileOrDir in os.listdir(filepath):\n",
- " if fileOrDir.endswith(\".parquet\") :\n",
- " createTable(databaseName, fileOrDir.split(\".parquet\")[0], filepath.replace(\"/v3io/\", \"v3io://\", 1) + \"/\" + fileOrDir)\n",
- " elif not fileOrDir.startswith(\".\") :\n",
- " createTable(databaseName, fileOrDir, filepath.replace(\"/v3io/\", \"v3io://\", 1) + \"/\" + fileOrDir + \"/*\")\n",
- "\n"
+    "# Set the path to the parquet folder that contains the partition directories.\n",
+    "folder_path = os.path.join('v3io://users/'+os.getenv('V3IO_USERNAME')+'/examples/partitioned_pq')\n",
+    "# Provide a list of the partition columns and their types\n",
+ "partition_list = [\"gender string\"] \n",
+ "createTable(\"default\", \"partitioned_table\", folder_path, partition_list)"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
+ "outputs": [],
"source": [
- "# Test how it works"
+ "%sql select * from hive.default.partitioned_table limit 10"
]
},
{
- "cell_type": "code",
- "execution_count": 7,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+------------+\n",
- "|databaseName|\n",
- "+------------+\n",
- "| default|\n",
- "+------------+\n",
- "\n",
- "+--------+----------------+-----------+\n",
- "|database| tableName|isTemporary|\n",
- "+--------+----------------+-----------+\n",
- "| default|tab1_single_file| false|\n",
- "| default| table_from_dir| false|\n",
- "+--------+----------------+-----------+\n",
- "\n"
- ]
- }
- ],
"source": [
- "# test how the tables were saved\n",
- "#spark.sql(\"drop database test CASCADE\")\n",
- "databaseName = \"default\"\n",
- "\n",
- "spark.sql(\"show databases\").show()\n",
- "spark.sql(\"show tables in \" + databaseName).show()"
+ "# Adding new partitions"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DataFrame[registration_dttm: timestamp, id: int, first_name: string, last_name: string, email: string, gender: string, ip_address: string, cc: string, country: string, birthdate: string, salary: double, title: string, comments: string]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "# test how saving to table works\n",
- "tableName = \"table_from_dir\"\n",
- "spark.sql(\"select * from \" + databaseName + \".\" + tableName)"
+    "# After adding new partitions to the table, run the command below\n",
+    "# so that the Hive metastore becomes aware of the new files.\n",
+ "spark.sql('msck repair table default.partitioned_table')"
]
},
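As an alternative to rescanning the whole location with `msck repair table`, a single known partition can be registered explicitly with standard Hive DDL; a hedged sketch (the partition value and location are illustrative):

```python
# Registers one partition directly, without scanning the table location.
spark.sql("""
    ALTER TABLE default.partitioned_table
    ADD IF NOT EXISTS PARTITION (gender='Other')
    LOCATION 'v3io://users/someuser/examples/partitioned_pq/gender=Other'
""")
```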
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Run select via Hive"
+ "# Browse the Metastore"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Done.\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " registration_dttm | \n",
- " id | \n",
- " first_name | \n",
- " last_name | \n",
- " email | \n",
- " gender | \n",
- " ip_address | \n",
- " cc | \n",
- " country | \n",
- " birthdate | \n",
- " salary | \n",
- " title | \n",
- " comments | \n",
- "
\n",
- " \n",
- " 2016-02-03 07:55:29.000 | \n",
- " 1 | \n",
- " Amanda | \n",
- " Jordan | \n",
- " ajordan0@com.com | \n",
- " Female | \n",
- " 1.197.201.2 | \n",
- " 6759521864920116 | \n",
- " Indonesia | \n",
- " 3/8/1971 | \n",
- " 49756.53 | \n",
- " Internal Auditor | \n",
- " 1E+02 | \n",
- "
\n",
- " \n",
- " 2016-02-03 06:47:06.000 | \n",
- " 8 | \n",
- " Harry | \n",
- " Howell | \n",
- " hhowell7@eepurl.com | \n",
- " Male | \n",
- " 91.235.51.73 | \n",
- " | \n",
- " Bosnia and Herzegovina | \n",
- " 3/1/1962 | \n",
- " 186469.43 | \n",
- " Web Developer IV | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 03:52:53.000 | \n",
- " 9 | \n",
- " Jose | \n",
- " Foster | \n",
- " jfoster8@yelp.com | \n",
- " Male | \n",
- " 132.31.53.61 | \n",
- " | \n",
- " South Korea | \n",
- " 3/27/1992 | \n",
- " 231067.84 | \n",
- " Software Test Engineer I | \n",
- " 1E+02 | \n",
- "
\n",
- " \n",
- " 2016-02-03 18:29:47.000 | \n",
- " 10 | \n",
- " Emily | \n",
- " Stewart | \n",
- " estewart9@opensource.org | \n",
- " Female | \n",
- " 143.28.251.245 | \n",
- " 3574254110301671 | \n",
- " Nigeria | \n",
- " 1/28/1997 | \n",
- " 27234.28 | \n",
- " Health Coach IV | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 00:36:21.000 | \n",
- " 4 | \n",
- " Denise | \n",
- " Riley | \n",
- " driley3@gmpg.org | \n",
- " Female | \n",
- " 140.35.109.83 | \n",
- " 3576031598965625 | \n",
- " China | \n",
- " 4/8/1997 | \n",
- " 90263.05 | \n",
- " Senior Cost Accountant | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 05:05:31.000 | \n",
- " 5 | \n",
- " Carlos | \n",
- " Burns | \n",
- " cburns4@miitbeian.gov.cn | \n",
- " | \n",
- " 169.113.235.40 | \n",
- " 5602256255204850 | \n",
- " South Africa | \n",
- " | \n",
- " None | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 07:22:34.000 | \n",
- " 6 | \n",
- " Kathryn | \n",
- " White | \n",
- " kwhite5@google.com | \n",
- " Female | \n",
- " 195.131.81.179 | \n",
- " 3583136326049310 | \n",
- " Indonesia | \n",
- " 2/25/1983 | \n",
- " 69227.11 | \n",
- " Account Executive | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 08:33:08.000 | \n",
- " 7 | \n",
- " Samuel | \n",
- " Holmes | \n",
- " sholmes6@foxnews.com | \n",
- " Male | \n",
- " 232.234.81.197 | \n",
- " 3582641366974690 | \n",
- " Portugal | \n",
- " 12/18/1987 | \n",
- " 14247.62 | \n",
- " Senior Financial Analyst | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 17:04:03.000 | \n",
- " 2 | \n",
- " Albert | \n",
- " Freeman | \n",
- " afreeman1@is.gd | \n",
- " Male | \n",
- " 218.111.175.34 | \n",
- " | \n",
- " Canada | \n",
- " 1/16/1968 | \n",
- " 150280.17 | \n",
- " Accountant IV | \n",
- " | \n",
- "
\n",
- " \n",
- " 2016-02-03 01:09:31.000 | \n",
- " 3 | \n",
- " Evelyn | \n",
- " Morgan | \n",
- " emorgan2@altervista.org | \n",
- " Female | \n",
- " 7.161.136.94 | \n",
- " 6767119071901597 | \n",
- " Russia | \n",
- " 2/1/1960 | \n",
- " 144972.51 | \n",
- " Structural Engineer | \n",
- " | \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "[('2016-02-03 07:55:29.000', 1, 'Amanda', 'Jordan', 'ajordan0@com.com', 'Female', '1.197.201.2', '6759521864920116', 'Indonesia', '3/8/1971', 49756.53, 'Internal Auditor', '1E+02'),\n",
- " ('2016-02-03 06:47:06.000', 8, 'Harry', 'Howell', 'hhowell7@eepurl.com', 'Male', '91.235.51.73', '', 'Bosnia and Herzegovina', '3/1/1962', 186469.43, 'Web Developer IV', ''),\n",
- " ('2016-02-03 03:52:53.000', 9, 'Jose', 'Foster', 'jfoster8@yelp.com', 'Male', '132.31.53.61', '', 'South Korea', '3/27/1992', 231067.84, 'Software Test Engineer I', '1E+02'),\n",
- " ('2016-02-03 18:29:47.000', 10, 'Emily', 'Stewart', 'estewart9@opensource.org', 'Female', '143.28.251.245', '3574254110301671', 'Nigeria', '1/28/1997', 27234.28, 'Health Coach IV', ''),\n",
- " ('2016-02-03 00:36:21.000', 4, 'Denise', 'Riley', 'driley3@gmpg.org', 'Female', '140.35.109.83', '3576031598965625', 'China', '4/8/1997', 90263.05, 'Senior Cost Accountant', ''),\n",
- " ('2016-02-03 05:05:31.000', 5, 'Carlos', 'Burns', 'cburns4@miitbeian.gov.cn', '', '169.113.235.40', '5602256255204850', 'South Africa', '', None, '', ''),\n",
- " ('2016-02-03 07:22:34.000', 6, 'Kathryn', 'White', 'kwhite5@google.com', 'Female', '195.131.81.179', '3583136326049310', 'Indonesia', '2/25/1983', 69227.11, 'Account Executive', ''),\n",
- " ('2016-02-03 08:33:08.000', 7, 'Samuel', 'Holmes', 'sholmes6@foxnews.com', 'Male', '232.234.81.197', '3582641366974690', 'Portugal', '12/18/1987', 14247.62, 'Senior Financial Analyst', ''),\n",
- " ('2016-02-03 17:04:03.000', 2, 'Albert', 'Freeman', 'afreeman1@is.gd', 'Male', '218.111.175.34', '', 'Canada', '1/16/1968', 150280.17, 'Accountant IV', ''),\n",
- " ('2016-02-03 01:09:31.000', 3, 'Evelyn', 'Morgan', 'emorgan2@altervista.org', 'Female', '7.161.136.94', '6767119071901597', 'Russia', '2/1/1960', 144972.51, 'Structural Engineer', '')]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "%sql select * from hive.default.tab1_single_file limit 10"
+ "# test how the tables were saved\n",
+ "#spark.sql(\"drop database test CASCADE\")\n",
+ "databaseName = \"default\"\n",
+ "\n",
+ "spark.sql(\"show databases\").show()\n",
+ "spark.sql(\"show tables in \" + databaseName).show()"
]
},
{
@@ -475,45 +232,41 @@
"e.g. select * from table_from_single_file2;"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cleanup\n",
+ "This will only clean the metastore definitions.\n",
+    "The underlying data won't be affected."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DataFrame[]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"spark.sql(\"drop table \" + databaseName + \".tab1_single_file\")"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DataFrame[]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"spark.sql(\"drop table \" + databaseName + \".table_from_dir\")"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"drop table \" + databaseName + \".partitioned_table\")"
+ ]
}
],
"metadata": {
diff --git a/examples/multiple-parquet-files/userdata1.parquet b/examples/multiple-parquet-files/userdata1.parquet
deleted file mode 100644
index 2ae23dac..00000000
Binary files a/examples/multiple-parquet-files/userdata1.parquet and /dev/null differ
diff --git a/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..80b6ecb9
Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..eb4259d2
Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..3e1558a8
Binary files /dev/null and b/examples/partitioned_pq/gender=Female/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..9016f9c9
Binary files /dev/null and b/examples/partitioned_pq/gender=Male/part-00000-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..1ba846d4
Binary files /dev/null and b/examples/partitioned_pq/gender=Male/part-00001-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet b/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet
new file mode 100755
index 00000000..10a5eb6d
Binary files /dev/null and b/examples/partitioned_pq/gender=Male/part-00002-29794148-268c-4898-bdae-a1827c03e4c0.c000.snappy.parquet differ
diff --git a/examples/multiple-parquet-files/userdata2.parquet b/examples/spark-output/userdata2.parquet
similarity index 100%
rename from examples/multiple-parquet-files/userdata2.parquet
rename to examples/spark-output/userdata2.parquet
diff --git a/examples/multiple-parquet-files/userdata3.parquet b/examples/spark-output/userdata3.parquet
similarity index 100%
rename from examples/multiple-parquet-files/userdata3.parquet
rename to examples/spark-output/userdata3.parquet
diff --git a/igz-tutorials-get.sh b/igz-tutorials-get.sh
old mode 100755
new mode 100644
index c41d43a1..8bfb1ec0
--- a/igz-tutorials-get.sh
+++ b/igz-tutorials-get.sh
@@ -86,9 +86,10 @@ fi
if [ -z "${branch}" ]; then
platform_version="${IGZ_VERSION%%_*}"
echo "Detected platform version: ${platform_version}"
- latest_tag=`git ls-remote --tags --refs --sort='v:refname' "${git_url}" "refs/tags/v${platform_version}.*" | tail -n1 | awk '{ print $2}'`
+ tag_prefix=`echo ${platform_version} | cut -d . -f1-2`
+ latest_tag=`git ls-remote --tags --refs --sort='v:refname' "${git_url}" "refs/tags/v${tag_prefix}.*" | tail -n1 | awk '{ print $2}'`
if [ -z "${latest_tag}" ]; then
- error_exit "Couldn't locate a Git tag with prefix 'v${platform_version}.*'. Aborting..."
+ error_exit "Couldn't locate a Git tag with prefix 'v${tag_prefix}.*'. Aborting..."
else
# Remove the prefix from the Git tag
branch=${latest_tag#refs/tags/}
diff --git a/update-demos.sh b/update-demos.sh
old mode 100755
new mode 100644
index e1151f9f..6f12f0ab
--- a/update-demos.sh
+++ b/update-demos.sh
@@ -93,7 +93,7 @@ do
--mlrun-ver=?*)
mlrun_version=${1#*=} # Delete everything up to "=" and assign the remainder.
;;
- --umlrun-ver=) # Handle the case of an empty --mlrun-ver=
+ --mlrun-ver=) # Handle the case of an empty --mlrun-ver=
error_usage "$1: Missing MLRun version."
;;
--dry-run)