From 07bdd30d60d60875397da19afd7d533ad653ada3 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Jan 2025 17:55:23 +0800 Subject: [PATCH 1/3] temp commit Signed-off-by: liyuan --- .../SQL+DF-Examples/open-street-map/README.md | 15 ++ .../open-street-map-benchmarks.ipynb | 240 ++++++++++++++++++ 2 files changed, 255 insertions(+) create mode 100644 examples/SQL+DF-Examples/open-street-map/README.md create mode 100644 examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb diff --git a/examples/SQL+DF-Examples/open-street-map/README.md b/examples/SQL+DF-Examples/open-street-map/README.md new file mode 100644 index 00000000..ecb4bb11 --- /dev/null +++ b/examples/SQL+DF-Examples/open-street-map/README.md @@ -0,0 +1,15 @@ +# OpenStreetMap Benchmarks +[OpenStreetMap (OSM)](https://www.openstreetmap.org/copyright/en) is a collaborative, free, +and open-source digital map of the world. It allows users to contribute to the creation and editing +of maps through a simple, web-based interface. Anyone can contribute to the project by mapping areas +they are familiar with, through manual surveys, satellite imagery, or aerial photographs. + +Users can analyze the data by running Spark ETL jobs for use cases such as geospatial analysis, +traffic flow, socioeconomic analysis, environmental analysis, and route optimization. Since these queries +can stray into terabytes, [Spark Rapids](https://nvidia.github.io/spark-rapids/) can speed up the +entire analysis process. + + + + + diff --git a/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb b/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb new file mode 100644 index 00000000..a4f05941 --- /dev/null +++ b/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "62787244", + "metadata": {}, + "source": [ + "# OpenStreetMap Benchmarks\n", + "The benchmarks below showcase how users can run analyses on the OpenStreetMap dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1c3a15d7", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.conf import SparkConf\n", + "from time import time\n", + "RAPIDS_JAR = \"/usr/lib/spark/jars/rapids-4-spark_2.12-24.10.0.jar\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0c3536ad", + "metadata": {}, + "outputs": [], + "source": [ + "def runQuery(spark, appName, query, retryTimes):\n", + " count = 0\n", + " total_time = 0\n", + " while count < retryTimes:\n", + " start = time()\n", + " spark.sql(query).show(5)\n", + " end = time()\n", + " total_time += round(end - start, 2)\n", + " count = count + 1\n", + " print(\"Retry times : {}, \".format(count) + appName + \" benchmark takes {} seconds\".format(round(end - start, 2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "975717da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", + "25/01/20 09:23:47 INFO SparkEnv: Registering MapOutputTracker\n", + "25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMaster\n", + "25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMasterHeartbeat\n", + "25/01/20 09:23:47 INFO SparkEnv: Registering OutputCommitCoordinator\n", + "25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e\n", + "25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "# Common Spark settings\n", + "conf = SparkConf()\n", + "conf.setMaster(\"yarn\")\n", + "conf.setAppName(\"OpenStreetMap Benchmarks\")\n", + "conf.set(\"spark.executor.instances\",\"2\")\n", + "conf.set(\"spark.executor.cores\", \"16\") \n", + "conf.set(\"spark.driver.memory\", \"10g\")\n", + "conf.set(\"spark.driver.cores\", \"1\")\n", + "## Most processing runs in GPU memory, so executor host memory does not need to be large\n", + "conf.set(\"spark.executor.memory\", \"16g\")\n", + "\n", + "conf.set(\"spark.locality.wait\", \"0\")\n", + "conf.set(\"spark.rapids.sql.reader.batchSizeBytes\",\"512m\")\n", + "conf.set(\"spark.dynamicAllocation.enabled\", \"false\") \n", + "\n", + "# Plugin settings\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "# 2 tasks will run concurrently per GPU\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", \"2\")\n", + "# Pin 4g of host memory to transfer data between GPU and host memory\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", \"4G\")\n", + "# 16 tasks will run concurrently per executor, as we set spark.executor.cores=16\n", + "conf.set(\"spark.task.resource.gpu.amount\", \"0.0625\") \n", + "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n", + "conf.set(\"spark.jars\", RAPIDS_JAR)\n", + "# Create spark session\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "# Load dataframe and create tempView\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_changesets/\").createOrReplaceTempView(\"history_changesets\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_layers/\").createOrReplaceTempView(\"history_layers\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_nodes/\").createOrReplaceTempView(\"history_nodes\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_relations/\").createOrReplaceTempView(\"history_relations\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_ways/\").createOrReplaceTempView(\"history_ways\")\n", +
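"# NOTE: gs://bigquery_public_data_for_spark is assumed to be a Parquet export of the BigQuery public dataset bigquery-public-data.geo_openstreetmap; replace it with your own bucket (see README)\n", +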
"spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_changesets/\").createOrReplaceTempView(\"planet_changesets\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features/\").createOrReplaceTempView(\"planet_features\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_lines/\").createOrReplaceTempView(\"planet_features_lines\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multilinestrings/\").createOrReplaceTempView(\"planet_features_multilinestrings\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multipolygons/\").createOrReplaceTempView(\"planet_features_multipolygons\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_other_relations/\").createOrReplaceTempView(\"planet_features_other_relations\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_points/\").createOrReplaceTempView(\"planet_features_points\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_layers/\").createOrReplaceTempView(\"planet_layers\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_nodes/\").createOrReplaceTempView(\"planet_nodes\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_relations/\").createOrReplaceTempView(\"planet_relations\")\n", + "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_ways/\").createOrReplaceTempView(\"planet_ways\")\n", + "print(\"-\"*50)" + ] + }, + { + "cell_type": "markdown", + "id": "7136eb63", + "metadata": {}, + "source": [ + "### Calculate length of ways with 'highway' tag in Japan\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "dd12d749", + "metadata": {}, + "outputs": [], + "source": [ + "query = '''\n", + "select count(osm_id),feature_type from planet_features group by feature_type\n", + "\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "33368682", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "25/01/20 09:50:03 WARN GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n", + "25/01/20 09:50:03 WARN GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n", + "25/01/20 09:50:03 WARN GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. 
Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n", + "25/01/20 09:50:29 WARN GpuOverrides: ==========================>(311 + 1) / 312]\n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n", + "25/01/20 09:50:29 WARN GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+----------------+\n", + "|count(osm_id)| feature_type|\n", + "+-------------+----------------+\n", + "| 5628557| multipolygons|\n", + "| 2782002| other_relations|\n", + "| 247031768| lines|\n", + "| 184798630| points|\n", + "| 804708|multilinestrings|\n", + "+-------------+----------------+\n", + "\n", + "Retry times : 1, Japan highway benchmark takes 26.28 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " \r" + ] + } + ], + "source": [ + "# Run microbenchmark with n retry time\n", + "runQuery(spark,\"Japan highway\",query,1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d47060d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 04264c498caf1d517da785367c5132ef53057bed Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Jan 2025 18:40:55 +0800 Subject: [PATCH 2/3] buildings Signed-off-by: liyuan --- .../open-street-map-benchmarks.ipynb | 140 +++++++++--------- 1 file changed, 72 insertions(+), 68 deletions(-) diff --git a/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb b/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb index a4f05941..892b67ae 100644 --- a/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb +++ b/examples/SQL+DF-Examples/open-street-map/notebooks/open-street-map-benchmarks.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "1c3a15d7", "metadata": {}, "outputs": [], @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "0c3536ad", "metadata": {}, "outputs": [], @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "975717da", "metadata": {}, "outputs": [ @@ -53,14 +53,14 @@ "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use 
sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "25/01/20 09:23:47 INFO SparkEnv: Registering MapOutputTracker\n", - "25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMaster\n", - "25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMasterHeartbeat\n", - "25/01/20 09:23:47 INFO SparkEnv: Registering OutputCommitCoordinator\n", - "25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e\n", - "25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", - "25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", - "25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", + "25/01/20 10:36:07 INFO SparkEnv: Registering MapOutputTracker\n", + "25/01/20 10:36:07 INFO SparkEnv: Registering BlockManagerMaster\n", + "25/01/20 10:36:07 INFO SparkEnv: Registering BlockManagerMasterHeartbeat\n", + "25/01/20 10:36:07 INFO SparkEnv: Registering OutputCommitCoordinator\n", + "25/01/20 10:36:08 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e\n", + "25/01/20 10:36:08 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", + "25/01/20 10:36:08 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", + "25/01/20 10:36:08 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n", " \r" ] }, @@ -81,6 +81,7 @@ "conf.set(\"spark.executor.cores\", \"16\") \n", "conf.set(\"spark.driver.memory\", \"10g\")\n", "conf.set(\"spark.driver.cores\", \"1\")\n", + "conf.set(\"spark.sql.parquet.binaryAsString\",\"true\")\n", "## The tasks will run on GPU memory, so there is no need to set a high host memory\n", "conf.set(\"spark.executor.memory\", \"16g\")\n", "\n", @@ -104,21 +105,21 @@ "# Create spark session\n", "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", "# Load dataframe and create tempView\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_changesets/\").createOrReplaceTempView(\"history_changesets\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_layers/\").createOrReplaceTempView(\"history_layers\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_nodes/\").createOrReplaceTempView(\"history_nodes\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_relations/\").createOrReplaceTempView(\"history_relations\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_ways/\").createOrReplaceTempView(\"history_ways\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_changesets/\").createOrReplaceTempView(\"planet_changesets\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features/\").createOrReplaceTempView(\"planet_features\")\n", - 
"spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_lines/\").createOrReplaceTempView(\"planet_features_lines\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multilinestrings/\").createOrReplaceTempView(\"planet_features_multilinestrings\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multipolygons/\").createOrReplaceTempView(\"planet_features_multipolygons\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_other_relations/\").createOrReplaceTempView(\"planet_features_other_relations\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_points/\").createOrReplaceTempView(\"planet_features_points\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_layers/\").createOrReplaceTempView(\"planet_layers\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_nodes/\").createOrReplaceTempView(\"planet_nodes\")\n", - "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_relations/\").createOrReplaceTempView(\"planet_relations\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_changesets/\").createOrReplaceTempView(\"history_changesets\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_layers/\").createOrReplaceTempView(\"history_layers\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_nodes/\").createOrReplaceTempView(\"history_nodes\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_relations/\").createOrReplaceTempView(\"history_relations\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/history_ways/\").createOrReplaceTempView(\"history_ways\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_changesets/\").createOrReplaceTempView(\"planet_changesets\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features/\").createOrReplaceTempView(\"planet_features\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_lines/\").createOrReplaceTempView(\"planet_features_lines\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multilinestrings/\").createOrReplaceTempView(\"planet_features_multilinestrings\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_multipolygons/\").createOrReplaceTempView(\"planet_features_multipolygons\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_other_relations/\").createOrReplaceTempView(\"planet_features_other_relations\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_features_points/\").createOrReplaceTempView(\"planet_features_points\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_layers/\").createOrReplaceTempView(\"planet_layers\")\n", + "# spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_nodes/\").createOrReplaceTempView(\"planet_nodes\")\n", + "# 
spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_relations/\").createOrReplaceTempView(\"planet_relations\")\n", "spark.read.parquet(\"gs://bigquery_public_data_for_spark/geo_openstreetmap/planet_ways/\").createOrReplaceTempView(\"planet_ways\")\n", "print(\"-\"*50)" ] @@ -128,89 +129,92 @@ "id": "7136eb63", "metadata": {}, "source": [ - "### Calculate length of ways with 'highway' tag in Japan\n" + "### Calculate the building numbers over 5 levels" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 4, "id": "dd12d749", "metadata": {}, - "outputs": [], - "source": [ - "query = '''\n", - "select count(osm_id),feature_type from planet_features group by feature_type\n", - "\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "33368682", - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "25/01/20 09:50:03 WARN GpuOverrides: \n", + "25/01/20 10:37:35 WARN GpuOverrides: \n", + "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", + " @Partitioning could run on GPU\n", + "\n", + "25/01/20 10:37:35 WARN GpuOverrides: \n", "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", " @Partitioning could run on GPU\n", "\n", - "25/01/20 09:50:03 WARN GpuOverrides: \n", + "25/01/20 10:37:35 WARN GpuOverrides: \n", "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", " @Partitioning could run on GPU\n", "\n", - "25/01/20 09:50:03 WARN GpuOverrides: \n", + "25/01/20 10:38:12 WARN GpuOverrides: 80][Stage 2:==============>(279 + 1) / 280]\n", "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", " @Partitioning could run on GPU\n", "\n", - "25/01/20 09:50:29 WARN GpuOverrides: ==========================>(311 + 1) / 312]\n", + "25/01/20 10:40:11 WARN GpuOverrides: ==========================>(279 + 1) / 280]\n", "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. 
Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", " @Partitioning could run on GPU\n", "\n", - "25/01/20 09:50:29 WARN GpuOverrides: \n", + "25/01/20 10:40:11 WARN GpuOverrides: \n", "!Exec cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n", " @Partitioning could run on GPU\n", - "\n" + "\n", + " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "+-------------+----------------+\n", - "|count(osm_id)| feature_type|\n", - "+-------------+----------------+\n", - "| 5628557| multipolygons|\n", - "| 2782002| other_relations|\n", - "| 247031768| lines|\n", - "| 184798630| points|\n", - "| 804708|multilinestrings|\n", - "+-------------+----------------+\n", + "+---------+-------+-------------------+---------+-------------------+\n", + "| id|version| username|changeset| osm_timestamp|\n", + "+---------+-------+-------------------+---------+-------------------+\n", + "|109568528| 10| Stalker61| 91879126|2020-10-02 18:25:07|\n", + "|236002978| 3| MaksimTo| 54555625|2017-12-12 03:43:01|\n", + "| 85820500| 4| Achoin| 50043141|2017-07-04 19:46:51|\n", + "|505004057| 5| mbouzada| 58562021|2018-04-30 17:31:20|\n", + "|252765823| 1|catastro_pontevedra| 19552118|2013-12-20 16:01:44|\n", + "+---------+-------+-------------------+---------+-------------------+\n", + "only showing top 5 rows\n", "\n", - "Retry times : 1, Japan highway benchmark takes 26.28 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - " \r" + "Retry times : 1, 5 levels building numbers benchmark takes 159.22 seconds\n" ] } ], "source": [ - "# Run microbenchmark with n retry time\n", - "runQuery(spark,\"Japan highway\",query,1)" + "query = \"\"\"\n", + "WITH exploded_tags AS (\n", + " SELECT id, version, username, changeset, osm_timestamp, tag.key AS tag_key, tag.value AS tag_value\n", + " FROM planet_ways\n", + " LATERAL VIEW explode(all_tags) AS tag\n", + ")\n", + "\n", + "SELECT id, version, username, changeset, osm_timestamp\n", + "FROM exploded_tags\n", + "WHERE tag_key = 'building' \n", + " AND EXISTS (\n", + " SELECT 1\n", + " FROM exploded_tags AS inner_tags\n", + " WHERE inner_tags.id = exploded_tags.id \n", + " AND inner_tags.tag_key = 'building:levels' \n", + " AND CAST(inner_tags.tag_value AS INT) > 5\n", + " )\n", + "\n", + "\"\"\"\n", + "runQuery(spark,\"5 levels building numbers\",query,1)" ] }, { "cell_type": "code", "execution_count": null, - "id": "9d47060d", + "id": "d9b3d5db", "metadata": {}, "outputs": [], "source": [] From 2d454ed2a9d7f41a266391eed7424564bff76220 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Jan 2025 18:44:35 +0800 Subject: [PATCH 3/3] buildings Signed-off-by: liyuan --- examples/SQL+DF-Examples/open-street-map/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/SQL+DF-Examples/open-street-map/README.md b/examples/SQL+DF-Examples/open-street-map/README.md index ecb4bb11..6cffbc86 100644 --- a/examples/SQL+DF-Examples/open-street-map/README.md +++ b/examples/SQL+DF-Examples/open-street-map/README.md @@ -9,7 +9,10 @@ traffic flow, socioeconomic analysis, environmental analysis, route optimization can stray into terabytes, [Spark Rapids](https://nvidia.github.io/spark-rapids/) can speed up the entire 
analysis process. +Steps to prepare the dataset: export the tables of the BigQuery public dataset bigquery-public-data.geo_openstreetmap to your own GCS bucket in Parquet format (see the sketch below). +Performance (CPU vs. GPU) and environment: +IMG
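+
+A minimal sketch of the dataset-preparation step, assuming the Parquet files are exported
+from the BigQuery public dataset bigquery-public-data.geo_openstreetmap with the
+spark-bigquery-connector (the bucket name and table list are placeholders):
+
+```python
+from pyspark.sql import SparkSession
+
+# Requires the spark-bigquery-connector on the driver/executor classpath.
+spark = SparkSession.builder.appName("osm-dataset-prep").getOrCreate()
+
+# Tables used by the benchmark notebook; extend the list as needed.
+for table in ["planet_ways", "planet_features"]:
+    (spark.read.format("bigquery")
+          .option("table", f"bigquery-public-data.geo_openstreetmap.{table}")
+          .load()
+          .write.mode("overwrite")
+          .parquet(f"gs://your-bucket/geo_openstreetmap/{table}/"))
+```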