From e139800823aa2205c32162be5fbe5d966884f3c8 Mon Sep 17 00:00:00 2001
From: AB019TC
Date: Wed, 22 Mar 2023 15:34:09 +0200
Subject: [PATCH 1/3] Added DateTime configs for `run_enceladus`

---
 scripts/bash/enceladus_env.template.sh |  3 +++
 scripts/bash/run_enceladus.sh          | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/scripts/bash/enceladus_env.template.sh b/scripts/bash/enceladus_env.template.sh
index 4582fe506..4a8ca6c6a 100644
--- a/scripts/bash/enceladus_env.template.sh
+++ b/scripts/bash/enceladus_env.template.sh
@@ -65,6 +65,9 @@ CONF_DEFAULT_DRA_MIN_EXECUTORS=0
 CONF_DEFAULT_DRA_ALLOCATION_RATIO=0.5
 CONF_DEFAULT_ADAPTIVE_TARGET_POSTSHUFFLE_INPUT_SIZE=134217728
 
+DEFAULT_PARQUET_DATETIME_READ_MODE="CORRECTED"
+DEFAULT_PARQUET_DATETIME_WRITE_MODE="CORRECTED"
+
 DEFAULT_DEPLOY_MODE="client"
 
 LOG_DIR="/tmp"
diff --git a/scripts/bash/run_enceladus.sh b/scripts/bash/run_enceladus.sh
index f14b02795..d2903eaca 100644
--- a/scripts/bash/run_enceladus.sh
+++ b/scripts/bash/run_enceladus.sh
@@ -40,6 +40,8 @@ DRA_MIN_EXECUTORS="$DEFAULT_DRA_MIN_EXECUTORS"
 DRA_MAX_EXECUTORS="$DEFAULT_DRA_MAX_EXECUTORS"
 DRA_ALLOCATION_RATIO="$DEFAULT_DRA_ALLOCATION_RATIO"
 ADAPTIVE_TARGET_POSTSHUFFLE_INPUT_SIZE="$DEFAULT_ADAPTIVE_TARGET_POSTSHUFFLE_INPUT_SIZE"
+PARQUET_DATETIME_READ_MODE="$DEFAULT_PARQUET_DATETIME_READ_MODE"
+PARQUET_DATETIME_WRITE_MODE="$DEFAULT_PARQUET_DATETIME_WRITE_MODE"
 
 # Command like default for the job
 JAR=${SPARK_JOBS_JAR_OVERRIDE:-$SPARK_JOBS_JAR}
@@ -118,6 +120,14 @@ case $key in
     DRA_EXECUTOR_MEMORY="$2"
     shift 2 # past argument and value
     ;;
+    --parquet-datetime-read-mode)
+    PARQUET_DATETIME_READ_MODE="$2"
+    shift 2 # past argument and value
+    ;;
+    --parquet-datetime-write-mode)
+    PARQUET_DATETIME_WRITE_MODE="$2"
+    shift 2 # past argument and value
+    ;;
     --master)
     MASTER="$2"
     shift 2 # past argument and value
@@ -477,6 +487,11 @@ else
   add_to_cmd_line "--executor-cores" "${EXECUTOR_CORES}"
 fi
 
+add_spark_conf_cmd "spark.sql.parquet.datetimeRebaseModeInRead" "${PARQUET_DATETIME_READ_MODE}"
+add_spark_conf_cmd "spark.sql.parquet.datetimeRebaseModeInWrite" "${PARQUET_DATETIME_WRITE_MODE}"
+add_spark_conf_cmd "spark.sql.parquet.int96RebaseModeInRead" "${PARQUET_DATETIME_READ_MODE}"
+add_spark_conf_cmd "spark.sql.parquet.int96RebaseModeInWrite" "${PARQUET_DATETIME_WRITE_MODE}"
+
 JVM_CONF="spark.driver.extraJavaOptions=-Dstandardized.hdfs.path=$STD_HDFS_PATH \
 -Dspline.mongodb.url=$SPLINE_MONGODB_URL -Dspline.mongodb.name=$SPLINE_MONGODB_NAME -Dhdp.version=$HDP_VERSION \
 $MT_PATTERN $MIN_PARTITION_SIZE $MAX_PARTITION_SIZE"
From 9098681fdfcae2ed1df448b1440f2c038a91c91a Mon Sep 17 00:00:00 2001
From: AB019TC
Date: Wed, 22 Mar 2023 16:26:58 +0200
Subject: [PATCH 2/3] Added DateTime configs for `run_enceladus.cmd`

---
 scripts/bash/_print_help.sh    |  2 ++
 scripts/cmd/_run_enceladus.cmd | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/scripts/bash/_print_help.sh b/scripts/bash/_print_help.sh
index 63ca1798e..cc6c36175 100644
--- a/scripts/bash/_print_help.sh
+++ b/scripts/bash/_print_help.sh
@@ -39,6 +39,8 @@ echo " --executor-memory MEM Memory per executor (e.g. 1000M
 echo " --dra-num-executors NUM Same as '--num-executors' but used when DRA is enabled. Use with care! DRA won't scale below this NUM."
 echo " --dra-executor-cores NUM Same as '--executor-memory' but used when DRA is enabled."
 echo " --dra-executor-memory MEM Same as '--executor-cores' but used when DRA is enabled."
+echo " --parquet-datetime-read-mode"
+echo " --parquet-datetime-write-mode"
 echo " --master MASTER_URL spark://host:port, mesos://host:port, yarn, k8s://https://host:port, or local"
 echo " --deploy-mode DEPLOY_MODE Whether to launch the driver program locally (\"client\") or on one of the worker machines inside the cluster (\"cluster\")."
 echo " --driver-cores NUM Number of cores used by the driver, only in cluster mode."
diff --git a/scripts/cmd/_run_enceladus.cmd b/scripts/cmd/_run_enceladus.cmd
index 49b73dd7c..08197d530 100644
--- a/scripts/cmd/_run_enceladus.cmd
+++ b/scripts/cmd/_run_enceladus.cmd
@@ -29,6 +29,8 @@ SET EXECUTOR_CORES=%DEFAULT_EXECUTOR_CORES%
 SET EXECUTOR_MEMORY=%EFAULT_EXECUTOR_MEMORY%
 SET DRA_EXECUTOR_CORES=%DEFAULT_DRA_EXECUTOR_CORES%
 SET DRA_EXECUTOR_MEMORY=%DEFAULT_DRA_EXECUTOR_MEMORY%
+SET PARQUET_DATETIME_READ_MODE=%DEFAULT_PARQUET_DATETIME_READ_MODE%
+SET PARQUET_DATETIME_WRITE_MODE=%DEFAULT_PARQUET_DATETIME_WRITE_MODE%
 SET NUM_EXECUTORS=%DEFAULT_NUM_EXECUTORS%
 SET DRA_NUM_EXECUTORS=
 SET FILES=%ENCELADUS_FILES%
@@ -131,6 +133,18 @@ IF "%1"=="--dra-executor-memory" (
     SHIFT
     GOTO CmdParse
 )
+IF "%1"=="--parquet-datetime-read-mode" (
+    SET PARQUET_DATETIME_READ_MODE=%2
+    SHIFT
+    SHIFT
+    GOTO CmdParse
+)
+IF "%1"=="--parquet-datetime-write-mode" (
+    SET PARQUET_DATETIME_WRITE_MODE=%2
+    SHIFT
+    SHIFT
+    GOTO CmdParse
+)
 IF "%1"=="--master" (
     SET MASTER=%2
     SHIFT
@@ -561,6 +575,11 @@ IF %DRA_ENABLED%==true (
     IF DEFINED EXECUTOR_CORES SET CMD_LINE=%CMD_LINE% --executor-cores %EXECUTOR_CORES%
 )
 
+IF DEFINED PARQUET_DATETIME_READ_MODE SET SPARK_CONF=%SPARK_CONF% --conf spark.sql.parquet.datetimeRebaseModeInRead=%PARQUET_DATETIME_READ_MODE%
+IF DEFINED PARQUET_DATETIME_WRITE_MODE SET SPARK_CONF=%SPARK_CONF% --conf spark.sql.parquet.datetimeRebaseModeInWrite=%PARQUET_DATETIME_WRITE_MODE%
+IF DEFINED PARQUET_DATETIME_READ_MODE SET SPARK_CONF=%SPARK_CONF% --conf spark.sql.parquet.int96RebaseModeInRead=%PARQUET_DATETIME_READ_MODE%
+IF DEFINED PARQUET_DATETIME_WRITE_MODE SET SPARK_CONF=%SPARK_CONF% --conf spark.sql.parquet.int96RebaseModeInWrite=%PARQUET_DATETIME_WRITE_MODE%
+
 SET JVM_CONF=spark.driver.extraJavaOptions=-Dstandardized.hdfs.path=%STD_HDFS_PATH% -Dspline.mongodb.url=%SPLINE_MONGODB_URL% -Dspline.mongodb.name=%SPLINE_MONGODB_NAME% -Dhdp.version=%HDP_VERSION% %MT_PATTERN% %MIN_BLOCK_SIZE% %MAX_BLOCK_SIZE%
 
 SET CMD_LINE=%SPARK_SUBMIT%
From 8df173e1604b57ab7caf42019302689c382a084c Mon Sep 17 00:00:00 2001
From: AB019TC
Date: Thu, 23 Mar 2023 09:14:35 +0200
Subject: [PATCH 3/3] Added DateTime variables descriptions

---
Review note: the help entries now carry a MODE placeholder, list the values
Spark accepts for the rebase-mode settings, and quote the default exactly as
the env template defines it ("CORRECTED"). run_enceladus.sh feeds each option
into both the datetimeRebaseMode* and int96RebaseMode* Spark settings, so the
text mentions INT96 as well.

 scripts/bash/_print_help.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/bash/_print_help.sh b/scripts/bash/_print_help.sh
index cc6c36175..7219699a5 100644
--- a/scripts/bash/_print_help.sh
+++ b/scripts/bash/_print_help.sh
@@ -39,8 +39,8 @@ echo " --executor-memory MEM Memory per executor (e.g. 1000M
 echo " --dra-num-executors NUM Same as '--num-executors' but used when DRA is enabled. Use with care! DRA won't scale below this NUM."
 echo " --dra-executor-cores NUM Same as '--executor-memory' but used when DRA is enabled."
 echo " --dra-executor-memory MEM Same as '--executor-cores' but used when DRA is enabled."
-echo " --parquet-datetime-read-mode"
-echo " --parquet-datetime-write-mode"
+echo " --parquet-datetime-read-mode MODE Rebase mode Spark uses when reading Parquet date/timestamp (including INT96) values: EXCEPTION, CORRECTED or LEGACY. Defaults to DEFAULT_PARQUET_DATETIME_READ_MODE ('CORRECTED' in the env template)."
+echo " --parquet-datetime-write-mode MODE Rebase mode Spark uses when writing Parquet date/timestamp (including INT96) values: EXCEPTION, CORRECTED or LEGACY. Defaults to DEFAULT_PARQUET_DATETIME_WRITE_MODE ('CORRECTED' in the env template)."
 echo " --master MASTER_URL spark://host:port, mesos://host:port, yarn, k8s://https://host:port, or local"
 echo " --deploy-mode DEPLOY_MODE Whether to launch the driver program locally (\"client\") or on one of the worker machines inside the cluster (\"cluster\")."
 echo " --driver-cores NUM Number of cores used by the driver, only in cluster mode."