From 265b1e592eb979613288b34dfc132e5edfc70964 Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 21 May 2024 13:50:31 +0800 Subject: [PATCH 1/4] refine UT framework to promote GPU evaluation Signed-off-by: Hongbin Ma (Mahone) --- .../com/nvidia/spark/rapids/RapidsConf.scala | 9 ++ .../com/nvidia/spark/rapids/RapidsMeta.scala | 2 +- .../suites/RapidsJsonExpressionsSuite.scala | 42 +++++++ .../utils/RapidsSQLTestsBaseTrait.scala | 2 + .../sql/rapids/utils/RapidsTestSettings.scala | 31 ++--- .../sql/rapids/utils/RapidsTestsTrait.scala | 119 ++++++------------ 6 files changed, 111 insertions(+), 94 deletions(-) create mode 100644 tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 6dd0441f76f..f7ff38cb193 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1485,6 +1485,13 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .stringConf .createWithDefault(false.toString) + val FOLDABLE_NON_LIT_ALLOWED = conf("spark.rapids.sql.test.isFoldableNonLitAllowed") + .doc("Only to be used in tests. If `true` the foldable expressions that are not literals " + + "will be allowed") + .internal() + .booleanConf + .createWithDefault(false) + val TEST_CONF = conf("spark.rapids.sql.test.enabled") .doc("Intended to be used by unit tests, if enabled all operations must run on the " + "GPU or an error happens.") @@ -2428,6 +2435,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isTestEnabled: Boolean = get(TEST_CONF) + lazy val isFoldableNonLitAllowed: Boolean = get(FOLDABLE_NON_LIT_ALLOWED) + /** * Convert a string value to the injection configuration OomInjection. * diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala index 5cb170968a6..a876ea6c9e0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala @@ -1108,7 +1108,7 @@ abstract class BaseExprMeta[INPUT <: Expression]( case _ => ExpressionContext.getRegularOperatorContext(this) } - val isFoldableNonLitAllowed: Boolean = false + val isFoldableNonLitAllowed: Boolean = conf.isFoldableNonLitAllowed // There are 4 levels of timezone check in GPU plan tag phase: // Level 1: Check whether an expression is related to timezone. This is achieved by diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala new file mode 100644 index 00000000000..2e0c7528eba --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.suites + +import org.apache.spark.sql.catalyst.expressions.JsonExpressionsSuite +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.rapids.utils.RapidsTestsTrait + +class RapidsJsonExpressionsSuite extends JsonExpressionsSuite with RapidsTestsTrait { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") + } + + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") + SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") + SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") + } +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala index 540c70a2ee1..e384a0c1ea5 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala @@ -121,6 +121,8 @@ object RapidsSQLTestsBaseTrait { "org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback") .set("spark.sql.warehouse.dir", warehouse) .set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer") + .set("spark.sql.session.timeZone", "UTC") + .set("spark.rapids.sql.explain", "ALL") .setAppName("rapids spark plugin running Vanilla Spark UT") conf diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 4faede8269a..ca15b0f0259 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -19,7 +19,7 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.utils -import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, RapidsStringFunctionsSuite} +import org.apache.spark.sql.rapids.suites.{RapidsCastSuite, RapidsDataFrameAggregateSuite, RapidsJsonExpressionsSuite, RapidsJsonFunctionsSuite, RapidsJsonSuite, RapidsMathFunctionsSuite, RapidsRegexpExpressionsSuite, RapidsStringExpressionsSuite, RapidsStringFunctionsSuite} // Some settings' line length exceeds 100 // scalastyle:off line.size.limit @@ -41,6 +41,22 @@ class RapidsTestSettings extends 
BackendTestSettings { .exclude("SPARK-17641: collect functions should not collect null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) .exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) .exclude("SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs should not fail", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10801")) + enableSuite[RapidsJsonExpressionsSuite] + .exclude("from_json - invalid data", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("from_json - input=empty array, schema=struct, output=single row with null", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("from_json - input=empty object, schema=struct, output=single row with null", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("SPARK-20549: from_json bad UTF-8", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("from_json with timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json - array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json - array with single empty row", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json - empty array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json with timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("SPARK-21513: to_json support map[string, struct] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("SPARK-21513: to_json support map[struct, struct] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("SPARK-21513: to_json support map[string, integer] to json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json - array with maps", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("to_json - array with single map", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) + .exclude("from_json missing fields", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) enableSuite[RapidsJsonFunctionsSuite] enableSuite[RapidsJsonSuite] .exclude("Casting long as timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) @@ -58,24 +74,11 @@ class RapidsTestSettings extends BackendTestSettings { .exclude("SPARK-37360: Timestamp type inference for a mix of TIMESTAMP_NTZ and TIMESTAMP_LTZ", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] - .exclude("RegexReplace", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774")) - .exclude("RegexExtract", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774")) - .exclude("RegexExtractAll", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774")) - .exclude("SPLIT", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10774")) enableSuite[RapidsStringExpressionsSuite] .exclude("SPARK-22498: Concat should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22549: ConcatWs should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) 
.exclude("SPARK-22550: Elt should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("StringComparison", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("Substring", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("ascii for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("base64/unbase64 for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("encode/decode for string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22603: FormatString should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("LOCATE", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("LPAD/RPAD", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("REPEAT", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) - .exclude("length for string / binary", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("ParseUrl", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) enableSuite[RapidsStringFunctionsSuite] } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala index 08e10f4d4fd..1f8a775825a 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala @@ -20,28 +20,29 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.utils import java.io.File +import java.util.TimeZone import com.nvidia.spark.rapids.{GpuProjectExec, TestStats} import org.apache.commons.io.{FileUtils => fu} import org.apache.commons.math3.util.Precision import org.scalactic.TripleEqualsSupport.Spread import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} +import org.apache.spark.sql.{Column, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.ResolveTimeZone import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ConvertToLocalRelation, NullPropagation} +import org.apache.spark.sql.catalyst.optimizer.{ConstantFolding, ConvertToLocalRelation} import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.utils.RapidsQueryTestUtil.isNaNOrInf import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String + trait RapidsTestsTrait extends RapidsTestsCommonTrait { + val originalTimeZone = TimeZone.getDefault + override def beforeAll(): Unit = { // prepare working paths val basePathDir = new File(basePath) @@ -54,9 +55,12 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { super.beforeAll() initializeSession() _spark.sparkContext.setLogLevel("WARN") + TimeZone.setDefault(TimeZone.getTimeZone("UTC")) } override def afterAll(): Unit = { + TimeZone.setDefault(originalTimeZone) + try { super.afterAll() } finally { @@ -91,12 +95,17 @@ 
trait RapidsTestsTrait extends RapidsTestsCommonTrait { .config( SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName + - "," + ConstantFolding.ruleName + "," + NullPropagation.ruleName) + "," + ConstantFolding.ruleName) .config("spark.rapids.sql.enabled", "true") .config("spark.plugins", "com.nvidia.spark.SQLPlugin") .config("spark.sql.queryExecutionListeners", "org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback") .config("spark.sql.warehouse.dir", warehouse) + .config("spark.sql.session.timeZone","UTC") + .config("spark.rapids.sql.explain", "ALL") + .config("spark.rapids.sql.test.isFoldableNonLitAllowed", "true") + // uncomment below config to run `strict mode`, where fallback to CPU is treated as fail + // .config("spark.rapids.sql.test.enabled", "true") .appName("rapids spark plugin running Vanilla Spark UT") _spark = sparkBuilder @@ -115,31 +124,20 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { val expr = resolver.resolveTimeZones(expression) assert(expr.resolved) - if (canConvertToDataFrame(inputRow)) { - rapidsCheckExpression(expr, expected, inputRow) - } else { - logWarning(s"The status of this unit test is not guaranteed.") - val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) - checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) - checkEvaluationWithMutableProjection(expr, catalystValue, inputRow) - if (GenerateUnsafeProjection.canSupport(expr.dataType)) { - checkEvaluationWithUnsafeProjection(expr, catalystValue, inputRow) - } - checkEvaluationWithOptimization(expr, catalystValue, inputRow) - } + rapidsCheckExpression(expr, expected, inputRow) } /** * Sort map data by key and return the sorted key array and value array. * * @param input - * : input map data. + * : input map data. * @param kt - * : key type. + * : key type. * @param vt - * : value type. + * : value type. * @return - * the sorted key array and value array. + * the sorted key array and value array. */ private def getSortedArrays( input: MapData, @@ -202,7 +200,7 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { case (result: Double, expected: Double) => if ( (isNaNOrInf(result) || isNaNOrInf(expected)) - || (result == -0.0) || (expected == -0.0) + || (result == -0.0) || (expected == -0.0) ) { java.lang.Double.doubleToRawLongBits(result) == java.lang.Double.doubleToRawLongBits(expected) @@ -221,20 +219,23 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { RapidsTestConstants.SUPPORTED_DATA_TYPE.acceptsType(expr.dataType) } - def rapidsCheckExpression(expression: Expression, expected: Any, inputRow: InternalRow): Unit = { - val df = if (inputRow != EmptyRow && inputRow != InternalRow.empty) { - convertInternalRowToDataFrame(inputRow) - } else { - val schema = StructType(StructField("a", IntegerType, nullable = true) :: Nil) - val empData = Seq(Row(1)) - _spark.createDataFrame(_spark.sparkContext.parallelize(empData), schema) + def rapidsCheckExpression(origExpr: Expression, expected: Any, inputRow: InternalRow): Unit = { + // many of of the expressions in RAPIDS do not support + // vectorized parameters. (e.g. regexp_replace) + // So we downgrade all expression + // evaluation to use scalar parameters. + // In a follow-up issue we'll take care of the expressions + // those already support vectorized paramters. 
+ val expression = origExpr.transformUp { + case BoundReference(ordinal, dataType, _) => + Literal(inputRow.asInstanceOf[GenericInternalRow].get(ordinal, dataType), dataType) } - val resultDF = df.select(Column(expression)) + val resultDF = _spark.range(0, 1).select(Column(expression)) val result = resultDF.collect() TestStats.testUnitNumber = TestStats.testUnitNumber + 1 if ( checkDataTypeSupported(expression) && - expression.children.forall(checkDataTypeSupported) + expression.children.forall(checkDataTypeSupported) ) { val projectTransformer = resultDF.queryExecution.executedPlan.collect { case p: GpuProjectExec => p @@ -254,13 +255,13 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { if ( !(checkResult(result.head.get(0), expected, expression.dataType, expression.nullable) || checkResult( - CatalystTypeConverters.createToCatalystConverter(expression.dataType)( - result.head.get(0) - ), // decimal precision is wrong from value - CatalystTypeConverters.convertToCatalyst(expected), - expression.dataType, - expression.nullable - )) + CatalystTypeConverters.createToCatalystConverter(expression.dataType)( + result.head.get(0) + ), // decimal precision is wrong from value + CatalystTypeConverters.convertToCatalyst(expected), + expression.dataType, + expression.nullable + )) ) { val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail( @@ -292,44 +293,4 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { } true } - - def convertInternalRowToDataFrame(inputRow: InternalRow): DataFrame = { - val structFileSeq = new ArrayBuffer[StructField]() - val values = inputRow match { - case genericInternalRow: GenericInternalRow => - genericInternalRow.values - case _ => throw new UnsupportedOperationException("Unsupported InternalRow.") - } - values.foreach { - case boolean: java.lang.Boolean => - structFileSeq.append(StructField("bool", BooleanType, boolean == null)) - case short: java.lang.Short => - structFileSeq.append(StructField("i16", ShortType, short == null)) - case byte: java.lang.Byte => - structFileSeq.append(StructField("i8", ByteType, byte == null)) - case integer: java.lang.Integer => - structFileSeq.append(StructField("i32", IntegerType, integer == null)) - case long: java.lang.Long => - structFileSeq.append(StructField("i64", LongType, long == null)) - case float: java.lang.Float => - structFileSeq.append(StructField("fp32", FloatType, float == null)) - case double: java.lang.Double => - structFileSeq.append(StructField("fp64", DoubleType, double == null)) - case utf8String: UTF8String => - structFileSeq.append(StructField("str", StringType, utf8String == null)) - case byteArr: Array[Byte] => - structFileSeq.append(StructField("vbin", BinaryType, byteArr == null)) - case decimal: Decimal => - structFileSeq.append( - StructField("dec", DecimalType(decimal.precision, decimal.scale), decimal == null)) - case null => - structFileSeq.append(StructField("null", IntegerType, nullable = true)) - case unsupported @ _ => - throw new UnsupportedOperationException(s"Unsupported type: ${unsupported.getClass}") - } - val fields = structFileSeq.toSeq - _spark.internalCreateDataFrame( - _spark.sparkContext.parallelize(Seq(inputRow)), - StructType(fields)) - } } From 6780184a9fff54e418385756f8ff238d506b728b Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 21 May 2024 15:07:14 +0800 Subject: [PATCH 2/4] enable some exprs for json Signed-off-by: Hongbin Ma (Mahone) --- .../suites/RapidsJsonFunctionsSuite.scala | 19 ++++++++++++++++++- 
.../sql/rapids/suites/RapidsJsonSuite.scala | 16 ++++++++++++++++ .../utils/RapidsSQLTestsBaseTrait.scala | 4 ++++ .../sql/rapids/utils/RapidsTestSettings.scala | 5 +++++ .../sql/rapids/utils/RapidsTestsTrait.scala | 2 ++ 5 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala index 43150c0df4b..40087aeaad9 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala @@ -20,6 +20,23 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites import org.apache.spark.sql.JsonFunctionsSuite +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.utils.RapidsSQLTestsTrait -class RapidsJsonFunctionsSuite extends JsonFunctionsSuite with RapidsSQLTestsTrait {} +class RapidsJsonFunctionsSuite extends JsonFunctionsSuite with RapidsSQLTestsTrait { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") + } + + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") + SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") + SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") + } +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala index 6d244c67ad0..32203f708f9 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala @@ -31,6 +31,22 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap class RapidsJsonSuite extends JsonSuite with RapidsSQLTestsBaseTrait { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") + SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") + } + + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") + SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") + SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") + } + /** Returns full path to the given file in the resource folder */ override protected def testFile(fileName: String): String = { getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala 
b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala index e384a0c1ea5..6529ef42d00 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala @@ -123,6 +123,10 @@ object RapidsSQLTestsBaseTrait { .set("spark.sql.cache.serializer", "com.nvidia.spark.ParquetCachedBatchSerializer") .set("spark.sql.session.timeZone", "UTC") .set("spark.rapids.sql.explain", "ALL") + // uncomment below config to run `strict mode`, where fallback to CPU is treated as fail + // .set("spark.rapids.sql.test.enabled", "true") + // .set("spark.rapids.sql.test.allowedNonGpu", + // "SerializeFromObjectExec,DeserializeToObjectExec,ExternalRDDScanExec") .setAppName("rapids spark plugin running Vanilla Spark UT") conf diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index ca15b0f0259..40ee67a4c5c 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -58,6 +58,11 @@ class RapidsTestSettings extends BackendTestSettings { .exclude("to_json - array with single map", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) .exclude("from_json missing fields", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) enableSuite[RapidsJsonFunctionsSuite] + .exclude("from_json invalid json", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852")) + .exclude("to_json - array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852")) + .exclude("to_json - map", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852")) + .exclude("to_json - array of primitive types", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852")) + .exclude("SPARK-33134: return partial results only for root JSON objects", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10852")) enableSuite[RapidsJsonSuite] .exclude("Casting long as timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) .exclude("Write timestamps correctly with timestampFormat option and timeZone option", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala index 1f8a775825a..44b791124c2 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala @@ -106,6 +106,8 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { .config("spark.rapids.sql.test.isFoldableNonLitAllowed", "true") // uncomment below config to run `strict mode`, where fallback to CPU is treated as fail // .config("spark.rapids.sql.test.enabled", "true") + // .config("spark.rapids.sql.test.allowedNonGpu", + // "SerializeFromObjectExec,DeserializeToObjectExec,ExternalRDDScanExec") .appName("rapids spark plugin running Vanilla Spark UT") _spark = sparkBuilder From e900d50b1ba66d71b2c74700a1b112d908498513 Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 21 May 2024 16:43:38 +0800 Subject: [PATCH 3/4] exclude 
flaky tests Signed-off-by: Hongbin Ma (Mahone) --- .../org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 40ee67a4c5c..d0d67c41f0e 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -80,6 +80,9 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] + .exclude("concat", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) + .exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) + .exclude("format_number / FormatNumber", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22498: Concat should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22549: ConcatWs should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22550: Elt should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) From 7237cb601ff9e8097deab6e95c968f6cb4c863a7 Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Wed, 22 May 2024 11:51:42 +0800 Subject: [PATCH 4/4] fix review comments Signed-off-by: Hongbin Ma (Mahone) --- .../suites/RapidsJsonExpressionsSuite.scala | 22 ++-------- .../suites/RapidsJsonFunctionsSuite.scala | 22 ++-------- .../sql/rapids/suites/RapidsJsonSuite.scala | 22 ++-------- .../rapids/utils/RapidsJsonConfTrait.scala | 42 +++++++++++++++++++ .../sql/rapids/utils/RapidsTestsTrait.scala | 10 ++--- 5 files changed, 55 insertions(+), 63 deletions(-) create mode 100644 tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsJsonConfTrait.scala diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala index 2e0c7528eba..eb5fdc535e8 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonExpressionsSuite.scala @@ -20,23 +20,7 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites import org.apache.spark.sql.catalyst.expressions.JsonExpressionsSuite -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.utils.RapidsTestsTrait +import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsTestsTrait} -class RapidsJsonExpressionsSuite extends JsonExpressionsSuite with RapidsTestsTrait { - override def beforeAll(): Unit = { - super.beforeAll() - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") - } - - override def afterAll(): Unit = { - super.afterAll() - 
SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") - SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") - SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") - SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") - } -} +class RapidsJsonExpressionsSuite + extends JsonExpressionsSuite with RapidsTestsTrait with RapidsJsonConfTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala index 40087aeaad9..ebddc498202 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonFunctionsSuite.scala @@ -20,23 +20,7 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites import org.apache.spark.sql.JsonFunctionsSuite -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.utils.RapidsSQLTestsTrait +import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsSQLTestsTrait} -class RapidsJsonFunctionsSuite extends JsonFunctionsSuite with RapidsSQLTestsTrait { - override def beforeAll(): Unit = { - super.beforeAll() - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") - } - - override def afterAll(): Unit = { - super.afterAll() - SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") - SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") - SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") - SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") - } -} +class RapidsJsonFunctionsSuite + extends JsonFunctionsSuite with RapidsSQLTestsTrait with RapidsJsonConfTrait {} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala index 32203f708f9..3e9f685dfdc 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala @@ -24,29 +24,13 @@ import org.apache.spark.sql.execution.datasources.{InMemoryFileIndex, NoopCache} import org.apache.spark.sql.execution.datasources.json.JsonSuite import org.apache.spark.sql.execution.datasources.v2.json.JsonScanBuilder import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.utils.RapidsSQLTestsBaseTrait +import org.apache.spark.sql.rapids.utils.{RapidsJsonConfTrait, RapidsSQLTestsBaseTrait} import org.apache.spark.sql.sources import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap -class RapidsJsonSuite extends JsonSuite with RapidsSQLTestsBaseTrait { - - override def beforeAll(): Unit = { - super.beforeAll() - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", "true") - SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", "true") - 
} - - override def afterAll(): Unit = { - super.afterAll() - SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") - SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") - SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") - SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") - } - +class RapidsJsonSuite + extends JsonSuite with RapidsSQLTestsBaseTrait with RapidsJsonConfTrait { /** Returns full path to the given file in the resource folder */ override protected def testFile(fileName: String): String = { getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsJsonConfTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsJsonConfTrait.scala new file mode 100644 index 00000000000..a1324deb21b --- /dev/null +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsJsonConfTrait.scala @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.utils + +import org.scalatest.{BeforeAndAfterAll, Suite} + +import org.apache.spark.sql.internal.SQLConf + +trait RapidsJsonConfTrait extends BeforeAndAfterAll { this: Suite => + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonTuple", true.toString) + SQLConf.get.setConfString("spark.rapids.sql.expression.GetJsonObject", true.toString) + SQLConf.get.setConfString("spark.rapids.sql.expression.JsonToStructs", true.toString) + SQLConf.get.setConfString("spark.rapids.sql.expression.StructsToJson", true.toString) + } + + override def afterAll(): Unit = { + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonTuple") + SQLConf.get.unsetConf("spark.rapids.sql.expression.GetJsonObject") + SQLConf.get.unsetConf("spark.rapids.sql.expression.JsonToStructs") + SQLConf.get.unsetConf("spark.rapids.sql.expression.StructsToJson") + super.afterAll() + } +} diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala index 44b791124c2..1e5cf9277e9 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala @@ -222,12 +222,10 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { } def rapidsCheckExpression(origExpr: Expression, expected: Any, inputRow: InternalRow): Unit = { - // many of of the expressions in RAPIDS do not support - // vectorized parameters. (e.g. regexp_replace) - // So we downgrade all expression - // evaluation to use scalar parameters. 
- // In a follow-up issue we'll take care of the expressions - // those already support vectorized paramters. + // many of the expressions in RAPIDS do not support vectorized parameters(e.g. regexp_replace). + // So we downgrade all expression evaluation to use scalar parameters. + // In a follow-up issue (https://github.com/NVIDIA/spark-rapids/issues/10859), + // we'll take care of the expressions those already support vectorized parameters. val expression = origExpr.transformUp { case BoundReference(ordinal, dataType, _) => Literal(inputRow.asInstanceOf[GenericInternalRow].get(ordinal, dataType), dataType)