diff --git a/gradle.properties b/gradle.properties
index eea91798f7a0..62c9a1c479ae 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -18,9 +18,6 @@ doFindbugs=false
 # Run animal sniffer to verify compatibility of API with actual Java version
 doAnimalSniffer=false
 
-# include ORC support inside default h2o.jar.
-doIncludeOrc=false
-
 # include MOJO Pipeline support inside default h2o.jar.
 doIncludeMojoPipeline=false
 
@@ -55,12 +52,9 @@ httpClientVersion=4.5.2
 defaultParquetVersion=1.12.3
 
 # Default Hadoop client version
-defaultHadoopVersion=2.8.4
+defaultHadoopVersion=3.3.5
 defaultHdfsDependency=hadoop-hdfs-client
 
-# Default Hive version
-defaultHiveExecVersion=1.1.0
-
 defaultWebserverModule=h2o-jetty-9
 # default module to be included in assemblies
 defaultExtWebserverModule=h2o-jetty-9-ext
diff --git a/h2o-assemblies/main/build.gradle b/h2o-assemblies/main/build.gradle
index 611f10dedd30..8351fde0fb65 100644
--- a/h2o-assemblies/main/build.gradle
+++ b/h2o-assemblies/main/build.gradle
@@ -17,12 +17,28 @@ dependencies {
   api project(":h2o-persist-http")
   api project(":h2o-persist-hdfs")
   api project(":h2o-ext-krbstandalone")
-  if (project.hasProperty("doIncludeOrc") && project.doIncludeOrc == "true") {
-    api project(":h2o-orc-parser")
-  }
   api project(":h2o-parquet-parser")
+  api("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
+    transitive = false
+  }
   api project(":h2o-k8s-int")
+  api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
+  implementation("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
+    transitive = false
+  }
+  api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
+    exclude group: "com.sun.jersey"
+    exclude group: "javax.servlet"
+    exclude group: "org.apache.avro"
+    exclude group: "org.apache.curator"
+    exclude group: "org.apache.zookeeper"
+    exclude group: "org.eclipse.jetty"
+  }
+
+  // Need a newer org.apache.hadoop.hive.shims.ShimLoader to make older Hive JDBC drivers work on Hadoop 3.
+  implementation 'org.apache.hive.shims:hive-shims-common:2.3.9'
 
   constraints {
     api('com.fasterxml.jackson.core:jackson-databind:2.13.4.2') {
       because 'Fixes CVE-2022-42003'
@@ -50,6 +66,17 @@ shadowJar {
   exclude 'test.properties'
   exclude 'cockpitlite.properties'
   exclude 'devpay_products.properties'
+
+  // Need a newer org.apache.hadoop.hive.shims.ShimLoader to make older Hive JDBC drivers work on Hadoop 3.
+  // Exclude all other classes of org.apache.hive.shims:hive-shims-common.
+  exclude 'org/apache/hadoop/hive/thrift/**/*.*'
+  exclude 'org/apache/hadoop/hive/io/**/*.*'
+  exclude 'org/apache/hadoop/hive/upgrade/**/*.*'
+  exclude 'org/apache/hadoop/hive/shims/Utils.*'
+  exclude 'org/apache/hadoop/hive/shims/CombineHiveKey.*'
+  exclude 'org/apache/hadoop/hive/shims/*Shims*.*'
+  exclude 'org/apache/hadoop/hive/shims/HiveHarFileSystem.*'
+
   manifest {
     attributes 'Main-Class': 'water.H2OApp'
     attributes 'Add-Opens': 'java.base/java.lang java.base/java.util java.base/java.lang.reflect'
diff --git a/h2o-assemblies/steam/build.gradle b/h2o-assemblies/steam/build.gradle
index 7de32060f6f4..210cb0d94e6c 100644
--- a/h2o-assemblies/steam/build.gradle
+++ b/h2o-assemblies/steam/build.gradle
@@ -25,15 +25,13 @@ dependencies {
   api(project(":h2o-persist-s3")) {
     exclude group: "org.apache.hadoop"
   }
-  api(project(":h2o-persist-hdfs")) {
-    exclude group: "org.apache.hadoop"
-  }
+  api(project(":h2o-persist-hdfs"))
   api(project(":h2o-parquet-parser")) {
     exclude group: "org.apache.hadoop"
   }
   // Force latest version Hadoop with unused components excluded - we need Hadoop for Parquet and S3A export
-  api "org.apache.hadoop:hadoop-hdfs-client:3.3.5"
-  api("org.apache.hadoop:hadoop-common:3.3.5") {
+  api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
+  api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
     exclude group: "com.sun.jersey"
     exclude group: "javax.servlet"
     exclude group: "org.apache.avro"
@@ -42,18 +40,11 @@ dependencies {
     exclude group: "org.eclipse.jetty"
     exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7"
   }
-  api("org.apache.hadoop:hadoop-aws:3.3.5") {
-    exclude group: "com.amazonaws", module: "aws-java-sdk-bundle"
-  }
-  // aws-java-sdk-dynamodb is required for S3A support, S3A import throws NoClassDefFoundError (AmazonDynamoDBException)
-  api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") {
-    transitive = false
-  }
   // Upgrade dependencies coming from Hadoop to address vulnerabilities
   api "org.apache.commons:commons-compress:1.21"
   // Force specific Parquet version to avoid dependency on vulnerable FasterXML jackson-mapper-asl
   api "org.apache.parquet:parquet-hadoop:${defaultParquetVersion}"
-  api("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.5") {
+  api("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
     transitive = false
   }
   // Google OAuth force version
diff --git a/h2o-extensions/krbstandalone/build.gradle b/h2o-extensions/krbstandalone/build.gradle
index 9ac573ace6cc..78f882606bfb 100644
--- a/h2o-extensions/krbstandalone/build.gradle
+++ b/h2o-extensions/krbstandalone/build.gradle
@@ -3,6 +3,7 @@ description = "H2O Kerberos Standalone support"
 dependencies {
   api project(":h2o-core")
   api project(":h2o-persist-hdfs")
+  compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
   api("org.apache.hadoop:hadoop-auth:$defaultHadoopVersion") {
     // Pull all dependencies to allow run directly from IDE or command line
     transitive = true
diff --git a/h2o-parsers/h2o-orc-parser/build.gradle b/h2o-parsers/h2o-orc-parser/build.gradle
index dd992438cf72..09308d37da14 100644
--- a/h2o-parsers/h2o-orc-parser/build.gradle
+++ b/h2o-parsers/h2o-orc-parser/build.gradle
@@ -12,8 +12,10 @@ configurations{
 }
 
 dependencies {
-  hadoopCommonExclude("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}")
-  hiveExecExclude("org.apache.hive:hive-exec:$defaultHiveExecVersion"){
+  def hadoopVersion="2.8.4"
+  def hiveExecVersion="1.1.0"
hadoopCommonExclude("org.apache.hadoop:hadoop-common:$hadoopVersion") + hiveExecExclude("org.apache.hive:hive-exec:$hiveExecVersion"){ // this dependency need to be excluded manually as Gradle can't find it in maven central exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm' exclude group: 'eigenbase', module: 'eigenbase-properties' @@ -23,14 +25,13 @@ dependencies { api(project(":h2o-persist-hdfs")) { exclude group: 'ai.h2o', module: 'h2o-core' exclude group: 'net.java.dev.jets3t', module: 'jets3t' - exclude group: 'org.apache.hadoop', module: 'hadoop-client' - exclude group: 'org.apache.hadoop', module: 'hadoop-aws' + exclude group: 'org.apache.hadoop' } // Note: What is connection between hive-exec version and hadoop-version and orc version? // Note: In this case we are using hive version which is compatible with $defaultHadoopVersion // Note: for newest version it should be replaces by hive-orc - api("org.apache.hive:hive-exec:$defaultHiveExecVersion") { + api("org.apache.hive:hive-exec:$hiveExecVersion") { // we can't use transitive=false so we need to exclude the dependencies manually configurations.hiveExecExclude.getResolvedConfiguration().getResolvedArtifacts().each { if (it.moduleVersion.id.group != "org.apache.hive" && it.moduleVersion.id.module.name != "hive-exec") { @@ -40,7 +41,7 @@ dependencies { exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm' } // For compilation we need common - api("org.apache.hadoop:hadoop-common:$defaultHadoopVersion") { + api("org.apache.hadoop:hadoop-common:$hadoopVersion") { // we can't use transitive=false so we need to exclude the dependencies manually configurations.hadoopCommonExclude.getResolvedConfiguration().getResolvedArtifacts().each { if (it.moduleVersion.id.group != "org.apache.hadoop" && it.moduleVersion.id.module.name != "hadoop-common") { @@ -52,7 +53,7 @@ dependencies { testImplementation project(":h2o-test-support") testRuntimeOnly project(":${defaultWebserverModule}") // We need correct version of MapRe Hadoop to run JUnits - testRuntimeOnly("org.apache.hadoop:hadoop-client:$defaultHadoopVersion") { + testRuntimeOnly("org.apache.hadoop:hadoop-client:$hadoopVersion") { exclude module: "jasper-runtime" exclude module: "jasper-compiler" exclude module: "curator-client" diff --git a/h2o-parsers/h2o-parquet-compat/h2o-parquet-v17-compat/build.gradle b/h2o-parsers/h2o-parquet-compat/h2o-parquet-v17-compat/build.gradle index 1b3b333ef1db..77905c235cd2 100644 --- a/h2o-parsers/h2o-parquet-compat/h2o-parquet-v17-compat/build.gradle +++ b/h2o-parsers/h2o-parquet-compat/h2o-parquet-v17-compat/build.gradle @@ -16,9 +16,7 @@ dependencies { } // Parquet support api("org.apache.parquet:parquet-hadoop:1.7.0") - api("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") { - transitive = false - } + compileOnly("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") testImplementation project(":h2o-test-support") testImplementation project(":h2o-parquet-parser-tests") @@ -28,6 +26,7 @@ dependencies { testImplementation("org.apache.hadoop:hadoop-client:${parquetHadoopVersion}") { exclude module: "servlet-api" } + testImplementation("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") } apply from: "${rootDir}/gradle/dataCheck.gradle" diff --git a/h2o-parsers/h2o-parquet-parser/build.gradle b/h2o-parsers/h2o-parquet-parser/build.gradle index 79d171bd5b5a..00a41816fd15 100644 --- a/h2o-parsers/h2o-parquet-parser/build.gradle +++ b/h2o-parsers/h2o-parquet-parser/build.gradle @@ -6,6 +6,9 @@ description 
= "H2O Parquet Parser" dependencies { testImplementation project(":h2o-test-support") testImplementation project(":h2o-parquet-parser-tests") + testImplementation("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") { + transitive = false + } testRuntimeOnly project(":${defaultWebserverModule}") } diff --git a/h2o-parsers/h2o-parquet-parser/parquet_dependencies.gradle b/h2o-parsers/h2o-parquet-parser/parquet_dependencies.gradle index 4abb69601464..dbfe5b30a52b 100644 --- a/h2o-parsers/h2o-parquet-parser/parquet_dependencies.gradle +++ b/h2o-parsers/h2o-parquet-parser/parquet_dependencies.gradle @@ -1,14 +1,7 @@ def parquetHadoopVersion = binding.variables.get("hadoopVersion") ? binding.variables.get("hadoopVersion") : defaultHadoopVersion -configurations{ - // Configuration used to get all transitive dependencies for org.apache.hadoop:hadoop-common - hadoopCommonExclude -} - dependencies { - hadoopCommonExclude("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") - api project(":h2o-core") api(project(":h2o-persist-hdfs")) { exclude group: 'ai.h2o', module: 'h2o-core' @@ -21,15 +14,6 @@ dependencies { api("org.apache.parquet:parquet-hadoop:${defaultParquetVersion}") - api("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") { - // we can't use transitive=false so we need to exclude the dependencies manually - configurations.hadoopCommonExclude.getResolvedConfiguration().getResolvedArtifacts().each { - if (it.moduleVersion.id.group != "org.apache.hadoop" && it.moduleVersion.id.module.name != "hadoop-common") { - exclude group: it.moduleVersion.id.group, module: it.moduleVersion.id.module.name - } - } - } - implementation("org.apache.hadoop:hadoop-mapreduce-client-core:${parquetHadoopVersion}") { - transitive = false - } + compileOnly("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") + testImplementation("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") } diff --git a/h2o-persist-hdfs/build.gradle b/h2o-persist-hdfs/build.gradle index 1822c9b6a69e..6420975e036c 100644 --- a/h2o-persist-hdfs/build.gradle +++ b/h2o-persist-hdfs/build.gradle @@ -8,15 +8,25 @@ configurations { dependencies { api project(":h2o-core") + compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion") api("org.apache.hadoop:$defaultHdfsDependency:$defaultHadoopVersion") { // Pull all dependencies to allow run directly from IDE or command line transitive = true } - api("org.apache.hadoop:hadoop-aws:$defaultHadoopVersion") + api("org.apache.hadoop:hadoop-aws:${defaultHadoopVersion}") { + exclude group: "com.amazonaws", module: "aws-java-sdk-bundle" + } + // aws-java-sdk-dynamodb is required for S3A support, S3A import throws NoClassDefFoundError (AmazonDynamoDBException) + api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") { + transitive = false + } + api("com.nimbusds:nimbus-jose-jwt:9.11.3") testImplementation project(":h2o-test-support") + testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion" testImplementation "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}" + testImplementation "org.jets3t:jets3t:0.9.7" testRuntimeOnly project(":${defaultWebserverModule}") testRuntimeOnly project(":h2o-persist-s3") } diff --git a/h2o-persist-hdfs/src/test/java/water/persist/PersistHdfsTest.java b/h2o-persist-hdfs/src/test/java/water/persist/PersistHdfsTest.java index 4e6e2153f09b..53fa98fbc67b 100644 --- a/h2o-persist-hdfs/src/test/java/water/persist/PersistHdfsTest.java +++ 
+++ b/h2o-persist-hdfs/src/test/java/water/persist/PersistHdfsTest.java
@@ -23,8 +23,10 @@ public class PersistHdfsTest extends TestUtil {
 
   @Parameterized.Parameters(name = "{index}: scheme={0}")
-  public static Object[] schemes() {
-    return new Object[] { "s3n", "s3a" };
+  public static Object[] schemes() {
+    return new Object[] {
+        // "s3n", - s3n is not supported by hadoop-aws 3.0+
+        "s3a" };
   }
 
   @Parameterized.Parameter
diff --git a/h2o-persist-hdfs/src/test/java/water/persist/PersistS3HdfsTest.java b/h2o-persist-hdfs/src/test/java/water/persist/PersistS3HdfsTest.java
index ce44b634defe..b6a746339d81 100644
--- a/h2o-persist-hdfs/src/test/java/water/persist/PersistS3HdfsTest.java
+++ b/h2o-persist-hdfs/src/test/java/water/persist/PersistS3HdfsTest.java
@@ -2,11 +2,9 @@
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.s3.S3FileSystem;
-import org.jets3t.service.S3Service;
-import org.jets3t.service.model.S3Object;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import com.amazonaws.services.s3.model.S3Object;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
@@ -37,29 +35,14 @@ public void testPubDev5663() throws Exception { // Demonstrates that S3FileSyste
     PersistHdfs hdfsPersist = (PersistHdfs) H2O.getPM().getPersistForURI(URI.create("hdfs://localhost/"));
 
-    String existing = "s3://" + bucket + "/" + key;
+    String existing = "s3a://" + bucket + "/" + key;
     Path p = new Path(existing);
 
-    S3FileSystem fs = (S3FileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
-    // use crazy reflection to get to the actual S3 Service instance
-    S3Service s3Service = (S3Service) getValue(fs, "store", "h", "proxyDescriptor", "fpp", "proxy", "s3Service");
-
-    S3Object s3Object = s3Service.getObject(bucket, key);
+    S3AFileSystem fs = (S3AFileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
+    S3Object s3Object = fs.getAmazonS3ClientForTesting("testPubDev5663").getObject(bucket, key);
+    assertNotNull(s3Object); // The object exists
-    assertFalse(fs.exists(p)); // But FS says it doesn't => S3 is broken in Hadoop
-    assertFalse(hdfsPersist.exists(existing)); // Our persist gives the same result
-  }
-
-  private Object getValue(Object o, String... fieldNames) {
-    StringBuilder path = new StringBuilder(o.getClass().getName());
-    for (String f : fieldNames) {
-      path.append('.').append(f);
-      Object no = ReflectionUtils.getFieldValue(o, f);
-      if (no == null)
-        throw new IllegalStateException("Invalid path: " + path.toString() + ", object is instance of " + o.getClass());
-      o = no;
-    }
-    return o;
+    assert(fs.exists(p)); // But FS says it exists as well.
+    assert(hdfsPersist.exists(existing)); // Our persist gives the same result
   }
-
 }
diff --git a/h2o-persist-s3/build.gradle b/h2o-persist-s3/build.gradle
index 4fe10af31c97..cdec1d6441ce 100644
--- a/h2o-persist-s3/build.gradle
+++ b/h2o-persist-s3/build.gradle
@@ -7,6 +7,7 @@ configurations {
 
 dependencies {
   api project(":h2o-core")
+  compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
   api "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}"
   api "com.amazonaws:aws-java-sdk-sts:${awsJavaSdkVersion}" // Required by WebIdentityTokenCredentialsProvider from AWS SDK
   api "org.apache.httpcomponents:httpclient:${httpClientVersion}"
@@ -19,6 +20,10 @@ dependencies {
   testRuntimeOnly project(":${defaultWebserverModule}")
   testRuntimeOnly project(":h2o-parquet-parser")
   testImplementation project(":h2o-persist-hdfs")
+  testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion"
+  testImplementation("org.apache.hadoop:hadoop-mapreduce-client-core:$defaultHadoopVersion") {
+    transitive = false
+  }
 }
 
 apply from: "${rootDir}/gradle/dataCheck.gradle"
diff --git a/scripts/jenkins/groovy/defineTestStages.groovy b/scripts/jenkins/groovy/defineTestStages.groovy
index e34c0057fe38..5792dab4123c 100644
--- a/scripts/jenkins/groovy/defineTestStages.groovy
+++ b/scripts/jenkins/groovy/defineTestStages.groovy
@@ -589,7 +589,8 @@ def call(final pipelineContext) {
           version: distribution.version,
           commandFactory: 'h2o-3/scripts/jenkins/groovy/hadoopCommands.groovy',
           ldapConfigPath: ldapConfigPath,
-          ldapConfigPathStandalone: 'scripts/jenkins/config/ldap-jetty-9.txt'
+          ldapConfigPathStandalone: 'scripts/jenkins/config/ldap-jetty-9.txt',
+          bundledS3FileSystems: 's3a,s3n'
         ],
         pythonVersion: '3.6',
         customDockerArgs: [ '--privileged' ],
@@ -599,6 +600,7 @@ def call(final pipelineContext) {
       def standaloneStage = evaluate(stageTemplate.inspect())
       standaloneStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE"
       standaloneStage.customData.mode = 'STANDALONE'
+      standaloneStage.customData.bundledS3FileSystems = 's3a'
 
       def onHadoopStage = evaluate(stageTemplate.inspect())
       onHadoopStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - HADOOP"
@@ -672,10 +674,12 @@ def call(final pipelineContext) {
       def standaloneStage = evaluate(stageTemplate.inspect())
       standaloneStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE"
       standaloneStage.customData.mode = 'STANDALONE'
+      standaloneStage.customData.bundledS3FileSystems = 's3a'
 
       def standaloneKeytabStage = evaluate(stageTemplate.inspect())
       standaloneKeytabStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE KEYTAB"
       standaloneKeytabStage.customData.mode = 'STANDALONE_KEYTAB'
+      standaloneKeytabStage.customData.bundledS3FileSystems = 's3a'
 
       def standaloneDriverKeytabStage = evaluate(stageTemplate.inspect())
       standaloneDriverKeytabStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - DRIVER KEYTAB"
diff --git a/scripts/jenkins/groovy/hadoopCommands.groovy b/scripts/jenkins/groovy/hadoopCommands.groovy
index 34e440d37fe8..44e175491745 100644
--- a/scripts/jenkins/groovy/hadoopCommands.groovy
+++ b/scripts/jenkins/groovy/hadoopCommands.groovy
@@ -81,6 +81,7 @@ private GString getCommandStandalone(final stageConfig) {
         fi
         export CLOUD_IP=\$(hostname --ip-address)
         export CLOUD_PORT=${defaultPort}
+        export HADOOP_S3_FILESYSTEMS=${stageConfig.customData.bundledS3FileSystems}
     """
 }