Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GH-15470] Upgrade Hadoop Libraries in Main Standalone Jar #15469

Merged
merged 27 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5886b4d
[PUBDEV-9089] Upgrade Hadoop Libraries in Main Standalone Jar
mn-mikke May 18, 2023
a1856f8
Add hadoop commons
mn-mikke Jul 10, 2023
e9cb550
Add some transitive dependencies for hdfs
mn-mikke Jul 14, 2023
219664e
Remove orc dependency for main
mn-mikke Jul 18, 2023
99e1396
Specify different version of hadoop for orc
mn-mikke Jul 19, 2023
1b13d0f
remove hive exec version spec from hive properties
mn-mikke Jul 19, 2023
16c6c68
Change hadoop version in orc tests
mn-mikke Jul 20, 2023
a9ce21b
Fix usage of http client
mn-mikke Jul 20, 2023
4d48215
Add libs for s3 persist tests
mn-mikke Jul 21, 2023
a218325
Update tests
mn-mikke Jul 21, 2023
99b4a6b
Update tests
mn-mikke Jul 24, 2023
1668d88
add hadoop-common to persist-s3 tests
mn-mikke Jul 25, 2023
b777d3a
Refactor usage of hadoop-mapreduce-client-core
mn-mikke Jul 26, 2023
237be03
update usage of hadoop-common
mn-mikke Jul 26, 2023
6b5bb92
try to relocate hadoop libraries
mn-mikke Jul 26, 2023
49d27fb
Revert relocation
mn-mikke Jul 27, 2023
85720a9
Relocated hadoop libs
mn-mikke Jul 27, 2023
9aa9407
add hadoop-common.jar on tests
mn-mikke Aug 15, 2023
b341fbc
exclude org.apache.hadoop.net.DNSDomainNameResolver from relocation
mn-mikke Aug 15, 2023
db72320
exclude org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback
mn-mikke Aug 15, 2023
c8d8cf1
include hive shims common
mn-mikke Aug 16, 2023
3d38927
remove hadoop common
mn-mikke Aug 16, 2023
f55fd26
remove s3n filesystem from hadoop tests
mn-mikke Aug 16, 2023
8e0a377
Upgrade hive shims common
mn-mikke Aug 16, 2023
899fa99
Exclude most of the Shim classes
mn-mikke Aug 16, 2023
901caf0
Add some comments to build.gradle about extensions
mn-mikke Aug 17, 2023
46cbd20
Fix S3 tests
mn-mikke Aug 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ doFindbugs=false
# Run animal sniffer to verify compatibility of API with actual Java version
doAnimalSniffer=false

# include ORC support inside default h2o.jar.
doIncludeOrc=false

# include MOJO Pipeline support inside default h2o.jar.
doIncludeMojoPipeline=false

Expand Down Expand Up @@ -55,12 +52,9 @@ httpClientVersion=4.5.2
defaultParquetVersion=1.12.3

# Default Hadoop client version
defaultHadoopVersion=2.8.4
defaultHadoopVersion=3.3.5
defaultHdfsDependency=hadoop-hdfs-client

# Default Hive version
defaultHiveExecVersion=1.1.0

defaultWebserverModule=h2o-jetty-9
# default module to be included in assemblies
defaultExtWebserverModule=h2o-jetty-9-ext
Expand Down
14 changes: 11 additions & 3 deletions h2o-assemblies/main/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,20 @@ dependencies {
api project(":h2o-persist-http")
api project(":h2o-persist-hdfs")
api project(":h2o-ext-krbstandalone")
if (project.hasProperty("doIncludeOrc") && project.doIncludeOrc == "true") {
api project(":h2o-orc-parser")
}
api project(":h2o-parquet-parser")
api project(":h2o-k8s-int")

api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
exclude group: "com.sun.jersey"
exclude group: "javax.servlet"
exclude group: "org.apache.avro"
exclude group: "org.apache.curator"
exclude group: "org.apache.zookeeper"
exclude group: "org.eclipse.jetty"
exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7"
}

constraints {
api('com.fasterxml.jackson.core:jackson-databind:2.13.4.2') {
because 'Fixes CVE-2022-42003'
Expand Down
17 changes: 4 additions & 13 deletions h2o-assemblies/steam/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,13 @@ dependencies {
api(project(":h2o-persist-s3")) {
exclude group: "org.apache.hadoop"
}
api(project(":h2o-persist-hdfs")) {
exclude group: "org.apache.hadoop"
}
api(project(":h2o-persist-hdfs"))
api(project(":h2o-parquet-parser")) {
exclude group: "org.apache.hadoop"
}
// Force latest version Hadoop with unused components excluded - we need Hadoop for Parquet and S3A export
api "org.apache.hadoop:hadoop-hdfs-client:3.3.5"
api("org.apache.hadoop:hadoop-common:3.3.5") {
api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
exclude group: "com.sun.jersey"
exclude group: "javax.servlet"
exclude group: "org.apache.avro"
Expand All @@ -42,18 +40,11 @@ dependencies {
exclude group: "org.eclipse.jetty"
exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7"
}
api("org.apache.hadoop:hadoop-aws:3.3.5") {
exclude group: "com.amazonaws", module: "aws-java-sdk-bundle"
}
// aws-java-sdk-dynamodb is required for S3A support, S3A import throws NoClassDefFoundError (AmazonDynamoDBException)
api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") {
transitive = false
}
// Upgrade dependencies coming from Hadoop to address vulnerabilities
api "org.apache.commons:commons-compress:1.21"
// Force specific Parquet version to avoid dependency on vulnerable FasterXML jackson-mapper-asl
api "org.apache.parquet:parquet-hadoop:${defaultParquetVersion}"
api("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.5") {
api("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
transitive = false
}
// Google OAuth force version
Expand Down
1 change: 1 addition & 0 deletions h2o-extensions/krbstandalone/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ description = "H2O Kerberos Standalone support"
dependencies {
api project(":h2o-core")
api project(":h2o-persist-hdfs")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api("org.apache.hadoop:hadoop-auth:$defaultHadoopVersion") {
// Pull all dependencies to allow running directly from IDE or command line
transitive = true
Expand Down
15 changes: 8 additions & 7 deletions h2o-parsers/h2o-orc-parser/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ configurations{
}

dependencies {
hadoopCommonExclude("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}")
hiveExecExclude("org.apache.hive:hive-exec:$defaultHiveExecVersion"){
def hadoopVersion="2.8.4"
def hiveExecVersion="1.1.0"
hadoopCommonExclude("org.apache.hadoop:hadoop-common:$hadoopVersion")
hiveExecExclude("org.apache.hive:hive-exec:$hiveExecVersion"){
// this dependency needs to be excluded manually as Gradle can't find it in Maven Central
exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm'
exclude group: 'eigenbase', module: 'eigenbase-properties'
Expand All @@ -23,14 +25,13 @@ dependencies {
api(project(":h2o-persist-hdfs")) {
exclude group: 'ai.h2o', module: 'h2o-core'
exclude group: 'net.java.dev.jets3t', module: 'jets3t'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
exclude group: 'org.apache.hadoop', module: 'hadoop-aws'
exclude group: 'org.apache.hadoop'
}

// Note: What is connection between hive-exec version and hadoop-version and orc version?
// Note: In this case we are using hive version which is compatible with $defaultHadoopVersion
// Note: for the newest version it should be replaced by hive-orc
api("org.apache.hive:hive-exec:$defaultHiveExecVersion") {
api("org.apache.hive:hive-exec:$hiveExecVersion") {
// we can't use transitive=false so we need to exclude the dependencies manually
configurations.hiveExecExclude.getResolvedConfiguration().getResolvedArtifacts().each {
if (it.moduleVersion.id.group != "org.apache.hive" && it.moduleVersion.id.module.name != "hive-exec") {
Expand All @@ -40,7 +41,7 @@ dependencies {
exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm'
}
// For compilation we need common
api("org.apache.hadoop:hadoop-common:$defaultHadoopVersion") {
api("org.apache.hadoop:hadoop-common:$hadoopVersion") {
// we can't use transitive=false so we need to exclude the dependencies manually
configurations.hadoopCommonExclude.getResolvedConfiguration().getResolvedArtifacts().each {
if (it.moduleVersion.id.group != "org.apache.hadoop" && it.moduleVersion.id.module.name != "hadoop-common") {
Expand All @@ -52,7 +53,7 @@ dependencies {
testImplementation project(":h2o-test-support")
testRuntimeOnly project(":${defaultWebserverModule}")
// We need the correct version of MapR Hadoop to run JUnits
testRuntimeOnly("org.apache.hadoop:hadoop-client:$defaultHadoopVersion") {
testRuntimeOnly("org.apache.hadoop:hadoop-client:$hadoopVersion") {
exclude module: "jasper-runtime"
exclude module: "jasper-compiler"
exclude module: "curator-client"
Expand Down
12 changes: 11 additions & 1 deletion h2o-persist-hdfs/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,25 @@ configurations {

dependencies {
api project(":h2o-core")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api("org.apache.hadoop:$defaultHdfsDependency:$defaultHadoopVersion") {
// Pull all dependencies to allow running directly from IDE or command line
transitive = true
}
api("org.apache.hadoop:hadoop-aws:$defaultHadoopVersion")
api("org.apache.hadoop:hadoop-aws:${defaultHadoopVersion}") {
exclude group: "com.amazonaws", module: "aws-java-sdk-bundle"
}
// aws-java-sdk-dynamodb is required for S3A support, S3A import throws NoClassDefFoundError (AmazonDynamoDBException)
api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") {
transitive = false
}

api("com.nimbusds:nimbus-jose-jwt:9.11.3")

testImplementation project(":h2o-test-support")
testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion"
testImplementation "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}"
testImplementation "org.jets3t:jets3t:0.9.7"
testRuntimeOnly project(":${defaultWebserverModule}")
testRuntimeOnly project(":h2o-persist-s3")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
public class PersistHdfsTest extends TestUtil {

@Parameterized.Parameters(name = "{index}: scheme={0}")
public static Object[] schemes() {
return new Object[] { "s3n", "s3a" };
public static Object[] schemes() {
return new Object[] {
// "s3n", - s3n is not supported by hadoop-aws 3.0+
"s3a" };
}

@Parameterized.Parameter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3.S3FileSystem;
import org.jets3t.service.S3Service;
import org.jets3t.service.model.S3Object;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import com.amazonaws.services.s3.model.S3Object;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
Expand Down Expand Up @@ -37,29 +35,14 @@ public void testPubDev5663() throws Exception { // Demonstrates that S3FileSyste

PersistHdfs hdfsPersist = (PersistHdfs) H2O.getPM().getPersistForURI(URI.create("hdfs://localhost/"));

String existing = "s3://" + bucket + "/" + key;
String existing = "s3a://" + bucket + "/" + key;
Path p = new Path(existing);

S3FileSystem fs = (S3FileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
// use crazy reflection to get to the actual S3 Service instance
S3Service s3Service = (S3Service) getValue(fs, "store", "h", "proxyDescriptor", "fpp", "proxy", "s3Service");

S3Object s3Object = s3Service.getObject(bucket, key);
S3AFileSystem fs = (S3AFileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
S3Object s3Object = fs.getAmazonS3ClientForTesting("testPubDev5663").getObject(bucket, key);

assertNotNull(s3Object); // The object exists
assertFalse(fs.exists(p)); // But FS says it doesn't => S3 is broken in Hadoop
assertFalse(hdfsPersist.exists(existing)); // Our persist gives the same result
}

private Object getValue(Object o, String... fieldNames) {
StringBuilder path = new StringBuilder(o.getClass().getName());
for (String f : fieldNames) {
path.append('.').append(f);
Object no = ReflectionUtils.getFieldValue(o, f);
if (no == null)
throw new IllegalStateException("Invalid path: " + path.toString() + ", object is instance of " + o.getClass());
o = no;
}
return o;
assert(fs.exists(p)); // But FS says it exists as well.
assert(hdfsPersist.exists(existing)); // Our persist gives the same result
}

}
2 changes: 2 additions & 0 deletions h2o-persist-s3/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ configurations {

dependencies {
api project(":h2o-core")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}"
api "com.amazonaws:aws-java-sdk-sts:${awsJavaSdkVersion}" // Required by WebIdentityTokenCredentialsProvider from AWS SDK
api "org.apache.httpcomponents:httpclient:${httpClientVersion}"
Expand All @@ -19,6 +20,7 @@ dependencies {
testRuntimeOnly project(":${defaultWebserverModule}")
testRuntimeOnly project(":h2o-parquet-parser")
testImplementation project(":h2o-persist-hdfs")
testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion"
}

apply from: "${rootDir}/gradle/dataCheck.gradle"
Expand Down