From c09138ba9696584f38a48b15b4fc2d2e43b14183 Mon Sep 17 00:00:00 2001 From: limin Date: Wed, 30 Oct 2024 12:54:59 +0800 Subject: [PATCH] [Feature][Zeta] Add COS support for checkpoint storage (#7931) --- .../en/seatunnel-engine/checkpoint-storage.md | 38 +++++++++++++++- .../zh/seatunnel-engine/checkpoint-storage.md | 38 +++++++++++++++- .../checkpoint-storage-hdfs/pom.xml | 6 +++ .../storage/hdfs/common/CosConfiguration.java | 44 +++++++++++++++++++ .../hdfs/common/FileConfiguration.java | 3 +- 5 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/CosConfiguration.java diff --git a/docs/en/seatunnel-engine/checkpoint-storage.md b/docs/en/seatunnel-engine/checkpoint-storage.md index 7027f8067fb..19c617e0154 100644 --- a/docs/en/seatunnel-engine/checkpoint-storage.md +++ b/docs/en/seatunnel-engine/checkpoint-storage.md @@ -14,7 +14,7 @@ Checkpoint Storage is a storage mechanism for storing checkpoint data. SeaTunnel Engine supports the following checkpoint storage types: -- HDFS (OSS,S3,HDFS,LocalFile) +- HDFS (OSS,COS,S3,HDFS,LocalFile) - LocalFile (native), (it's deprecated: use Hdfs(LocalFile) instead. We use the microkernel design pattern to separate the checkpoint storage module from the engine. This allows users to implement their own checkpoint storage modules. @@ -73,6 +73,42 @@ For additional reading on the Hadoop Credential Provider API, you can see: [Cred For Aliyun OSS Credential Provider implements, you can see: [Auth Credential Providers](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) +#### COS + +Tencent COS is based on hdfs-file, so you can refer to the [Hadoop COS Docs](https://hadoop.apache.org/docs/stable/hadoop-cos/cloud-storage/) to configure COS. 
+ +Except when interacting with public COS buckets, the COS client needs credentials to interact with buckets. +The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of com.qcloud.cos.auth.COSCredentialsProvider may also be used. +If you use SimpleCredentialsProvider, the credentials (which can be obtained from Tencent Cloud API Key Management) consist of a secretId and a secretKey. +You can configure it like this: + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: cos + cos.bucket: cosn://your-bucket + fs.cosn.credentials.provider: org.apache.hadoop.fs.cosn.auth.SimpleCredentialsProvider + fs.cosn.userinfo.secretId: your-secretId + fs.cosn.userinfo.secretKey: your-secretKey + fs.cosn.bucket.region: your-region +``` + +For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). + +For additional COS configuration, you can see: [Tencent Hadoop-COS Docs](https://doc.fincloud.tencent.cn/tcloud/Storage/COS/846365/hadoop) + +Please add the following jars to the lib directory: +- [hadoop-cos-3.4.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-cos/3.4.1) +- [cos_api-bundle-5.6.69.jar](https://mvnrepository.com/artifact/com.qcloud/cos_api-bundle/5.6.69) +- [hadoop-shaded-guava-1.1.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop.thirdparty/hadoop-shaded-guava/1.1.1) + #### S3 S3 based hdfs-file you can refer [hadoop s3 docs](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) to config s3. 
diff --git a/docs/zh/seatunnel-engine/checkpoint-storage.md b/docs/zh/seatunnel-engine/checkpoint-storage.md index 86165d5d3be..a60fdff5ae0 100644 --- a/docs/zh/seatunnel-engine/checkpoint-storage.md +++ b/docs/zh/seatunnel-engine/checkpoint-storage.md @@ -12,7 +12,7 @@ sidebar_position: 7 SeaTunnel Engine支持以下检查点存储类型: -- HDFS (OSS,S3,HDFS,LocalFile) +- HDFS (OSS,COS,S3,HDFS,LocalFile) - LocalFile (本地),(已弃用: 使用HDFS(LocalFile)替代). 我们使用微内核设计模式将检查点存储模块从引擎中分离出来。这允许用户实现他们自己的检查点存储模块。 @@ -71,6 +71,42 @@ seatunnel: 阿里云OSS凭证提供程序实现见: [验证凭证提供](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) +#### COS + +腾讯云COS基于hdfs-file,所以你可以参考[Hadoop COS文档](https://hadoop.apache.org/docs/stable/hadoop-cos/cloud-storage/)来配置COS. + +除了与公共COS buckets交互之外,COS客户端需要与buckets交互所需的凭据。 +客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用com.qcloud.cos.auth.COSCredentialsProvider的自定义实现。 +如果您使用SimpleCredentialsProvider(可以从腾讯云API密钥管理中获得),它们包括一个secretId和一个secretKey。 +您可以这样配置: + +```yaml +seatunnel: + engine: + checkpoint: + interval: 6000 + timeout: 7000 + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: cos + cos.bucket: cosn://your-bucket + fs.cosn.credentials.provider: org.apache.hadoop.fs.cosn.auth.SimpleCredentialsProvider + fs.cosn.userinfo.secretId: your-secretId + fs.cosn.userinfo.secretKey: your-secretKey + fs.cosn.bucket.region: your-region +``` + +有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). 
+ +腾讯云COS相关配置可参考:[Tencent Hadoop-COS文档](https://doc.fincloud.tencent.cn/tcloud/Storage/COS/846365/hadoop) + +使用前请将如下jar添加到lib目录下: +- [hadoop-cos-3.4.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-cos/3.4.1) +- [cos_api-bundle-5.6.69.jar](https://mvnrepository.com/artifact/com.qcloud/cos_api-bundle/5.6.69) +- [hadoop-shaded-guava-1.1.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop.thirdparty/hadoop-shaded-guava/1.1.1) + #### S3 S3基于hdfs-file,所以你可以参考[Hadoop s3文档](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)来配置s3。 diff --git a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/pom.xml b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/pom.xml index f7107f9f32b..8ae75cddd55 100644 --- a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/pom.xml +++ b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/pom.xml @@ -65,6 +65,12 @@ 1.11.271 provided + + org.apache.hadoop + hadoop-cos + 3.4.1 + provided + diff --git a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/CosConfiguration.java b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/CosConfiguration.java new file mode 100644 index 00000000000..56fdc743621 --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/CosConfiguration.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +package org.apache.seatunnel.engine.checkpoint.storage.hdfs.common; + +import org.apache.hadoop.conf.Configuration; + +import java.util.Map; + +import static org.apache.hadoop.fs.FileSystem.FS_DEFAULT_NAME_KEY; + +public class CosConfiguration extends AbstractConfiguration { + public static final String COS_BUCKET_KEY = "cos.bucket"; + private static final String COS_IMPL_KEY = "fs.cosn.impl"; + private static final String HDFS_COS_IMPL = "org.apache.hadoop.fs.cosn.CosNFileSystem"; + private static final String COS_KEY = "fs.cosn."; + + @Override + public Configuration buildConfiguration(Map<String, String> config) { + checkConfiguration(config, COS_BUCKET_KEY); + Configuration hadoopConf = new Configuration(); + hadoopConf.set(FS_DEFAULT_NAME_KEY, config.get(COS_BUCKET_KEY)); + hadoopConf.set(COS_IMPL_KEY, HDFS_COS_IMPL); + setExtraConfiguration(hadoopConf, config, COS_KEY); + return hadoopConf; + } +} diff --git a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/FileConfiguration.java 
b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/FileConfiguration.java index a9b30346ed7..a1904e5fcb9 100644 --- a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/FileConfiguration.java +++ b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/FileConfiguration.java @@ -24,7 +24,8 @@ public enum FileConfiguration { LOCAL("local", new LocalConfiguration()), HDFS("hdfs", new HdfsConfiguration()), S3("s3", new S3Configuration()), - OSS("oss", new OssConfiguration()); + OSS("oss", new OssConfiguration()), + COS("cos", new CosConfiguration()); /** file system type */ private final String name;