From cb30d41fea3af7674f106bbafda5400f64002cd8 Mon Sep 17 00:00:00 2001
From: David Sloan <33483659+davidsloan@users.noreply.github.com>
Date: Mon, 18 Sep 2023 15:45:38 +0100
Subject: [PATCH] Partition Name Keys & Naming Strategies Merge & Adding Extra
 Configuration Options (#984)

* Merges naming strategies to reduce code complexity and make them consistent,
  ensuring that the hierarchical and custom partition modes follow mostly the
  same code paths and no longer diverge as much. Starts addressing issues
  around padding. Ensures the key and value can be included within the
  partition name for both hierarchical and custom partitioning modes. Adds
  unit tests.

* Padding strategy configuration.

* Update partition selection.

* Removes the default prefix. Ensures the source ignores indexes.

* Fix from review: treat all negatives as unset.

* Adds additional unit testing around config, remove from property names,
  changes KCQL properties parsing to support Map properties, adds checks to
  ensure both config methods cannot be used simultaneously, and addresses
  review comments.

* Fixes tests.

* Returning DeprecationConfigDefProcessor, fix from review.
---
 .../connect/S3CompressionTest.scala | 2 +-
 .../lenses/streamreactor/connect/S3Test.scala | 2 +-
 .../aws/s3/sink/S3AvroWriterManagerTest.scala | 24 +-
 .../aws/s3/sink/S3JsonWriterManagerTest.scala | 40 ++-
 .../s3/sink/S3ParquetWriterManagerTest.scala | 22 +-
 .../s3/sink/S3SinkTaskAvroEnvelopeTest.scala | 4 +-
 .../s3/sink/S3SinkTaskJsonEnvelopeTest.scala | 10 +-
 .../sink/S3SinkTaskParquetEnvelopeTest.scala | 2 +-
 .../connect/aws/s3/sink/S3SinkTaskTest.scala | 298 ++++++++++--------
 .../aws/s3/sink/S3WriterManagerTest.scala | 7 +-
 .../connect/aws/s3/source/BucketSetup.scala | 38 +++
 .../source/S3SourceTaskBucketRootTest.scala | 104 ++++++
 .../aws/s3/source/S3SourceTaskTest.scala | 21 +-
 .../aws/s3/storage/ListDirectoryTest.scala | 4 +
 .../aws/s3/config/FormatSelection.scala | 11 +-
 .../aws/s3/config/S3ConfigSettings.scala | 8 +-
 .../config/kcqlprops/S3PropsKeyEnum.scala | 10 +-
 .../DeprecationConfigDefProcessor.scala | 57 ++++
 .../connect/aws/s3/model/S3StoredFile.scala | 53 ----
 .../aws/s3/model/TopicPartitionOffset.scala | 5 +-
 .../connect/aws/s3/sink/S3WriterManager.scala | 69 ++--
 .../sink/config/PaddingStrategySettings.scala | 52 ---
 .../aws/s3/sink/config/PartitionDisplay.scala | 23 +-
 .../aws/s3/sink/config/PartitionField.scala | 32 +-
 .../s3/sink/config/PartitionSelection.scala | 39 ++-
 .../aws/s3/sink/config/S3SinkConfig.scala | 43 ++-
 .../aws/s3/sink/config/S3SinkConfigDef.scala | 4 +-
 .../config/kcqlprops/S3SinkPropsSchema.scala | 50 +++
 .../sink/config/padding/PaddingService.scala | 107 +++++++
 .../padding/PaddingStrategySettings.scala | 41 +++
 .../s3/sink/config/padding/PaddingType.scala | 39 +++
 .../connect/aws/s3/sink/naming/KeyNamer.scala | 46 +++
 .../aws/s3/sink/naming/S3FileNamer.scala | 47 +++
 .../S3KeyNamer.scala} | 185 +++-------
 .../aws/s3/sink/seek/IndexManager.scala | 13 +-
 .../config/PartitionSearcherOptions.scala | 15 +-
 .../aws/s3/source/config/ReadTextMode.scala | 4 +-
 .../aws/s3/source/config/S3SourceConfig.scala | 3 +-
 .../s3/source/config/S3SourceConfigDef.scala | 3 +-
 .../SourcePartitionSearcherSettings.scala | 2 +
 ...Schema.scala => S3SourcePropsSchema.scala} | 32 +-
 .../distribution/PartitionSearcher.scala | 2 +
 .../aws/s3/source/reader/ResultReader.scala | 11 +-
 .../aws/s3/storage/AwsS3DirectoryLister.scala | 46 ++-
 .../aws/s3/config/CommonConfigDefTest.scala | 24 +-
 .../DeprecationConfigDefProcessorTest.scala | 57
++++ .../aws/s3/model/PartitionDisplayTest.scala | 47 --- .../aws/s3/model/PartitionFieldTest.scala | 2 +- .../aws/s3/model/S3StoredFileTest.scala | 56 ---- .../aws/s3/sink/CommittedFileNameTest.scala | 94 ------ .../aws/s3/sink/PaddingStrategyTest.scala | 4 - .../s3/sink/config/LocalStagingAreaTest.scala | 14 +- .../s3/sink/config/PartitionDisplayTest.scala | 72 +++++ .../s3/sink/config/TestConfigDefBuilder.scala | 30 ++ .../config/padding/PaddingServiceTest.scala | 100 ++++++ .../padding/PaddingStrategySettingsTest.scala | 49 +++ .../aws/s3/sink/naming/S3FileNamerTest.scala | 42 +++ .../aws/s3/sink/naming/S3KeyNamerTest.scala | 108 +++++++ .../aws/s3/sink/seek/IndexManagerTest.scala | 9 +- .../ReadTextModeTestFormatSelection.scala | 8 +- .../source/config/S3SourceConfigTests.scala | 21 +- ...st.scala => S3SourcePropsSchemaTest.scala} | 8 +- .../reader/PartitionDiscoveryTest.scala | 12 +- .../s3/storage/AwsS3DirectoryListerTest.scala | 52 ++- .../config/kcqlprops/KcqlProperties.scala | 48 ++- .../config/kcqlprops/KcqlPropsSchema.scala | 3 +- .../config/kcqlprops/PropsSchema.scala | 6 + .../config/kcqlprops/KcqlPropertiesTest.scala | 95 ++++++ 68 files changed, 1762 insertions(+), 829 deletions(-) create mode 100644 kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskBucketRootTest.scala rename kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/{source => }/config/kcqlprops/S3PropsKeyEnum.scala (84%) create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessor.scala delete mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFile.scala delete mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PaddingStrategySettings.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/kcqlprops/S3SinkPropsSchema.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingService.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettings.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingType.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/KeyNamer.scala create mode 100644 kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamer.scala rename kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/{FileNamingStrategy.scala => naming/S3KeyNamer.scala} (53%) rename kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/{S3PropsSchema.scala => S3SourcePropsSchema.scala} (56%) create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessorTest.scala delete mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionDisplayTest.scala delete mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFileTest.scala delete mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/CommittedFileNameTest.scala create mode 100644 
kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplayTest.scala create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/TestConfigDefBuilder.scala create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingServiceTest.scala create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettingsTest.scala create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamerTest.scala create mode 100644 kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamerTest.scala rename kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/{S3PropsSchemaTest.scala => S3SourcePropsSchemaTest.scala} (88%) create mode 100644 kafka-connect-common/src/test/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropertiesTest.scala diff --git a/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3CompressionTest.scala b/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3CompressionTest.scala index e4c624524..e8e33d57d 100644 --- a/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3CompressionTest.scala +++ b/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3CompressionTest.scala @@ -117,7 +117,7 @@ class S3CompressionTest } }.asserting { file => - file.key() should be(s"$prefix/$topic/000000000000/000000000000.$format") + file.key() should be(s"$prefix/$topic/0/000000000000.$format") } } } diff --git a/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3Test.scala b/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3Test.scala index d0a3d584c..417f30c18 100644 --- a/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3Test.scala +++ b/kafka-connect-aws-s3/src/fun/scala/io/lenses/streamreactor/connect/S3Test.scala @@ -85,7 +85,7 @@ class S3Test assert(files.contents().size() == 1) } - readKeyToOrder(s3Client, bucketName, "myfiles/orders/000000000000/000000000000.json") + readKeyToOrder(s3Client, bucketName, "myfiles/orders/0/000000000000.json") }.asserting { key: Order => key should be(order) diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3AvroWriterManagerTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3AvroWriterManagerTest.scala index 3b1fe888f..c6b2b5071 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3AvroWriterManagerTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3AvroWriterManagerTest.scala @@ -28,10 +28,15 @@ import io.lenses.streamreactor.connect.aws.s3.model._ import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import io.lenses.streamreactor.connect.aws.s3.sink.commit.Count +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection.defaultPartitionSelection import io.lenses.streamreactor.connect.aws.s3.sink.config.LocalStagingArea import io.lenses.streamreactor.connect.aws.s3.sink.config.OffsetSeekerOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values import 
io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfig import io.lenses.streamreactor.connect.aws.s3.sink.config.SinkBucketOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService +import io.lenses.streamreactor.connect.aws.s3.sink.naming.OffsetS3FileNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.S3KeyNamer import io.lenses.streamreactor.connect.aws.s3.utils.ITSampleSchemaAndData._ import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest import org.apache.avro.generic.GenericRecord @@ -63,11 +68,22 @@ class S3AvroWriterManagerTest extends AnyFlatSpec with Matchers with S3ProxyCont SinkBucketOptions( TopicName.some, bucketAndPrefix, - commitPolicy = CommitPolicy(Count(2)), - formatSelection = AvroFormatSelection, - fileNamingStrategy = new HierarchicalS3FileNamingStrategy(AvroFormatSelection, NoOpPaddingStrategy), + commitPolicy = CommitPolicy(Count(2)), + formatSelection = AvroFormatSelection, + keyNamer = new S3KeyNamer( + AvroFormatSelection, + defaultPartitionSelection(Values), + new OffsetS3FileNamer( + identity[String], + AvroFormatSelection.extension, + ), + new PaddingService(Map[String, PaddingStrategy]( + "partition" -> NoOpPaddingStrategy, + "offset" -> LeftPadPaddingStrategy(12, 0), + )), + ), localStagingArea = LocalStagingArea(localRoot), - partitionSelection = None, + partitionSelection = defaultPartitionSelection(Values), dataStorage = DataStorageSettings.disabled, ), ), diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3JsonWriterManagerTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3JsonWriterManagerTest.scala index 72a1ce8b4..d05ae14bf 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3JsonWriterManagerTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3JsonWriterManagerTest.scala @@ -27,10 +27,15 @@ import io.lenses.streamreactor.connect.aws.s3.model._ import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import io.lenses.streamreactor.connect.aws.s3.sink.commit.Count +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection.defaultPartitionSelection import io.lenses.streamreactor.connect.aws.s3.sink.config.LocalStagingArea import io.lenses.streamreactor.connect.aws.s3.sink.config.OffsetSeekerOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values import io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfig import io.lenses.streamreactor.connect.aws.s3.sink.config.SinkBucketOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService +import io.lenses.streamreactor.connect.aws.s3.sink.naming.OffsetS3FileNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.S3KeyNamer import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest import org.apache.kafka.connect.data.Struct import org.scalatest.flatspec.AnyFlatSpec @@ -61,11 +66,22 @@ class S3JsonWriterManagerTest extends AnyFlatSpec with Matchers with S3ProxyCont SinkBucketOptions( TopicName.some, bucketAndPrefix, - commitPolicy = CommitPolicy(Count(1)), - formatSelection = JsonFormatSelection, - fileNamingStrategy = new HierarchicalS3FileNamingStrategy(JsonFormatSelection, NoOpPaddingStrategy), + commitPolicy = CommitPolicy(Count(1)), + formatSelection = JsonFormatSelection, 
+ keyNamer = new S3KeyNamer( + JsonFormatSelection, + defaultPartitionSelection(Values), + new OffsetS3FileNamer( + identity[String], + JsonFormatSelection.extension, + ), + new PaddingService(Map[String, PaddingStrategy]( + "partition" -> NoOpPaddingStrategy, + "offset" -> LeftPadPaddingStrategy(12, 0), + )), + ), localStagingArea = LocalStagingArea(localRoot), - partitionSelection = None, + partitionSelection = defaultPartitionSelection(Values), dataStorage = DataStorageSettings.disabled, ), // JsonS3Format ), @@ -106,10 +122,20 @@ class S3JsonWriterManagerTest extends AnyFlatSpec with Matchers with S3ProxyCont bucketAndPrefix, commitPolicy = CommitPolicy(Count(3)), formatSelection = JsonFormatSelection, - fileNamingStrategy = - new HierarchicalS3FileNamingStrategy(JsonFormatSelection, NoOpPaddingStrategy), // JsonS3Format + keyNamer = new S3KeyNamer( + AvroFormatSelection, + defaultPartitionSelection(Values), + new OffsetS3FileNamer( + identity[String], + JsonFormatSelection.extension, + ), + new PaddingService(Map[String, PaddingStrategy]( + "partition" -> NoOpPaddingStrategy, + "offset" -> LeftPadPaddingStrategy(12, 0), + )), + ), localStagingArea = LocalStagingArea(localRoot), - partitionSelection = None, + partitionSelection = defaultPartitionSelection(Values), dataStorage = DataStorageSettings.disabled, ), ), diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3ParquetWriterManagerTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3ParquetWriterManagerTest.scala index 8e7ee5afa..159f13b1f 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3ParquetWriterManagerTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3ParquetWriterManagerTest.scala @@ -28,10 +28,15 @@ import io.lenses.streamreactor.connect.aws.s3.model._ import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import io.lenses.streamreactor.connect.aws.s3.sink.commit.Count +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection.defaultPartitionSelection import io.lenses.streamreactor.connect.aws.s3.sink.config.LocalStagingArea import io.lenses.streamreactor.connect.aws.s3.sink.config.OffsetSeekerOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values import io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfig import io.lenses.streamreactor.connect.aws.s3.sink.config.SinkBucketOptions +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService +import io.lenses.streamreactor.connect.aws.s3.sink.naming.OffsetS3FileNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.S3KeyNamer import io.lenses.streamreactor.connect.aws.s3.utils.ITSampleSchemaAndData._ import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest import org.apache.avro.generic.GenericRecord @@ -64,11 +69,22 @@ class S3ParquetWriterManagerTest extends AnyFlatSpec with Matchers with S3ProxyC SinkBucketOptions( TopicName.some, bucketAndPrefix, - commitPolicy = CommitPolicy(Count(2)), - fileNamingStrategy = new HierarchicalS3FileNamingStrategy(ParquetFormatSelection, NoOpPaddingStrategy), + commitPolicy = CommitPolicy(Count(2)), + keyNamer = new S3KeyNamer( + ParquetFormatSelection, + defaultPartitionSelection(Values), + new OffsetS3FileNamer( + identity[String], + ParquetFormatSelection.extension, + ), 
+ new PaddingService(Map[String, PaddingStrategy]( + "partition" -> NoOpPaddingStrategy, + "offset" -> LeftPadPaddingStrategy(12, 0), + )), + ), formatSelection = ParquetFormatSelection, localStagingArea = LocalStagingArea(localRoot), - partitionSelection = None, + partitionSelection = defaultPartitionSelection(Values), dataStorage = DataStorageSettings.disabled, ), ), diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskAvroEnvelopeTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskAvroEnvelopeTest.scala index ab17b4cdd..0133f6d2d 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskAvroEnvelopeTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskAvroEnvelopeTest.scala @@ -103,7 +103,7 @@ class S3SinkTaskAvroEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS AVRO WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS AVRO WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true, 'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -237,7 +237,7 @@ class S3SinkTaskAvroEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS AVRO WITH_FLUSH_INTERVAL = 1 WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS AVRO WITH_FLUSH_INTERVAL = 1 WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskJsonEnvelopeTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskJsonEnvelopeTest.scala index a243e0ca7..e444cb9c7 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskJsonEnvelopeTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskJsonEnvelopeTest.scala @@ -102,7 +102,7 @@ class S3SinkTaskJsonEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -173,7 +173,7 @@ class S3SinkTaskJsonEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -247,7 +247,7 @@ class S3SinkTaskJsonEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` 
WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -322,7 +322,7 @@ class S3SinkTaskJsonEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=false)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=false,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -390,7 +390,7 @@ class S3SinkTaskJsonEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskParquetEnvelopeTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskParquetEnvelopeTest.scala index 6d5a425fd..c4f23226c 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskParquetEnvelopeTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskParquetEnvelopeTest.scala @@ -84,7 +84,7 @@ class S3SinkTaskParquetEnvelopeTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `PARQUET` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true)", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `PARQUET` WITH_FLUSH_COUNT = 3 PROPERTIES('store.envelope'=true,'padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskTest.scala index 10cc4b747..d05aacaff 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3SinkTaskTest.scala @@ -16,6 +16,9 @@ package io.lenses.streamreactor.connect.aws.s3.sink +import cats.effect.unsafe.implicits.global +import cats.effect.IO +import cats.effect.Resource import cats.implicits._ import com.opencsv.CSVReader import com.typesafe.scalalogging.LazyLogging @@ -47,9 +50,9 @@ import org.scalatest.matchers.should.Matchers import java.io.File import java.io.StringReader import java.nio.file.Files +import java.time.format.DateTimeFormatter import java.time.LocalDate import java.time.ZoneOffset -import java.time.format.DateTimeFormatter import java.lang import java.util import scala.jdk.CollectionConverters.MapHasAsJava @@ -176,9 +179,8 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, 
"streamReactorBackups/myTopic/000000000001/").size should be(1) - - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.json") should be( + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000002.json") should be( """{"name":"sam","title":"mr","salary":100.43}{"name":"laura","title":"ms","salary":429.06}{"name":"tom","title":null,"salary":395.44}""", ) @@ -219,15 +221,15 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(3) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000000.json") should be( """{"name":"sam","title":"mr","salary":100.43}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000001.json") should be( """{"name":"laura","title":"ms","salary":429.06}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000002.json") should be( """{"name":"tom","title":null,"salary":395.44}""", ) @@ -255,9 +257,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000001.json") should be( """{"name":"sam","title":"mr","salary":100.43}{"name":"laura","title":"ms","salary":429.06}""", ) @@ -280,21 +282,17 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(2) - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.parquet").size should be >= 941L - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.parquet").size should be >= 954L + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(2) + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000000.parquet").size should be >= 941L + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet").size should be >= 954L var genericRecords = - parquetFormatReader.read(remoteFileAsBytes(BucketName, - "streamReactorBackups/myTopic/000000000001/000000000000.parquet", - )) + parquetFormatReader.read(remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000000.parquet")) genericRecords.size should be(1) checkRecord(genericRecords.head, "sam", "mr", 100.43) genericRecords = - parquetFormatReader.read(remoteFileAsBytes(BucketName, - "streamReactorBackups/myTopic/000000000001/000000000001.parquet", - )) + 
parquetFormatReader.read(remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet")) genericRecords.size should be(1) checkRecord(genericRecords.head, "laura", "ms", 429.06) @@ -332,15 +330,15 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(3) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000000.json") should be( """{"name":"sam","title":"mr","salary":100.43}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000001.json") should be( """{"name":"laura","title":"ms","salary":429.06}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000002.json") should be( """{"name":"tom","title":null,"salary":395.44}""", ) @@ -365,22 +363,22 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition("myTopic", 1)).asJava) task.stop() - val list = listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/") + val list = listBucketPath(BucketName, "streamReactorBackups/myTopic/1/") list.size should be(1) - list should contain("streamReactorBackups/myTopic/000000000001/000000000001.parquet") + list should contain("streamReactorBackups/myTopic/1/000000000001.parquet") val modificationDate = - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.parquet").lastModified + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet").lastModified task = createTask(context, props) task.open(Seq(new TopicPartition(TopicName, 1)).asJava) verify(context).offset(new TopicPartition("myTopic", 1), 1) task.put(records.asJava) - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) // file should not have been overwritten - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.parquet").lastModified should be( + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet").lastModified should be( modificationDate, ) @@ -393,10 +391,10 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest // only 1 "real" record so should leave it hanging again task.put(List(records(1), records(2)).asJava) - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) // file should not have been overwritten - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.parquet").lastModified should be( + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet").lastModified should be( modificationDate, ) @@ -414,10 +412,10 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest toSinkRecord(users(3), 3), ).asJava) - listBucketPath(BucketName, 
"streamReactorBackups/myTopic/000000000001/").size should be(2) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(2) // file should not have been overwritten - getMetadata(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.parquet").lastModified should be( + getMetadata(BucketName, "streamReactorBackups/myTopic/1/000000000001.parquet").lastModified should be( modificationDate, ) @@ -443,9 +441,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(3) - val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.parquet") + val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000000.parquet") val genericRecords = parquetFormatReader.read(bytes) genericRecords.size should be(1) @@ -470,9 +468,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(3) - val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.avro") + val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000000.avro") val genericRecords = avroFormatReader.read(bytes) genericRecords.size should be(1) @@ -524,12 +522,12 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(2) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(2) - val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.text") + val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000001.text") new String(file1Bytes) should be("Sausages\nMash\n") - val file2Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000003.text") + val file2Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000003.text") new String(file2Bytes) should be("Peas\nGravy\n") } @@ -555,9 +553,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(2) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(2) - val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.csv") + val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000001.csv") val file1Reader = new StringReader(new String(file1Bytes)) val file1CsvReader = new CSVReader(file1Reader) @@ -567,7 +565,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest file1CsvReader.readNext() should be(Array("laura", "ms", "429.06")) file1CsvReader.readNext() should be(null) - val file2Bytes = remoteFileAsBytes(BucketName, 
"streamReactorBackups/myTopic/000000000001/000000000003.csv") + val file2Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000003.csv") val file2Reader = new StringReader(new String(file2Bytes)) val file2CsvReader = new CSVReader(file2Reader) @@ -599,9 +597,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(2) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(2) - val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.csv") + val file1Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000001.csv") val file1Reader = new StringReader(new String(file1Bytes)) val file1CsvReader = new CSVReader(file1Reader) @@ -610,7 +608,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest file1CsvReader.readNext() should be(Array("laura", "ms", "429.06")) file1CsvReader.readNext() should be(null) - val file2Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000003.csv") + val file2Bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000003.csv") val file2Reader = new StringReader(new String(file2Bytes)) val file2CsvReader = new CSVReader(file2Reader) @@ -641,27 +639,27 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest remoteFileAsString( BucketName, - "streamReactorBackups/name=first/title=primary/salary=[missing]/myTopic(000000000001_000000000000).json", + "streamReactorBackups/name=first/title=primary/salary=[missing]/myTopic(1_000000000000).json", ) should be("""{"name":"first","title":"primary","salary":null}""") remoteFileAsString( BucketName, - "streamReactorBackups/name=second/title=secondary/salary=100.0/myTopic(000000000001_000000000001).json", + "streamReactorBackups/name=second/title=secondary/salary=100.0/myTopic(1_000000000001).json", ) should be("""{"name":"second","title":"secondary","salary":100.0}""") remoteFileAsString( BucketName, - "streamReactorBackups/name=third/title=primary/salary=100.0/myTopic(000000000001_000000000002).json", + "streamReactorBackups/name=third/title=primary/salary=100.0/myTopic(1_000000000002).json", ) should be("""{"name":"third","title":"primary","salary":100.0}""") remoteFileAsString( BucketName, - "streamReactorBackups/name=first/title=[missing]/salary=200.0/myTopic(000000000001_000000000003).json", + "streamReactorBackups/name=first/title=[missing]/salary=200.0/myTopic(1_000000000003).json", ) should be("""{"name":"first","title":null,"salary":200.0}""") remoteFileAsString( BucketName, - "streamReactorBackups/name=second/title=[missing]/salary=100.0/myTopic(000000000001_000000000004).json", + "streamReactorBackups/name=second/title=[missing]/salary=100.0/myTopic(1_000000000004).json", ) should be("""{"name":"second","title":null,"salary":100.0}""") remoteFileAsString( BucketName, - "streamReactorBackups/name=third/title=[missing]/salary=100.0/myTopic(000000000001_000000000005).json", + "streamReactorBackups/name=third/title=[missing]/salary=100.0/myTopic(1_000000000005).json", ) should be("""{"name":"third","title":null,"salary":100.0}""") } @@ -705,22 +703,22 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest listBucketPath(BucketName, "streamReactorBackups/").size should be(4) 
remoteFileAsString(BucketName, - "streamReactorBackups/name=first/title=primary/myTopic(000000000001_000000000002).json", + "streamReactorBackups/name=first/title=primary/myTopic(1_000000000002).json", ) should be( """{"name":"first","title":"primary","salary":null}{"name":"first","title":"primary","salary":100.0}""", ) remoteFileAsString(BucketName, - "streamReactorBackups/name=first/title=secondary/myTopic(000000000001_000000000001).json", + "streamReactorBackups/name=first/title=secondary/myTopic(1_000000000001).json", ) should be( """{"name":"first","title":"secondary","salary":100.0}""", ) remoteFileAsString(BucketName, - "streamReactorBackups/name=second/title=secondary/myTopic(000000000001_000000000005).json", + "streamReactorBackups/name=second/title=secondary/myTopic(1_000000000005).json", ) should be( """{"name":"second","title":"secondary","salary":200.0}{"name":"second","title":"secondary","salary":100.0}""", ) remoteFileAsString(BucketName, - "streamReactorBackups/name=second/title=primary/myTopic(000000000001_000000000004).json", + "streamReactorBackups/name=second/title=primary/myTopic(1_000000000004).json", ) should be( """{"name":"second","title":"primary","salary":100.0}""", ) @@ -747,33 +745,27 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest listBucketPath(BucketName, "streamReactorBackups/").size should be(6) remoteFileAsString(BucketName, - "streamReactorBackups/first/primary/[missing]/myTopic(000000000001_000000000000).json", + "streamReactorBackups/first/primary/[missing]/myTopic(1_000000000000).json", ) should be( """{"name":"first","title":"primary","salary":null}""", ) remoteFileAsString(BucketName, - "streamReactorBackups/second/secondary/100.0/myTopic(000000000001_000000000001).json", + "streamReactorBackups/second/secondary/100.0/myTopic(1_000000000001).json", ) should be( """{"name":"second","title":"secondary","salary":100.0}""", ) - remoteFileAsString(BucketName, - "streamReactorBackups/third/primary/100.0/myTopic(000000000001_000000000002).json", - ) should be( + remoteFileAsString(BucketName, "streamReactorBackups/third/primary/100.0/myTopic(1_000000000002).json") should be( """{"name":"third","title":"primary","salary":100.0}""", ) - remoteFileAsString(BucketName, - "streamReactorBackups/first/[missing]/200.0/myTopic(000000000001_000000000003).json", - ) should be( + remoteFileAsString(BucketName, "streamReactorBackups/first/[missing]/200.0/myTopic(1_000000000003).json") should be( """{"name":"first","title":null,"salary":200.0}""", ) remoteFileAsString(BucketName, - "streamReactorBackups/second/[missing]/100.0/myTopic(000000000001_000000000004).json", + "streamReactorBackups/second/[missing]/100.0/myTopic(1_000000000004).json", ) should be( """{"name":"second","title":null,"salary":100.0}""", ) - remoteFileAsString(BucketName, - "streamReactorBackups/third/[missing]/100.0/myTopic(000000000001_000000000005).json", - ) should be( + remoteFileAsString(BucketName, "streamReactorBackups/third/[missing]/100.0/myTopic(1_000000000005).json") should be( """{"name":"third","title":null,"salary":100.0}""", ) @@ -821,9 +813,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) - remoteFileAsBytes(BucketName, 
"streamReactorBackups/myTopic/000000000001/000000000000.bytes") should be(bytes) + remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000000.bytes") should be(bytes) } @@ -903,17 +895,17 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest listBucketPath(BucketName, "streamReactorBackups/").size should be(3) val file1CsvReader: CSVReader = - openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+44/region=8/myTopic(000000000001_000000000000).csv") + openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+44/region=8/myTopic(1_000000000000).csv") file1CsvReader.readNext() should be(Array("sam", "mr", "100.43")) file1CsvReader.readNext() should be(null) val file2CsvReader: CSVReader = - openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+49/region=5/myTopic(000000000001_000000000001).csv") + openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+49/region=5/myTopic(1_000000000001).csv") file2CsvReader.readNext() should be(Array("laura", "ms", "429.06")) file2CsvReader.readNext() should be(null) val file3CsvReader: CSVReader = - openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+49/region=5/myTopic(000000000001_000000000002).csv") + openCsvReaderToBucketFile("streamReactorBackups/phonePrefix=+49/region=5/myTopic(1_000000000002).csv") file3CsvReader.readNext() should be(Array("tom", "", "395.44")) file3CsvReader.readNext() should be(null) } @@ -939,12 +931,12 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest fileList.size should be(6) fileList should contain allOf ( - "streamReactorBackups/headerPartitionKey=0/name=first/myTopic(000000000001_000000000000).csv", - "streamReactorBackups/headerPartitionKey=1/name=second/myTopic(000000000001_000000000001).csv", - "streamReactorBackups/headerPartitionKey=0/name=third/myTopic(000000000001_000000000002).csv", - "streamReactorBackups/headerPartitionKey=1/name=first/myTopic(000000000001_000000000003).csv", - "streamReactorBackups/headerPartitionKey=0/name=second/myTopic(000000000001_000000000004).csv", - "streamReactorBackups/headerPartitionKey=1/name=third/myTopic(000000000001_000000000005).csv" + "streamReactorBackups/headerPartitionKey=0/name=first/myTopic(1_000000000000).csv", + "streamReactorBackups/headerPartitionKey=1/name=second/myTopic(1_000000000001).csv", + "streamReactorBackups/headerPartitionKey=0/name=third/myTopic(1_000000000002).csv", + "streamReactorBackups/headerPartitionKey=1/name=first/myTopic(1_000000000003).csv", + "streamReactorBackups/headerPartitionKey=0/name=second/myTopic(1_000000000004).csv", + "streamReactorBackups/headerPartitionKey=1/name=third/myTopic(1_000000000005).csv" ) } @@ -979,6 +971,30 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest } + "S3SinkTask" should "fail with message when deprecated properties are used" in { + Resource.make(IO(new S3SinkTask())) { sinkTask => + IO(sinkTask.stop()) + }.use { sinkTask => + IO { + val exMessage = intercept[IllegalArgumentException] { + sinkTask.start( + DefaultProps.combine( + Map( + "aws.access.key" -> "myAccessKey", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `CSV` WITH_FLUSH_COUNT = 1", + ), + ).asJava, + ) + }.getMessage + + exMessage should startWith("The following properties have been deprecated: `aws.access.key`") + exMessage should include("Change `aws.access.key` to `connect.s3.aws.access.key`") + + listBucketPath(BucketName, "streamReactorBackups/").size should 
be(0) + } + }.unsafeRunSync() + } + "S3SinkTask" should "support numeric header data types" in { val textRecords = List( @@ -1005,9 +1021,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest fileList.size should be(3) fileList should contain allOf ( - "streamReactorBackups/intheader=1/longheader=2/myTopic(000000000001_000000000000).csv", - "streamReactorBackups/intheader=2/longheader=2/myTopic(000000000001_000000000001).csv", - "streamReactorBackups/intheader=1/longheader=1/myTopic(000000000002_000000000002).csv", + "streamReactorBackups/intheader=1/longheader=2/myTopic(1_000000000000).csv", + "streamReactorBackups/intheader=2/longheader=2/myTopic(1_000000000001).csv", + "streamReactorBackups/intheader=1/longheader=1/myTopic(2_000000000002).csv", ) } @@ -1037,12 +1053,12 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest fileList.size should be(6) fileList should contain allOf ( - "streamReactorBackups/key=0/myTopic(000000000001_000000000000).csv", - "streamReactorBackups/key=1/myTopic(000000000001_000000000001).csv", - "streamReactorBackups/key=0/myTopic(000000000001_000000000002).csv", - "streamReactorBackups/key=1/myTopic(000000000001_000000000003).csv", - "streamReactorBackups/key=0/myTopic(000000000001_000000000004).csv", - "streamReactorBackups/key=1/myTopic(000000000001_000000000005).csv" + "streamReactorBackups/key=0/myTopic(1_000000000000).csv", + "streamReactorBackups/key=1/myTopic(1_000000000001).csv", + "streamReactorBackups/key=0/myTopic(1_000000000002).csv", + "streamReactorBackups/key=1/myTopic(1_000000000003).csv", + "streamReactorBackups/key=0/myTopic(1_000000000004).csv", + "streamReactorBackups/key=1/myTopic(1_000000000005).csv" ) } @@ -1072,12 +1088,12 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest fileList.size should be(6) fileList should contain allOf ( - "streamReactorBackups/0/myTopic(000000000001_000000000000).csv", - "streamReactorBackups/1/myTopic(000000000001_000000000001).csv", - "streamReactorBackups/0/myTopic(000000000001_000000000002).csv", - "streamReactorBackups/1/myTopic(000000000001_000000000003).csv", - "streamReactorBackups/0/myTopic(000000000001_000000000004).csv", - "streamReactorBackups/1/myTopic(000000000001_000000000005).csv" + "streamReactorBackups/0/myTopic(1_000000000000).csv", + "streamReactorBackups/1/myTopic(1_000000000001).csv", + "streamReactorBackups/0/myTopic(1_000000000002).csv", + "streamReactorBackups/1/myTopic(1_000000000003).csv", + "streamReactorBackups/0/myTopic(1_000000000004).csv", + "streamReactorBackups/1/myTopic(1_000000000005).csv" ) } @@ -1093,7 +1109,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _topic, _partition STOREAS `CSV` WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _topic, _partition STOREAS `CSV` WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1159,7 +1175,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key STOREAS `CSV` WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1", + 
"connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key STOREAS `CSV` WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1189,7 +1205,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.region, _key.phonePrefix STOREAS `CSV` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.region, _key.phonePrefix STOREAS `CSV` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1216,7 +1232,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.region, name WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.region, name WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1236,6 +1252,36 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest ) } + "S3SinkTask" should "write to root without a prefix" in { + + val task = new S3SinkTask() + + val props = DefaultProps + .combine( + Map( + "connect.s3.kcql" -> s"insert into $BucketName select * from $TopicName PARTITIONBY _key.region, name WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", + ), + ).asJava + + task.start(props) + task.open(Seq(new TopicPartition(TopicName, 1)).asJava) + task.put(keyPartitionedRecords.asJava) + task.close(Seq(new TopicPartition(TopicName, 1)).asJava) + task.stop() + + val fileList = listBucketPath(BucketName, "") + fileList.size should be(4) + + // the results do contain the index. The sink always looks for the index at the root of the bucket when offset synching. + // The source excludes the index files. 
+ fileList should contain allOf ( + ".indexes/s3SinkTaskBuildLocalTest/myTopic/00001/00000000000000000002", + "region=8/name=sam/myTopic(000000000001_000000000000).json", + "region=5/name=laura/myTopic(000000000001_000000000001).json", + "region=5/name=tom/myTopic(000000000001_000000000002).json" + ) + } + /** * This should write partition 1 but not partition 0 */ @@ -1256,7 +1302,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( - Map("connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 2"), + Map( + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 2 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", + ), ).asJava task.start(props) @@ -1302,7 +1350,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY jedi WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY jedi WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1341,7 +1389,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( - Map("connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1"), + Map( + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", + ), ).asJava task.start(props) @@ -1381,7 +1431,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1422,7 +1472,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1475,7 +1525,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1532,7 +1582,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName 
select * from $TopicName STOREAS `JSON` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1579,7 +1629,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `AVRO` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1674,7 +1724,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _value.user.name WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _value.user.name WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1772,7 +1822,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.favourites.band, _key.`cost.centre.id` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _key.favourites.band, _key.`cost.centre.id` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1844,7 +1894,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _header.header1.user.name WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _header.header1.user.name WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -1872,7 +1922,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", "connect.s3.write.mode" -> "BuildLocal", ), ).asJava @@ -1914,7 +1964,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest Map( "name" -> "s3SinkTaskBuildLocalTest", "connect.s3.local.tmp.directory" -> tempDir.toString, - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", "connect.s3.write.mode" -> "BuildLocal", ), ).asJava @@ -1959,15 +2009,15 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, 
"streamReactorBackups/myTopic/1/").size should be(3) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000000.json") should be( """{"name":"sam","title":"mr","salary":100.43}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000001.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000001.json") should be( """{"name":"laura","title":"ms","salary":429.06}""", ) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000002.json") should be( """{"name":"tom","title":null,"salary":395.44}""", ) @@ -1992,9 +2042,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) - val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.avro") + val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000002.avro") val genericRecords1 = avroFormatReader.read(bytes) genericRecords1.size should be(3) @@ -2038,9 +2088,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(1) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(1) - remoteFileAsString(BucketName, "streamReactorBackups/myTopic/000000000001/000000000002.json") should be( + remoteFileAsString(BucketName, "streamReactorBackups/myTopic/1/000000000002.json") should be( """{"name":"sam","title":"mr","salary":100.43}{"name":"laura","title":"ms","salary":429.06}{"name":"tom","title":null,"salary":395.44}""", ) @@ -2081,9 +2131,9 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest task.close(Seq(new TopicPartition(TopicName, 1)).asJava) task.stop() - listBucketPath(BucketName, "streamReactorBackups/myTopic/000000000001/").size should be(3) + listBucketPath(BucketName, "streamReactorBackups/myTopic/1/").size should be(3) - val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/000000000001/000000000000.avro") + val bytes = remoteFileAsBytes(BucketName, "streamReactorBackups/myTopic/1/000000000000.avro") val genericRecords1 = avroFormatReader.read(bytes) genericRecords1.size should be(1) @@ -2101,7 +2151,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `json` WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName STOREAS `json` WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ERROR_POLICY -> "RETRY", ERROR_RETRY_INTERVAL -> "10", HTTP_NBR_OF_RETRIES -> "5", @@ -2158,7 +2208,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from `*` 
WITH_FLUSH_COUNT = 3", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from `*` WITH_FLUSH_COUNT = 3 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -2206,7 +2256,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _date.uuuu,_date.LL,_date.dd WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName PARTITIONBY _date.uuuu,_date.LL,_date.dd WITHPARTITIONER=Values WITH_FLUSH_COUNT = 1 PROPERTIES('padding.length.partition'='12', 'padding.length.offset'='12')", ), ).asJava @@ -2230,7 +2280,7 @@ class S3SinkTaskTest extends AnyFlatSpec with Matchers with S3ProxyContainerTest val props = DefaultProps .combine( Map( - "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 3", + "connect.s3.kcql" -> s"insert into $BucketName:$PrefixName select * from $TopicName WITH_FLUSH_COUNT = 3)", "connect.s3.padding.strategy" -> "NoOp", "connect.s3.padding.length" -> "10", ), diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManagerTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManagerTest.scala index 1d8c919d1..6c5d1ae3e 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManagerTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManagerTest.scala @@ -2,7 +2,6 @@ package io.lenses.streamreactor.connect.aws.s3.sink import cats.implicits.catsSyntaxEitherId import io.lenses.streamreactor.connect.aws.s3.config.ConnectorTaskId -import io.lenses.streamreactor.connect.aws.s3.config.CsvFormatSelection import io.lenses.streamreactor.connect.aws.s3.formats.writer.S3FormatWriter import io.lenses.streamreactor.connect.aws.s3.model.Topic import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location @@ -10,6 +9,7 @@ import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import io.lenses.streamreactor.connect.aws.s3.sink.commit.Count import io.lenses.streamreactor.connect.aws.s3.sink.commit.FileSize import io.lenses.streamreactor.connect.aws.s3.sink.commit.Interval +import io.lenses.streamreactor.connect.aws.s3.sink.naming.S3KeyNamer import io.lenses.streamreactor.connect.aws.s3.sink.seek.IndexManager import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest import org.apache.kafka.clients.consumer.OffsetAndMetadata @@ -25,12 +25,13 @@ class S3WriterManagerTest extends AnyFlatSpec with Matchers with S3ProxyContaine private val topicPartition = Topic("topic").withPartition(10) + private val s3KeyNamer = mock[S3KeyNamer] "S3WriterManager" should "return empty map when no offset or metadata writers can be found" in { val wm = new S3WriterManager( commitPolicyFn = _ => CommitPolicy(FileSize(5L), Interval(5.seconds), Count(5L)).asRight, bucketAndPrefixFn = _ => S3Location("bucketAndPath:location").asRight, - fileNamingStrategyFn = - _ => new HierarchicalS3FileNamingStrategy(CsvFormatSelection(Set.empty), NoOpPaddingStrategy).asRight, + keyNamerFn = + _ => s3KeyNamer.asRight, stagingFilenameFn = (_, _) => new File("blah.csv").asRight, finalFilenameFn = (_, _, _) => mock[S3Location].asRight, formatWriterFn = (_, _) => 
mock[S3FormatWriter].asRight, diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/BucketSetup.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/BucketSetup.scala index 4c6ee771c..abf78d38d 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/BucketSetup.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/BucketSetup.scala @@ -44,6 +44,44 @@ class BucketSetup(implicit storageInterface: StorageInterface) extends Matchers ) should be(Right(true)) } + def writeDataToBucket( + bucketName: String, + pathName: String, + ): Unit = { + storageInterface.writeStringToFile( + bucketName, + pathName, + "someData", + ) + () + } + + def setUpRootBucketData(bucketName: String, format: Format, formatOption: Option[FormatOptions]): Unit = + 1 to 5 foreach { + fileNum => + copyResourceToBucket( + s"/${format.entryName.toLowerCase}${generateFormatString(formatOption)}/$fileNum.${format.entryName.toLowerCase}", + bucketName, + s"0/${fileNum * 200 - 1}.${format.entryName.toLowerCase}", + ) + + storageInterface.pathExists( + bucketName, + s"0/${fileNum * 200 - 1}.${format.entryName.toLowerCase}", + ) should be(Right(true)) + + copyResourceToBucket( + s"/${format.entryName.toLowerCase}${generateFormatString(formatOption)}/$fileNum.${format.entryName.toLowerCase}", + bucketName, + s"0/${fileNum * 200 - 1}.${format.entryName.toLowerCase}", + ) + + // not really a real index file but anything that has .indexes in the name should be ignored + writeDataToBucket( + bucketName, + s".indexes/00001/00000000000000000002", + ) + } def totalFileLengthBytes(format: Format): Int = { 1 to 5 map { fileNum: Int => diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskBucketRootTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskBucketRootTest.scala new file mode 100644 index 000000000..db245e568 --- /dev/null +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskBucketRootTest.scala @@ -0,0 +1,104 @@ +package io.lenses.streamreactor.connect.aws.s3.source + +import cats.implicits._ +import io.lenses.streamreactor.connect.aws.s3.config.AuthMode +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ +import io.lenses.streamreactor.connect.aws.s3.source.S3SourceTaskTest.formats +import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest +import org.scalatest.EitherValues +import org.scalatest.concurrent.Eventually.eventually +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks._ +import software.amazon.awssdk.services.s3.model.CreateBucketRequest + +import scala.jdk.CollectionConverters.ListHasAsScala +import scala.jdk.CollectionConverters.MapHasAsJava +import scala.util.Try +class S3SourceTaskBucketRootTest extends S3ProxyContainerTest with AnyFlatSpecLike with Matchers with EitherValues { + + def DefaultProps: Map[String, String] = Map( + AWS_ACCESS_KEY -> Identity, + AWS_SECRET_KEY -> Credential, + AWS_REGION -> "eu-west-1", + AUTH_MODE -> AuthMode.Credentials.toString, + CUSTOM_ENDPOINT -> uri(), + ENABLE_VIRTUAL_HOST_BUCKETS -> "true", + TASK_INDEX -> "0:1", + "name" -> "s3-source", + SOURCE_PARTITION_SEARCH_INTERVAL_MILLIS -> "1000", + ) + + private val TopicName = "myTopic" + + override def 
cleanUpEnabled: Boolean = false + + "task" should "read files from root of bucket" in { + forAll(formats) { + (format, formatExtension, _) => + withClue(s"Format:$format") { + val bucketSetup = new BucketSetup()(storageInterface) + val bucketName = (BucketName + format.entryName + formatExtension.map(_.entryName).getOrElse("")).toLowerCase + createBucket(bucketName) should be(Right(())) + bucketSetup.setUpRootBucketData( + bucketName, + format, + formatExtension, + ) + val task = new S3SourceTask() + + val props = DefaultProps + .combine( + Map( + KCQL_CONFIG -> s"insert into $TopicName select * from $bucketName STOREAS `${format.entryName}${formatExtension.fold("")("_" + _)}` LIMIT 190", + ), + ).asJava + + task.start(props) + + withCleanup(task.stop()) { + val sourceRecords1 = eventually { + val records = task.poll() + records.size() shouldBe 190 + records + } + + val sourceRecords2 = task.poll() + val sourceRecords3 = task.poll() + val sourceRecords4 = task.poll() + val sourceRecords5 = task.poll() + val sourceRecords6 = task.poll() + val sourceRecords7 = task.poll() + + task.stop() + + sourceRecords2 should have size 190 + sourceRecords3 should have size 190 + sourceRecords4 should have size 190 + sourceRecords5 should have size 190 + sourceRecords6 should have size 50 + sourceRecords7 should have size 0 + + sourceRecords1.asScala + .concat(sourceRecords2.asScala) + .concat(sourceRecords3.asScala) + .concat(sourceRecords4.asScala) + .concat(sourceRecords5.asScala) + .concat(sourceRecords6.asScala) + .toSet should have size 1000 + } + } + } + + def withCleanup[T](cleanup: => Unit)(fn: => T): Unit = + try { + fn + () + } finally { + cleanup + } + } + + private def createBucket(bucketName: String) = + Try(s3Client.createBucket(CreateBucketRequest.builder().bucket(bucketName).build())).toEither.map(_ => ()) +} diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskTest.scala index 67382416b..34dbe8a53 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/source/S3SourceTaskTest.scala @@ -10,6 +10,7 @@ import io.lenses.streamreactor.connect.aws.s3.config.Format import io.lenses.streamreactor.connect.aws.s3.config.FormatOptions import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location +import io.lenses.streamreactor.connect.aws.s3.source.S3SourceTaskTest.formats import io.lenses.streamreactor.connect.aws.s3.storage.AwsS3DirectoryLister import io.lenses.streamreactor.connect.aws.s3.storage.DirectoryFindCompletionConfig import io.lenses.streamreactor.connect.aws.s3.storage.DirectoryFindResults @@ -29,7 +30,17 @@ import java.util import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.jdk.CollectionConverters.ListHasAsScala import scala.jdk.CollectionConverters.MapHasAsJava +object S3SourceTaskTest { + val formats = Table( + ("format", "formatOptionOption", "dirName"), + (Format.Avro, None, "avro"), + (Format.Json, None, "json"), + (Format.Parquet, None, "parquet"), + (Format.Csv, Some(FormatOptions.WithHeaders), "csvheaders"), + (Format.Csv, None, "csvnoheaders"), + ) +} class S3SourceTaskTest extends AnyFlatSpec with Matchers @@ -57,15 +68,6 @@ class S3SourceTaskTest 
SOURCE_PARTITION_SEARCH_RECURSE_LEVELS -> "0", ) - private val formats = Table( - ("format", "formatOptionOption", "dirName"), - (Format.Avro, None, "avro"), - (Format.Json, None, "json"), - (Format.Parquet, None, "parquet"), - (Format.Csv, Some(FormatOptions.WithHeaders), "csvheaders"), - (Format.Csv, None, "csvnoheaders"), - ) - "blobstore get input stream" should "reveal availability" in { val inputStream = @@ -87,6 +89,7 @@ class S3SourceTaskTest root, DirectoryFindCompletionConfig(0), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, ConnectorTaskId("name", 1, 1), ) diff --git a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/storage/ListDirectoryTest.scala b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/storage/ListDirectoryTest.scala index 191ccd6b8..138e13d16 100644 --- a/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/storage/ListDirectoryTest.scala +++ b/kafka-connect-aws-s3/src/it/scala/io/lenses/streamreactor/connect/aws/s3/storage/ListDirectoryTest.scala @@ -45,6 +45,7 @@ class ListDirectoryTest extends AnyFlatSpec with Matchers with S3ProxyContainerT topicRoot, DirectoryFindCompletionConfig(0), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, connectorTaskId, ).unsafeRunSync() @@ -65,6 +66,7 @@ class ListDirectoryTest extends AnyFlatSpec with Matchers with S3ProxyContainerT bucketRoot, DirectoryFindCompletionConfig(2), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, taskId, ).unsafeRunSync() @@ -84,6 +86,7 @@ class ListDirectoryTest extends AnyFlatSpec with Matchers with S3ProxyContainerT bucketRoot, DirectoryFindCompletionConfig(3), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, taskId, ).unsafeRunSync() @@ -102,6 +105,7 @@ class ListDirectoryTest extends AnyFlatSpec with Matchers with S3ProxyContainerT bucketRoot, DirectoryFindCompletionConfig(1), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, taskId, ).unsafeRunSync() diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/FormatSelection.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/FormatSelection.scala index 88d0d5072..ffff49b22 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/FormatSelection.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/FormatSelection.scala @@ -18,6 +18,8 @@ package io.lenses.streamreactor.connect.aws.s3.config import cats.implicits.catsSyntaxEitherId import com.datamountaineer.kcql.Kcql import io.lenses.streamreactor.connect.aws.s3.config.FormatOptions.WithHeaders +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum import io.lenses.streamreactor.connect.aws.s3.formats.reader._ import io.lenses.streamreactor.connect.aws.s3.formats.reader.converters._ import io.lenses.streamreactor.connect.aws.s3.model.CompressionCodecName._ @@ -25,7 +27,7 @@ import io.lenses.streamreactor.connect.aws.s3.model.CompressionCodecName import io.lenses.streamreactor.connect.aws.s3.model.Topic import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location import io.lenses.streamreactor.connect.aws.s3.source.config.ReadTextMode -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsSchema 
+import io.lenses.streamreactor.connect.config.kcqlprops.KcqlPropsSchema import java.io.InputStream import java.time.Instant @@ -55,14 +57,13 @@ sealed trait FormatSelection { } case object FormatSelection { - private val schema = S3PropsSchema.schema - def fromKcql( - kcql: Kcql, + kcql: Kcql, + kcqlPropsSchema: KcqlPropsSchema[S3PropsKeyEntry, S3PropsKeyEnum.type], ): Either[Throwable, FormatSelection] = Option(kcql.getStoredAs) match { case Some(storedAs) => - fromString(storedAs, () => ReadTextMode(schema.readProps(kcql.getProperties.asScala.toMap))) + fromString(storedAs, () => ReadTextMode(kcqlPropsSchema.readPropsMap(kcql.getProperties.asScala.toMap))) case None => Right(JsonFormatSelection) } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/S3ConfigSettings.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/S3ConfigSettings.scala index 6ce38de0f..50892267a 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/S3ConfigSettings.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/S3ConfigSettings.scala @@ -111,11 +111,11 @@ object S3ConfigSettings { val PADDING_STRATEGY = s"$CONNECTOR_PREFIX.padding.strategy" val PADDING_STRATEGY_DOC = "Configure in order to pad the partition and offset on the sink output files. Options are `LeftPad`, `RightPad` or `NoOp` (does not add padding). Defaults to `LeftPad`." - val PADDING_STRATEGY_DEFAULT = "LeftPad" + val PADDING_STRATEGY_DEFAULT = "" - val PADDING_LENGTH = s"$CONNECTOR_PREFIX.padding.length" - val PADDING_LENGTH_DOC = s"Length to pad the string up to if $PADDING_STRATEGY is set." - val PADDING_LENGTH_DEFAULT = 12 + val PADDING_LENGTH = s"$CONNECTOR_PREFIX.padding.length" + val PADDING_LENGTH_DOC = s"Length to pad the string up to if $PADDING_STRATEGY is set." + val PADDING_LENGTH_DEFAULT: Int = -1 // TASK_INDEX isn't exposed as a connector property. It is provided to the task from the connector in order // to distribute partitions between the different tasks. diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsKeyEnum.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/kcqlprops/S3PropsKeyEnum.scala similarity index 84% rename from kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsKeyEnum.scala rename to kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/kcqlprops/S3PropsKeyEnum.scala index 8e3835d1a..c825b2fbe 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsKeyEnum.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/kcqlprops/S3PropsKeyEnum.scala @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops +package io.lenses.streamreactor.connect.aws.s3.config.kcqlprops import enumeratum.Enum import enumeratum.EnumEntry @@ -44,4 +44,12 @@ object S3PropsKeyEnum extends Enum[S3PropsKeyEntry] { case object StoreEnvelopeHeaders extends S3PropsKeyEntry(DataStorageSettings.StoreHeadersKey) case object StoreEnvelopeValue extends S3PropsKeyEntry(DataStorageSettings.StoreValueKey) case object StoreEnvelopeMetadata extends S3PropsKeyEntry(DataStorageSettings.StoreMetadataKey) + + case object PaddingLength extends S3PropsKeyEntry("padding.length") + + case object PaddingCharacter extends S3PropsKeyEntry("padding.char") + + case object PaddingSelection extends S3PropsKeyEntry("padding.type") + + case object PartitionIncludeKeys extends S3PropsKeyEntry("partition.include.keys") } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessor.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessor.scala new file mode 100644 index 000000000..8eda68e4e --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessor.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.config.processors + +import com.typesafe.scalalogging.LazyLogging +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ + +import scala.collection.MapView +import scala.collection.immutable.ListMap + +/** + * For consistency of configuration, some properties are deprecated in the connector. To ensure users update their + * connector configuration, this will fail during connector initialisation advising of the errors and how to update the + * properties. This will be removed in a future release. + */ +class DeprecationConfigDefProcessor extends ConfigDefProcessor with LazyLogging { + + private val deprecatedProps: Map[String, String] = ListMap( + DEP_AUTH_MODE -> AUTH_MODE, + DEP_AWS_ACCESS_KEY -> AWS_ACCESS_KEY, + DEP_AWS_SECRET_KEY -> AWS_SECRET_KEY, + DEP_ENABLE_VIRTUAL_HOST_BUCKETS -> ENABLE_VIRTUAL_HOST_BUCKETS, + DEP_CUSTOM_ENDPOINT -> CUSTOM_ENDPOINT, + ) + + override def process(input: Map[String, Any]): Either[Exception, Map[String, Any]] = { + val inputKeys = input.keys.toSet + val failProps = deprecatedProps.view.filterKeys(inputKeys.contains) + Either.cond( + failProps.isEmpty, + input, + createError(failProps), + ) + + } + + private def createError(failProps: MapView[String, String]): IllegalArgumentException = { + val keyPrintOut = failProps.keys.map(k => s"`$k`").mkString(", ") + val detailedInstructions = failProps.map { case (k, v) => s"Change `$k` to `$v`." }.mkString(" ") + new IllegalArgumentException( + s"The following properties have been deprecated: $keyPrintOut. 
Please change to using the keys prefixed by `connect.s3`. $detailedInstructions", + ) + } +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFile.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFile.scala deleted file mode 100644 index 4c91bc951..000000000 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFile.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2017-2023 Lenses.io Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.lenses.streamreactor.connect.aws.s3.model - -import com.typesafe.scalalogging.LazyLogging -import io.lenses.streamreactor.connect.aws.s3.sink.CommittedFileName -import io.lenses.streamreactor.connect.aws.s3.sink.S3FileNamingStrategy - -object S3StoredFile extends LazyLogging { - def apply(path: String)(implicit fileNamingStrategy: S3FileNamingStrategy): Option[S3StoredFile] = - path match { - case originalValue @ CommittedFileName(topic, partition, end, format) - if format.toLowerCase == fileNamingStrategy.getFormat.extension => - Some(S3StoredFile( - originalValue, - TopicPartitionOffset(topic, partition, end), - )) - case _ => logger.debug(s"Invalid file type in S3 bucket - no match found for file $path") - None - } -} - -case class S3StoredFile( - path: String, - topicPartitionOffset: TopicPartitionOffset, -) - -object S3StoredFileSorter { - - def sort(inputFiles: List[S3StoredFile]): List[S3StoredFile] = - inputFiles.sortBy { - storedFile: S3StoredFile => - ( - storedFile.topicPartitionOffset.topic.value, - storedFile.topicPartitionOffset.partition, - storedFile.topicPartitionOffset.offset.value, - ) - } - -} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/TopicPartitionOffset.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/TopicPartitionOffset.scala index e3654a6d2..7b0c2ebb3 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/TopicPartitionOffset.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/model/TopicPartitionOffset.scala @@ -37,6 +37,9 @@ object TopicPartition { } case class TopicPartition(topic: Topic, partition: Int) { + + def atOffset(offset: Long): TopicPartitionOffset = withOffset(Offset(offset)) + def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset) def toKafka = new KafkaTopicPartition(topic.value, partition) @@ -45,5 +48,5 @@ case class TopicPartition(topic: Topic, partition: Int) { case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) { def toTopicPartition: TopicPartition = TopicPartition(topic, partition) - def toTopicPartitionOffsetTuple = (toTopicPartition, offset) + def toTopicPartitionOffsetTuple: (TopicPartition, Offset) = (toTopicPartition, offset) } diff --git 
a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManager.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManager.scala index ed510aac4..c87373e4e 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManager.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/S3WriterManager.scala @@ -28,6 +28,7 @@ import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionField import io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfig import io.lenses.streamreactor.connect.aws.s3.sink.config.SinkBucketOptions +import io.lenses.streamreactor.connect.aws.s3.sink.naming.KeyNamer import io.lenses.streamreactor.connect.aws.s3.sink.seek._ import io.lenses.streamreactor.connect.aws.s3.sink.transformers.TopicsTransformers import io.lenses.streamreactor.connect.aws.s3.sink.writer.S3Writer @@ -52,14 +53,14 @@ case class MapKey(topicPartition: TopicPartition, partitionValues: immutable.Map * sinks, since file handles cannot be safely shared without considerable overhead. */ class S3WriterManager( - commitPolicyFn: TopicPartition => Either[SinkError, CommitPolicy], - bucketAndPrefixFn: TopicPartition => Either[SinkError, S3Location], - fileNamingStrategyFn: TopicPartition => Either[SinkError, S3FileNamingStrategy], - stagingFilenameFn: (TopicPartition, Map[PartitionField, String]) => Either[SinkError, File], - finalFilenameFn: (TopicPartition, Map[PartitionField, String], Offset) => Either[SinkError, S3Location], - formatWriterFn: (TopicPartition, File) => Either[SinkError, S3FormatWriter], - indexManager: IndexManager, - transformerF: MessageDetail => Either[RuntimeException, MessageDetail], + commitPolicyFn: TopicPartition => Either[SinkError, CommitPolicy], + bucketAndPrefixFn: TopicPartition => Either[SinkError, S3Location], + keyNamerFn: TopicPartition => Either[SinkError, KeyNamer], + stagingFilenameFn: (TopicPartition, Map[PartitionField, String]) => Either[SinkError, File], + finalFilenameFn: (TopicPartition, Map[PartitionField, String], Offset) => Either[SinkError, S3Location], + formatWriterFn: (TopicPartition, File) => Either[SinkError, S3FormatWriter], + indexManager: IndexManager, + transformerF: MessageDetail => Either[RuntimeException, MessageDetail], )( implicit connectorTaskId: ConnectorTaskId, @@ -145,9 +146,9 @@ class S3WriterManager( ): Either[SinkError, Option[TopicPartitionOffset]] = { logger.debug(s"[{}] seekOffsetsForTopicPartition {}", connectorTaskId.show, topicPartition) for { - fileNamingStrategy <- fileNamingStrategyFn(topicPartition) - bucketAndPrefix <- bucketAndPrefixFn(topicPartition) - offset <- indexManager.seek(topicPartition, fileNamingStrategy, bucketAndPrefix.bucket) + keyNamer <- keyNamerFn(topicPartition) + bucketAndPrefix <- bucketAndPrefixFn(topicPartition) + offset <- indexManager.seek(topicPartition, bucketAndPrefix.bucket) } yield offset } @@ -202,15 +203,11 @@ class S3WriterManager( } private def processPartitionValues( - messageDetail: MessageDetail, - fileNamingStrategy: S3FileNamingStrategy, - topicPartition: TopicPartition, + messageDetail: MessageDetail, + keyNamer: KeyNamer, + topicPartition: TopicPartition, ): Either[SinkError, immutable.Map[PartitionField, String]] = - if (fileNamingStrategy.shouldProcessPartitionValues) { - fileNamingStrategy.processPartitionValues(messageDetail, topicPartition) - } 
else { - Map.empty[PartitionField, String].asRight - } + keyNamer.processPartitionValues(messageDetail, topicPartition) /** * Returns a writer that can write records for a particular topic and partition. @@ -218,11 +215,11 @@ class S3WriterManager( */ private def writer(topicPartition: TopicPartition, messageDetail: MessageDetail): Either[SinkError, S3Writer] = for { - bucketAndPrefix <- bucketAndPrefixFn(topicPartition) - fileNamingStrategy <- fileNamingStrategyFn(topicPartition) - partitionValues <- processPartitionValues(messageDetail, fileNamingStrategy, topicPartition) - key = MapKey(topicPartition, partitionValues) - maybeWriter = writers.get(key) + bucketAndPrefix <- bucketAndPrefixFn(topicPartition) + keyNamer <- keyNamerFn(topicPartition) + partitionValues <- processPartitionValues(messageDetail, keyNamer, topicPartition) + key = MapKey(topicPartition, partitionValues) + maybeWriter = writers.get(key) writer <- maybeWriter match { case Some(w) => w.asRight case None => @@ -320,9 +317,9 @@ object S3WriterManager extends LazyLogging { case None => fatalErrorTopicNotConfigured(topicPartition).asLeft } - val fileNamingStrategyFn: TopicPartition => Either[SinkError, S3FileNamingStrategy] = topicPartition => + val keyNamerFn: TopicPartition => Either[SinkError, KeyNamer] = topicPartition => bucketOptsForTopic(config, topicPartition.topic) match { - case Some(bucketOptions) => bucketOptions.fileNamingStrategy.asRight + case Some(bucketOptions) => bucketOptions.keyNamer.asRight case None => fatalErrorTopicNotConfigured(topicPartition).asLeft } @@ -331,11 +328,11 @@ object S3WriterManager extends LazyLogging { bucketOptsForTopic(config, topicPartition.topic) match { case Some(bucketOptions) => for { - fileNamingStrategy <- fileNamingStrategyFn(topicPartition) - stagingFilename <- fileNamingStrategy.stagingFile(bucketOptions.localStagingArea.dir, - bucketOptions.bucketAndPrefix, - topicPartition, - partitionValues, + keyNamer <- keyNamerFn(topicPartition) + stagingFilename <- keyNamer.stagingFile(bucketOptions.localStagingArea.dir, + bucketOptions.bucketAndPrefix, + topicPartition, + partitionValues, ) } yield stagingFilename case None => fatalErrorTopicNotConfigured(topicPartition).asLeft @@ -349,10 +346,10 @@ object S3WriterManager extends LazyLogging { bucketOptsForTopic(config, topicPartition.topic) match { case Some(bucketOptions) => for { - fileNamingStrategy <- fileNamingStrategyFn(topicPartition) - stagingFilename <- fileNamingStrategy.finalFilename(bucketOptions.bucketAndPrefix, - topicPartition.withOffset(offset), - partitionValues, + keyNamer <- keyNamerFn(topicPartition) + stagingFilename <- keyNamer.finalFilename(bucketOptions.bucketAndPrefix, + topicPartition.withOffset(offset), + partitionValues, ) } yield stagingFilename case None => fatalErrorTopicNotConfigured(topicPartition).asLeft @@ -378,7 +375,7 @@ object S3WriterManager extends LazyLogging { new S3WriterManager( commitPolicyFn, bucketAndPrefixFn, - fileNamingStrategyFn, + keyNamerFn, stagingFilenameFn, finalFilenameFn, formatWriterFn, diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PaddingStrategySettings.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PaddingStrategySettings.scala deleted file mode 100644 index 12ec6d5ad..000000000 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PaddingStrategySettings.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2017-2023 Lenses.io 
Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.lenses.streamreactor.connect.aws.s3.sink.config - -import com.datamountaineer.streamreactor.common.config.base.traits.BaseSettings -import enumeratum.Enum -import enumeratum.EnumEntry -import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_LENGTH -import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_STRATEGY -import io.lenses.streamreactor.connect.aws.s3.sink.LeftPadPaddingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.NoOpPaddingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.PaddingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.RightPadPaddingStrategy - -sealed trait PaddingStrategyOptions extends EnumEntry - -object PaddingStrategyOptions extends Enum[PaddingStrategyOptions] { - - val values = findValues - - case object LeftPad extends PaddingStrategyOptions - case object RightPad extends PaddingStrategyOptions - case object NoOp extends PaddingStrategyOptions - -} - -trait PaddingStrategySettings extends BaseSettings { - - private val paddingChar: Char = '0' - - def getPaddingStrategy(): PaddingStrategy = { - val paddingLength = getInt(PADDING_LENGTH) - PaddingStrategyOptions.withNameInsensitive(getString(PADDING_STRATEGY)) match { - case PaddingStrategyOptions.LeftPad => LeftPadPaddingStrategy(paddingLength, paddingChar) - case PaddingStrategyOptions.RightPad => RightPadPaddingStrategy(paddingLength, paddingChar) - case _ => NoOpPaddingStrategy - } - } -} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplay.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplay.scala index ba32f8f59..1d235d316 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplay.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplay.scala @@ -18,6 +18,10 @@ package io.lenses.streamreactor.connect.aws.s3.sink.config import com.datamountaineer.kcql.Kcql import enumeratum.Enum import enumeratum.EnumEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum.PartitionIncludeKeys +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties import scala.collection.immutable @@ -31,11 +35,20 @@ object PartitionDisplay extends Enum[PartitionDisplay] { case object Values extends PartitionDisplay - def apply(kcql: Kcql): PartitionDisplay = - Option(kcql.getWithPartitioner).fold[PartitionDisplay](KeysAndValues) { - PartitionDisplay - .withNameInsensitiveOption(_) - .getOrElse(KeysAndValues) + def apply( + kcql: Kcql, + props: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type], + default: PartitionDisplay, + ): PartitionDisplay = 
fromProps(props).orElse(fromKcql(kcql)).getOrElse(default) + + private def fromProps(props: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type]): Option[PartitionDisplay] = + props.getOptionalBoolean(PartitionIncludeKeys).map { + case true => KeysAndValues + case false => Values } + private def fromKcql( + kcql: Kcql, + ): Option[PartitionDisplay] = Option(kcql.getWithPartitioner).flatMap(PartitionDisplay.withNameInsensitiveOption) + } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionField.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionField.scala index 08747962f..6f6309ffb 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionField.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionField.scala @@ -22,7 +22,9 @@ import java.util.TimeZone import scala.jdk.CollectionConverters.IteratorHasAsScala sealed trait PartitionField { - def valuePrefixDisplay(): String + def name(): String + + def supportsPadding: Boolean = false } object PartitionField { @@ -43,9 +45,9 @@ object PartitionField { def apply(partitionSpecifier: PartitionSpecifier): PartitionField = partitionSpecifier match { - case PartitionSpecifier.Key => WholeKeyPartitionField() - case PartitionSpecifier.Topic => TopicPartitionField() - case PartitionSpecifier.Partition => PartitionPartitionField() + case PartitionSpecifier.Key => WholeKeyPartitionField + case PartitionSpecifier.Topic => TopicPartitionField + case PartitionSpecifier.Partition => PartitionPartitionField case PartitionSpecifier.Header => throw new IllegalArgumentException("cannot partition by Header partition field without path") case PartitionSpecifier.Value => @@ -70,37 +72,39 @@ object PartitionField { } case class HeaderPartitionField(path: PartitionNamePath) extends PartitionField { - override def valuePrefixDisplay(): String = path.toString + override def name(): String = path.toString path.validateProtectedCharacters() } case class KeyPartitionField(path: PartitionNamePath) extends PartitionField { - override def valuePrefixDisplay(): String = path.toString + override def name(): String = path.toString path.validateProtectedCharacters() } case class ValuePartitionField(path: PartitionNamePath) extends PartitionField { - override def valuePrefixDisplay(): String = path.toString + override def name(): String = path.toString path.validateProtectedCharacters() } -case class WholeKeyPartitionField() extends PartitionField { - override def valuePrefixDisplay(): String = "key" +case object WholeKeyPartitionField extends PartitionField { + override def name(): String = "key" } -case class TopicPartitionField() extends PartitionField { - override def valuePrefixDisplay(): String = "topic" +case object TopicPartitionField extends PartitionField { + override def name(): String = "topic" } -case class PartitionPartitionField() extends PartitionField { - override def valuePrefixDisplay(): String = "partition" +case object PartitionPartitionField extends PartitionField { + override def name(): String = "partition" + + override def supportsPadding: Boolean = true } case class DatePartitionField(format: String) extends PartitionField { - override def valuePrefixDisplay(): String = "date" + override def name(): String = "date" def formatter = DateTimeFormatter.ofPattern(format).withZone(TimeZone.getDefault.toZoneId) } diff --git 
a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionSelection.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionSelection.scala index a33cacf4d..b68199546 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionSelection.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionSelection.scala @@ -16,24 +16,41 @@ package io.lenses.streamreactor.connect.aws.s3.sink.config import com.datamountaineer.kcql.Kcql +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.KeysAndValues +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties case class PartitionSelection( + isCustom: Boolean, partitions: Seq[PartitionField], - partitionDisplay: PartitionDisplay = PartitionDisplay.Values, + partitionDisplay: PartitionDisplay, ) +object PartitionSelection { -case object PartitionSelection { + private val DefaultPartitionFields: Seq[PartitionField] = Seq(TopicPartitionField, PartitionPartitionField) - def apply(kcql: Kcql): Option[PartitionSelection] = { - val partitions: Seq[PartitionField] = PartitionField(kcql) - if (partitions.isEmpty) None - else - Some( - PartitionSelection( - partitions, - PartitionDisplay(kcql), - ), + def defaultPartitionSelection(partitionDisplay: PartitionDisplay): PartitionSelection = + PartitionSelection(isCustom = false, DefaultPartitionFields, partitionDisplay) + + def apply( + kcql: Kcql, + props: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type], + ): PartitionSelection = { + val fields: Seq[PartitionField] = PartitionField(kcql) + if (fields.isEmpty) { + defaultPartitionSelection( + PartitionDisplay(kcql, props, Values), + ) + } else { + PartitionSelection( + isCustom = true, + fields, + PartitionDisplay(kcql, props, KeysAndValues), ) + } + } } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfig.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfig.scala index 4d5384d8e..872588061 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfig.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfig.scala @@ -17,13 +17,23 @@ package io.lenses.streamreactor.connect.aws.s3.sink.config import cats.syntax.all._ import com.datamountaineer.kcql.Kcql import com.typesafe.scalalogging.LazyLogging +import io.lenses.streamreactor.connect.aws.s3.config.ConnectorTaskId +import io.lenses.streamreactor.connect.aws.s3.config.DataStorageSettings +import io.lenses.streamreactor.connect.aws.s3.config.FormatSelection +import io.lenses.streamreactor.connect.aws.s3.config.S3Config import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.SEEK_MAX_INDEX_FILES import io.lenses.streamreactor.connect.aws.s3.config._ import io.lenses.streamreactor.connect.aws.s3.model.CompressionCodec import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location -import io.lenses.streamreactor.connect.aws.s3.sink._ import io.lenses.streamreactor.connect.aws.s3.sink.commit.CommitPolicy import 
io.lenses.streamreactor.connect.aws.s3.sink.commit.Count +import io.lenses.streamreactor.connect.aws.s3.sink.config.kcqlprops.S3SinkProps +import io.lenses.streamreactor.connect.aws.s3.sink.config.kcqlprops.S3SinkPropsSchema +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService +import io.lenses.streamreactor.connect.aws.s3.sink.naming.OffsetS3FileNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.KeyNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.TopicPartitionOffsetS3FileNamer +import io.lenses.streamreactor.connect.aws.s3.sink.naming.S3KeyNamer import java.util import scala.jdk.CollectionConverters._ @@ -77,13 +87,24 @@ object SinkBucketOptions extends LazyLogging { ): Either[Throwable, Seq[SinkBucketOptions]] = config.getKCQL.map { kcql: Kcql => for { - formatSelection <- FormatSelection.fromKcql(kcql) - partitionSelection = PartitionSelection(kcql) - namingStrategy = partitionSelection match { - case Some(partSel) => - new PartitionedS3FileNamingStrategy(formatSelection, config.getPaddingStrategy(), partSel) - case None => new HierarchicalS3FileNamingStrategy(formatSelection, config.getPaddingStrategy()) + formatSelection <- FormatSelection.fromKcql(kcql, S3SinkPropsSchema.schema) + sinkProps = S3SinkProps.fromKcql(kcql) + partitionSelection = PartitionSelection(kcql, sinkProps) + paddingService <- PaddingService.fromConfig(config, sinkProps) + + fileNamer = if (partitionSelection.isCustom) { + new TopicPartitionOffsetS3FileNamer( + paddingService.padderFor("partition"), + paddingService.padderFor("offset"), + formatSelection.extension, + ) + } else { + new OffsetS3FileNamer( + paddingService.padderFor("offset"), + formatSelection.extension, + ) } + keyNamer = S3KeyNamer(formatSelection, partitionSelection, fileNamer, paddingService) stagingArea <- LocalStagingArea(config) target <- S3Location.splitAndValidate(kcql.getTarget, allowSlash = false) storageSettings <- DataStorageSettings.from( @@ -97,7 +118,7 @@ object SinkBucketOptions extends LazyLogging { Option(kcql.getSource).filterNot(Set("*", "`*`").contains(_)), target, formatSelection = formatSelection, - fileNamingStrategy = namingStrategy, + keyNamer = keyNamer, partitionSelection = partitionSelection, commitPolicy = config.commitPolicy(kcql), localStagingArea = stagingArea, @@ -135,9 +156,9 @@ case class SinkBucketOptions( sourceTopic: Option[String], bucketAndPrefix: S3Location, formatSelection: FormatSelection, - fileNamingStrategy: S3FileNamingStrategy, - partitionSelection: Option[PartitionSelection] = None, - commitPolicy: CommitPolicy = CommitPolicy.Default, + keyNamer: KeyNamer, + partitionSelection: PartitionSelection, + commitPolicy: CommitPolicy = CommitPolicy.Default, localStagingArea: LocalStagingArea, dataStorage: DataStorageSettings, ) diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfigDef.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfigDef.scala index 70ee1e011..fd63c41a2 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfigDef.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/S3SinkConfigDef.scala @@ -20,6 +20,7 @@ import com.datamountaineer.streamreactor.common.config.base.traits._ import com.typesafe.scalalogging.LazyLogging import io.lenses.streamreactor.connect.aws.s3.config._ import 
io.lenses.streamreactor.connect.aws.s3.config.processors.ConfigDefProcessor +import io.lenses.streamreactor.connect.aws.s3.config.processors.DeprecationConfigDefProcessor import io.lenses.streamreactor.connect.aws.s3.config.processors.LowerCaseKeyConfigDefProcessor import io.lenses.streamreactor.connect.aws.s3.config.processors.YamlProfileProcessor import org.apache.kafka.common.config.ConfigDef @@ -29,6 +30,7 @@ import org.apache.kafka.common.config.ConfigDef.Type import java.util import scala.jdk.CollectionConverters._ import S3ConfigSettings._ +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingStrategySettings object S3SinkConfigDef { @@ -77,7 +79,7 @@ object S3SinkConfigDef { class S3SinkConfigDef() extends ConfigDef with LazyLogging { private val processorChain: List[ConfigDefProcessor] = - List(new LowerCaseKeyConfigDefProcessor, new YamlProfileProcessor) + List(new LowerCaseKeyConfigDefProcessor, new DeprecationConfigDefProcessor, new YamlProfileProcessor) override def parse(jProps: util.Map[_, _]): util.Map[String, AnyRef] = { val scalaProps: Map[Any, Any] = jProps.asScala.toMap diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/kcqlprops/S3SinkPropsSchema.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/kcqlprops/S3SinkPropsSchema.scala new file mode 100644 index 000000000..00a14fd25 --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/kcqlprops/S3SinkPropsSchema.scala @@ -0,0 +1,50 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.kcqlprops + +import com.datamountaineer.kcql.Kcql +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum._ +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingType +import io.lenses.streamreactor.connect.config.kcqlprops._ + +import scala.jdk.CollectionConverters.MapHasAsScala + +object S3SinkPropsSchema { + + private[sink] val keys = Map[S3PropsKeyEntry, PropsSchema]( + PaddingCharacter -> CharPropsSchema, + PaddingLength -> MapPropsSchema[String, Int](), + PaddingSelection -> EnumPropsSchema(PaddingType), + PartitionIncludeKeys -> BooleanPropsSchema, + StoreEnvelope -> BooleanPropsSchema, + StoreEnvelopeKey -> BooleanPropsSchema, + StoreEnvelopeHeaders -> BooleanPropsSchema, + StoreEnvelopeValue -> BooleanPropsSchema, + StoreEnvelopeMetadata -> BooleanPropsSchema, + ) + + private[sink] val schema: KcqlPropsSchema[S3PropsKeyEntry, S3PropsKeyEnum.type] = + KcqlPropsSchema(S3PropsKeyEnum, keys) + +} + +object S3SinkProps { + private[sink] def fromKcql(kcql: Kcql): KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type] = + S3SinkPropsSchema.schema.readPropsMap(kcql.getProperties.asScala.toMap) + +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingService.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingService.scala new file mode 100644 index 000000000..b86dcbcc7 --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingService.scala @@ -0,0 +1,107 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.padding + +import cats.implicits.catsSyntaxEitherId +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum.PaddingCharacter +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum.PaddingLength +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum.PaddingSelection +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.sink.NoOpPaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.PaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfigDefBuilder +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingType.LeftPad +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties.stringToInt +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties.stringToString + +object PaddingService { + + val DefaultPaddingStrategy: PaddingType = LeftPad + val DefaultPadLength: Int = 12 + val DefaultPadChar: Char = '0' + val DefaultPadFields: Set[String] = Set("offset") + + private trait PaddingConfigDetector[C] { + + def configApplied(config: C): Boolean + + def processConfig(config: C): PaddingService + + } + + private object ConfigDefPaddingConfigDetector extends PaddingConfigDetector[S3SinkConfigDefBuilder] { + override def configApplied(config: S3SinkConfigDefBuilder): Boolean = config.getPaddingStrategy.nonEmpty + + override def processConfig(config: S3SinkConfigDefBuilder): PaddingService = + config.getPaddingStrategy.map(ps => new PaddingService(Map("offset" -> ps))).get + } + + private object KcqlPropsPaddingConfigDetector + extends PaddingConfigDetector[KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type]] { + override def configApplied(config: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type]): Boolean = + config.containsKeyStartingWith("padding.") + + override def processConfig(config: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type]): PaddingService = + fromDefaults( + config.getOptionalChar(PaddingCharacter), + config.getOptionalMap[String, Int](PaddingLength, stringToString, stringToInt), + config.getEnumValue[PaddingType, PaddingType.type](PaddingType, PaddingSelection), + ) + + private def fromDefaults( + maybeChar: Option[Char], + maybeFields: Option[Map[String, Int]], + maybeType: Option[PaddingType], + ): PaddingService = { + val fields: Map[String, Int] = maybeFields.getOrElse(DefaultPadFields.map(f => f -> DefaultPadLength)).toMap + val pChar: Char = maybeChar.getOrElse(DefaultPadChar) + val pType: PaddingType = maybeType.getOrElse(DefaultPaddingStrategy) + val paddingStrategies = fields.map(f => f._1 -> pType.toPaddingStrategy(f._2, pChar)) + new PaddingService(paddingStrategies) + } + + } + def fromConfig( + confDef: S3SinkConfigDefBuilder, + props: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type], + ): Either[Throwable, PaddingService] = { + val cdConf = ConfigDefPaddingConfigDetector.configApplied(confDef) + val kpConf = KcqlPropsPaddingConfigDetector.configApplied(props) + (cdConf, kpConf) match { + case (true, true) => + new IllegalStateException( + "Unable to process both padding Kafka Connect config properties and KCQL properties. Please use one or the other. 
We recommend the KCQL properties, which allow more granular configuration.", ).asLeft + case (true, false) => ConfigDefPaddingConfigDetector.processConfig(confDef).asRight + case _ => KcqlPropsPaddingConfigDetector.processConfig(props).asRight + } + + } + +} + +class PaddingService( + fields: Map[String, PaddingStrategy], +) { + + def padderFor(field: String): PaddingStrategy = + fields.get(field) match { + case Some(fieldPaddingStrategy) => fieldPaddingStrategy + case None => NoOpPaddingStrategy + } +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettings.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettings.scala new file mode 100644 index 000000000..00fceb81a --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettings.scala @@ -0,0 +1,41 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.padding + +import com.datamountaineer.streamreactor.common.config.base.traits.BaseSettings +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_LENGTH +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_STRATEGY +import io.lenses.streamreactor.connect.aws.s3.sink.PaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService.DefaultPadChar +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService.DefaultPadLength + +/** + * Retrieves padding settings from the Kafka Connect configuration properties + */ +trait PaddingStrategySettings extends BaseSettings { + + def getPaddingStrategy: Option[PaddingStrategy] = { + val paddingLength = Option(getInt(PADDING_LENGTH)).filterNot(_ < 0).map(_.toInt) + for { + paddingType <- Option(getString(PADDING_STRATEGY)).filterNot(_ == "").flatMap( + PaddingType.withNameInsensitiveOption, + ) + } yield paddingType.toPaddingStrategy( + paddingLength.getOrElse(DefaultPadLength), + DefaultPadChar, + ) + } +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingType.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingType.scala new file mode 100644 index 000000000..bcee2dd13 --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingType.scala @@ -0,0 +1,39 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.padding + +import enumeratum._ +import io.lenses.streamreactor.connect.aws.s3.sink._ + +sealed trait PaddingType extends EnumEntry { + def toPaddingStrategy(length: Int, char: Char): PaddingStrategy +} + +object PaddingType extends Enum[PaddingType] { + + val values = findValues + + case object LeftPad extends PaddingType { + override def toPaddingStrategy(length: Int, char: Char): PaddingStrategy = LeftPadPaddingStrategy(length, char) + } + case object RightPad extends PaddingType { + override def toPaddingStrategy(length: Int, char: Char): PaddingStrategy = RightPadPaddingStrategy(length, char) + } + case object NoOp extends PaddingType { + override def toPaddingStrategy(length: Int, char: Char): PaddingStrategy = NoOpPaddingStrategy + } + +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/KeyNamer.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/KeyNamer.scala new file mode 100644 index 000000000..a373ac92e --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/KeyNamer.scala @@ -0,0 +1,46 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.naming +import io.lenses.streamreactor.connect.aws.s3.formats.writer.MessageDetail +import io.lenses.streamreactor.connect.aws.s3.model.TopicPartition +import io.lenses.streamreactor.connect.aws.s3.model.TopicPartitionOffset +import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location +import io.lenses.streamreactor.connect.aws.s3.sink.FatalS3SinkError +import io.lenses.streamreactor.connect.aws.s3.sink.SinkError +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionField + +import java.io.File + +trait KeyNamer { + + def stagingFile( + stagingDirectory: File, + bucketAndPrefix: S3Location, + topicPartition: TopicPartition, + partitionValues: Map[PartitionField, String], + ): Either[FatalS3SinkError, File] + + def finalFilename( + bucketAndPrefix: S3Location, + topicPartitionOffset: TopicPartitionOffset, + partitionValues: Map[PartitionField, String], + ): Either[FatalS3SinkError, S3Location] + + def processPartitionValues( + messageDetail: MessageDetail, + topicPartition: TopicPartition, + ): Either[SinkError, Map[PartitionField, String]] +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamer.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamer.scala new file mode 100644 index 000000000..fac98fc11 --- /dev/null +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamer.scala @@ -0,0 +1,47 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.naming + +import io.lenses.streamreactor.connect.aws.s3.model.TopicPartitionOffset +import io.lenses.streamreactor.connect.aws.s3.sink.PaddingStrategy + +trait S3FileNamer { + def fileName( + topicPartitionOffset: TopicPartitionOffset, + ): String +} +class OffsetS3FileNamer( + offsetPaddingStrategy: PaddingStrategy, + extension: String, +) extends S3FileNamer { + def fileName( + topicPartitionOffset: TopicPartitionOffset, + ): String = + s"${offsetPaddingStrategy.padString(topicPartitionOffset.offset.value.toString)}.$extension" +} +class TopicPartitionOffsetS3FileNamer( + partitionPaddingStrategy: PaddingStrategy, + offsetPaddingStrategy: PaddingStrategy, + extension: String, +) extends S3FileNamer { + def fileName( + topicPartitionOffset: TopicPartitionOffset, + ): String = + s"${topicPartitionOffset.topic.value}(${partitionPaddingStrategy.padString( + topicPartitionOffset.partition.toString, + )}_${offsetPaddingStrategy.padString(topicPartitionOffset.offset.value.toString)}).$extension" + +} diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/FileNamingStrategy.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamer.scala similarity index 53% rename from kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/FileNamingStrategy.scala rename to kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamer.scala index 6db0ae171..e29ce4d49 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/FileNamingStrategy.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamer.scala @@ -13,20 +13,23 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.lenses.streamreactor.connect.aws.s3.sink +package io.lenses.streamreactor.connect.aws.s3.sink.naming import cats.implicits.catsSyntaxEitherId import cats.implicits.toTraverseOps -import io.lenses.streamreactor.connect.aws.s3.config.Format import io.lenses.streamreactor.connect.aws.s3.config.FormatSelection import io.lenses.streamreactor.connect.aws.s3.formats.writer.MessageDetail import io.lenses.streamreactor.connect.aws.s3.formats.writer.NullSinkData import io.lenses.streamreactor.connect.aws.s3.formats.writer.SinkData -import io.lenses.streamreactor.connect.aws.s3.model._ +import io.lenses.streamreactor.connect.aws.s3.model.TopicPartition +import io.lenses.streamreactor.connect.aws.s3.model.TopicPartitionOffset import io.lenses.streamreactor.connect.aws.s3.model.location.FileUtils.createFileAndParents import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location +import io.lenses.streamreactor.connect.aws.s3.sink.FatalS3SinkError +import io.lenses.streamreactor.connect.aws.s3.sink.SinkError import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.KeysAndValues import io.lenses.streamreactor.connect.aws.s3.sink.config._ +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService import io.lenses.streamreactor.connect.aws.s3.sink.extractors.ExtractorErrorAdaptor.adaptErrorResponse import io.lenses.streamreactor.connect.aws.s3.sink.extractors.SinkDataExtractor @@ -35,115 +38,35 @@ import java.util.UUID import scala.util.Failure import scala.util.Success import scala.util.Try -import scala.util.matching.Regex -trait S3FileNamingStrategy { - - private val DefaultPrefix = "streamreactor" - - def getFormat: FormatSelection - - def prefix(bucketAndPrefix: S3Location): String = bucketAndPrefix.prefix.getOrElse(DefaultPrefix) - - def stagingFile( - stagingDirectory: File, - bucketAndPrefix: S3Location, - topicPartition: TopicPartition, - partitionValues: Map[PartitionField, String], - ): Either[FatalS3SinkError, File] - - def finalFilename( - bucketAndPrefix: S3Location, - topicPartitionOffset: TopicPartitionOffset, - partitionValues: Map[PartitionField, String], - ): Either[FatalS3SinkError, S3Location] - - def shouldProcessPartitionValues: Boolean - - def processPartitionValues( - messageDetail: MessageDetail, - topicPartition: TopicPartition, - ): Either[SinkError, Map[PartitionField, String]] - - def topicPartitionPrefix(bucketAndPrefix: S3Location, topicPartition: TopicPartition): S3Location - - val committedFilenameRegex: Regex - -} - -/** * - * Stores the data in {{{$bucket:[$prefix]/$topic/$partition}}}, mirroring the Kafka topic partitions. 
- * @param formatSelection - * @param paddingStrategy - */ -class HierarchicalS3FileNamingStrategy(formatSelection: FormatSelection, paddingStrategy: PaddingStrategy) - extends S3FileNamingStrategy { - - import paddingStrategy._ - - override def stagingFile( - stagingDirectory: File, - bucketAndPrefix: S3Location, - topicPartition: TopicPartition, - partitionValues: Map[PartitionField, String], - ): Either[FatalS3SinkError, File] = - Try { - val uuid = UUID.randomUUID().toString - val file = stagingDirectory - .toPath - .resolve(prefix(bucketAndPrefix)) - .resolve(padString(topicPartition.topic.value)) - .resolve(s"${padString(topicPartition.partition.toString)}.${formatSelection.extension}") - .resolve(uuid) - .toFile - createFileAndParents(file) - file - }.toEither.left.map(ex => FatalS3SinkError(ex.getMessage, ex, topicPartition)) - - override def finalFilename( - bucketAndPrefix: S3Location, - topicPartitionOffset: TopicPartitionOffset, - partitionValues: Map[PartitionField, String], - ): Either[FatalS3SinkError, S3Location] = - Try( - bucketAndPrefix.withPath( - s"${prefix(bucketAndPrefix)}/${topicPartitionOffset.topic.value}/${padString( - topicPartitionOffset.partition.toString, - )}/${padString(topicPartitionOffset.offset.value.toString)}.${formatSelection.extension}", - ), - ).toEither.left.map(ex => FatalS3SinkError(ex.getMessage, topicPartitionOffset.toTopicPartition)) - - override def getFormat: FormatSelection = formatSelection - - override def shouldProcessPartitionValues: Boolean = false - - override def processPartitionValues( - messageDetail: MessageDetail, - topicPartition: TopicPartition, - ): Either[SinkError, Map[PartitionField, String]] = - FatalS3SinkError("This should never be called for this object", topicPartition).asLeft[Map[PartitionField, String]] - - override val committedFilenameRegex: Regex = s".+/(.+)/(\\d+)/(\\d+).(.+)".r - - override def topicPartitionPrefix( - bucketAndPrefix: S3Location, - topicPartition: TopicPartition, - ): S3Location = - bucketAndPrefix.withPath( - s"${prefix(bucketAndPrefix)}/${topicPartition.topic.value}/${padString(topicPartition.partition.toString)}/", +object S3KeyNamer { + + def apply( + formatSelection: FormatSelection, + partitionSelection: PartitionSelection, + fileNamer: S3FileNamer, + paddingService: PaddingService, + ): S3KeyNamer = + new S3KeyNamer( + formatSelection, + partitionSelection, + fileNamer, + paddingService, ) - } - -class PartitionedS3FileNamingStrategy( +class S3KeyNamer( formatSelection: FormatSelection, - paddingStrategy: PaddingStrategy, partitionSelection: PartitionSelection, -) extends S3FileNamingStrategy { + fileNamer: S3FileNamer, + paddingService: PaddingService, +) extends KeyNamer { - import paddingStrategy._ + private val DefaultPrefix = "" - override def getFormat: FormatSelection = formatSelection + private def addTrailingSlash(s: String): String = if (s.last == '/') s else s + '/' + + private def prefix(bucketAndPrefix: S3Location): String = + bucketAndPrefix.prefix.map(addTrailingSlash).getOrElse(DefaultPrefix) override def stagingFile( stagingDirectory: File, @@ -157,8 +80,6 @@ class PartitionedS3FileNamingStrategy( .toPath .resolve(prefix(bucketAndPrefix)) .resolve(buildPartitionPrefix(partitionValues)) - .resolve(topicPartition.topic.value) - .resolve(padString(topicPartition.partition.toString)) .resolve(formatSelection.extension) .resolve(uuid) .toFile @@ -169,12 +90,17 @@ class PartitionedS3FileNamingStrategy( private def buildPartitionPrefix(partitionValues: Map[PartitionField, 
String]): String = partitionSelection.partitions.map { (partition: PartitionField) => - partitionValuePrefix(partition) + partitionValues.getOrElse(partition, "[missing]") + partitionValues.get(partition) match { + case Some(partVal) if partition.supportsPadding => + partitionValuePrefix(partition) + paddingService.padderFor(partition.name()).padString(partVal) + case Some(partVal) => partitionValuePrefix(partition) + partVal + case None => "[missing]" + } } .mkString("/") private def partitionValuePrefix(partition: PartitionField): String = - if (partitionSelection.partitionDisplay == KeysAndValues) s"${partition.valuePrefixDisplay()}=" else "" + if (partitionSelection.partitionDisplay == KeysAndValues) s"${partition.name()}=" else "" override def finalFilename( bucketAndPrefix: S3Location, @@ -183,9 +109,7 @@ class PartitionedS3FileNamingStrategy( ): Either[FatalS3SinkError, S3Location] = Try( bucketAndPrefix.withPath( - s"${prefix(bucketAndPrefix)}/${buildPartitionPrefix(partitionValues)}/${topicPartitionOffset.topic.value}(${padString( - topicPartitionOffset.partition.toString, - )}_${padString(topicPartitionOffset.offset.value.toString)}).${formatSelection.extension}", + s"${prefix(bucketAndPrefix)}${buildPartitionPrefix(partitionValues)}/${fileNamer.fileName(topicPartitionOffset)}", ), ).toEither.left.map(ex => FatalS3SinkError(ex.getMessage, topicPartitionOffset.toTopicPartition)) @@ -219,11 +143,12 @@ class PartitionedS3FileNamingStrategy( getPartitionValueFromSinkData(_, name), ) - case partition @ WholeKeyPartitionField() => + case partition @ WholeKeyPartitionField => getPartitionByWholeKeyValue(messageDetail.key, topicPartition).map(partition -> _) - case partition @ TopicPartitionField() => (partition -> topicPartition.topic.value).asRight[SinkError] - case partition @ PartitionPartitionField() => - (partition -> padString(topicPartition.partition.toString)).asRight[SinkError] + case partition @ TopicPartitionField => (partition -> topicPartition.topic.value).asRight[SinkError] + case partition @ PartitionPartitionField => + val partitionPaddingStrategy = paddingService.padderFor("partition") + (partition -> partitionPaddingStrategy.padString(topicPartition.partition.toString)).asRight[SinkError] case partition @ DatePartitionField(_) => messageDetail.timestamp match { case Some(value) => (partition -> partition.formatter.format(value)).asRight[SinkError] @@ -263,8 +188,6 @@ class PartitionedS3FileNamingStrategy( } } - val reservedCharacters: Set[String] = Set("/", "\\") - private def getFieldStringValue(struct: SinkData, partitionName: Option[PartitionNamePath]) = adaptErrorResponse(SinkDataExtractor.extractPathFromSinkData(struct)(partitionName)).fold(Option.empty[String])( fieldVal => @@ -276,30 +199,4 @@ class PartitionedS3FileNamingStrategy( private def getPartitionValueFromSinkData(sinkData: SinkData, partitionName: PartitionNamePath): String = getFieldStringValue(sinkData, Option(partitionName)).getOrElse("[missing]") - override def shouldProcessPartitionValues: Boolean = true - - override val committedFilenameRegex: Regex = s"^[^/]+?/(?:.+/)*(.+)\\((\\d+)_(\\d+)\\).(.+)".r - - override def topicPartitionPrefix( - bucketAndPrefix: S3Location, - topicPartition: TopicPartition, - ): S3Location = bucketAndPrefix.withPath(s"${prefix(bucketAndPrefix)}/") -} - -object CommittedFileName { - - private val supportedExtensions: Set[String] = Format.values.toSet.map { f: Format => f.entryName.toLowerCase() } - - def unapply( - filename: String, - )( - implicit - 
s3FileNamingStrategy: S3FileNamingStrategy, ): Option[(Topic, Int, Offset, String)] = filename match { case s3FileNamingStrategy.committedFilenameRegex(topic, partition, end, extension) if supportedExtensions.contains(extension) => Some((Topic(topic), partition.toInt, Offset(end.toLong), extension)) case _ => None } } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManager.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManager.scala index 0ce0bcd8e..5436c0856 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManager.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManager.scala @@ -22,13 +22,12 @@ import io.lenses.streamreactor.connect.aws.s3.model.TopicPartition import io.lenses.streamreactor.connect.aws.s3.model.TopicPartitionOffset import io.lenses.streamreactor.connect.aws.s3.sink.FatalS3SinkError import io.lenses.streamreactor.connect.aws.s3.sink.NonFatalS3SinkError -import io.lenses.streamreactor.connect.aws.s3.sink.S3FileNamingStrategy import io.lenses.streamreactor.connect.aws.s3.sink.SinkError +import io.lenses.streamreactor.connect.aws.s3.storage.ResultProcessors.processAsKey import io.lenses.streamreactor.connect.aws.s3.storage.FileDeleteError import io.lenses.streamreactor.connect.aws.s3.storage.FileLoadError -import io.lenses.streamreactor.connect.aws.s3.storage.StorageInterface import io.lenses.streamreactor.connect.aws.s3.storage.ListResponse -import io.lenses.streamreactor.connect.aws.s3.storage.ResultProcessors.processAsKey +import io.lenses.streamreactor.connect.aws.s3.storage.StorageInterface class IndexManager( maxIndexes: Int, )( @@ -66,7 +65,7 @@ class IndexManager( if (indexes.size > maxIndexes) { logAndReturnMaxExceededError(topicPartition, indexes) } else if (filtered.size == indexes.size) { - val logLine = s"Latest file not found in index (${mostRecentIndexFile})" + val logLine = s"Latest file not found in index ($mostRecentIndexFile)" logger.error("[{}] {}", connectorTaskId.show, logLine) NonFatalS3SinkError(logLine).asLeft } else { @@ -129,14 +128,12 @@ class IndexManager( * Seeks the filesystem to find the latest offsets for a topic/partition. * * @param topicPartition the TopicPartition for which to retrieve the offsets - * @param fileNamingStrategy the S3FileNamingStrategy to use in the case that a fallback offset seeker is required. * @param bucket the configured bucket * @return either a SinkError or an option to a TopicPartitionOffset with the seek result.
*/ def seek( - topicPartition: TopicPartition, - fileNamingStrategy: S3FileNamingStrategy, - bucket: String, + topicPartition: TopicPartition, + bucket: String, ): Either[SinkError, Option[TopicPartitionOffset]] = { val indexLocation = IndexFilenames.indexForTopicPartition(topicPartition.topic.value, topicPartition.partition) storageInterface.listRecursive( diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/PartitionSearcherOptions.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/PartitionSearcherOptions.scala index 1d452a1bc..8a09ea6bf 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/PartitionSearcherOptions.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/PartitionSearcherOptions.scala @@ -17,8 +17,17 @@ package io.lenses.streamreactor.connect.aws.s3.source.config import scala.concurrent.duration.FiniteDuration +object PartitionSearcherOptions { + val ExcludeIndexes: Set[String] = Set(".indexes") + +} + +/** + * @param wildcardExcludes allows ignoring paths containing certain strings. Mainly it is used to prevent us from reading anything inside the .indexes key prefix, as these should be ignored by the source. + */ case class PartitionSearcherOptions( - recurseLevels: Int, - continuous: Boolean, - interval: FiniteDuration, + recurseLevels: Int, + continuous: Boolean, + interval: FiniteDuration, + wildcardExcludes: Set[String], ) diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextMode.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextMode.scala index 033fe9eb8..34ecd4d10 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextMode.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextMode.scala @@ -15,12 +15,12 @@ */ package io.lenses.streamreactor.connect.aws.s3.source.config +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum import io.lenses.streamreactor.connect.aws.s3.formats.reader.CustomTextStreamReader import io.lenses.streamreactor.connect.aws.s3.formats.reader.S3DataIterator import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEntry import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEnum -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEntry -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEnum import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties import io.lenses.streamreactor.connect.io.text.LineStartLineEndReader import io.lenses.streamreactor.connect.io.text.PrefixSuffixReader diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfig.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfig.scala index 6c0e3caed..69b9dbf86 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfig.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfig.scala @@ -27,6 +27,7 @@ import 
io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.SOURCE_PAR import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.SOURCE_PARTITION_EXTRACTOR_TYPE import io.lenses.streamreactor.connect.aws.s3.model.CompressionCodec import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location +import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3SourcePropsSchema import io.lenses.streamreactor.connect.aws.s3.storage.FileListError import io.lenses.streamreactor.connect.aws.s3.storage.FileMetadata import io.lenses.streamreactor.connect.aws.s3.storage.ListResponse @@ -111,7 +112,7 @@ object SourceBucketOptions { kcql: Kcql => for { source <- S3Location.splitAndValidate(kcql.getSource, allowSlash = true) - format <- FormatSelection.fromKcql(kcql) + format <- FormatSelection.fromKcql(kcql, S3SourcePropsSchema.schema) //extract the envelope. If not present, default to false hasEnvelope <- extractEnvelope(Option(kcql.getProperties).map(_.asScala.toMap).getOrElse(Map.empty)) diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigDef.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigDef.scala index 4cddd7650..37cfac8ac 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigDef.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigDef.scala @@ -21,6 +21,7 @@ import com.typesafe.scalalogging.LazyLogging import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ import io.lenses.streamreactor.connect.aws.s3.config._ import io.lenses.streamreactor.connect.aws.s3.config.processors.ConfigDefProcessor +import io.lenses.streamreactor.connect.aws.s3.config.processors.DeprecationConfigDefProcessor import io.lenses.streamreactor.connect.aws.s3.config.processors.LowerCaseKeyConfigDefProcessor import io.lenses.streamreactor.connect.aws.s3.config.processors.YamlProfileProcessor import org.apache.kafka.common.config.ConfigDef @@ -105,7 +106,7 @@ object S3SourceConfigDef { class S3SourceConfigDef() extends ConfigDef with LazyLogging { private val processorChain: List[ConfigDefProcessor] = - List(new LowerCaseKeyConfigDefProcessor, new YamlProfileProcessor) + List(new LowerCaseKeyConfigDefProcessor, new DeprecationConfigDefProcessor, new YamlProfileProcessor) override def parse(jProps: util.Map[_, _]): util.Map[String, AnyRef] = { val scalaProps: Map[Any, Any] = jProps.asScala.toMap diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/SourcePartitionSearcherSettings.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/SourcePartitionSearcherSettings.scala index 7335c55e9..704bece16 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/SourcePartitionSearcherSettings.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/SourcePartitionSearcherSettings.scala @@ -18,6 +18,7 @@ package io.lenses.streamreactor.connect.aws.s3.source.config import com.datamountaineer.streamreactor.common.config.base.traits.BaseSettings import io.lenses.streamreactor.connect.aws.s3.config.S3Config import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ +import
io.lenses.streamreactor.connect.aws.s3.source.config.PartitionSearcherOptions.ExcludeIndexes import scala.concurrent.duration.DurationLong @@ -30,5 +31,6 @@ trait SourcePartitionSearcherSettings extends BaseSettings { interval = S3Config.getLong(props, SOURCE_PARTITION_SEARCH_INTERVAL_MILLIS).getOrElse( SOURCE_PARTITION_SEARCH_INTERVAL_MILLIS_DEFAULT, ).millis, + wildcardExcludes = ExcludeIndexes, ) } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchema.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchema.scala similarity index 56% rename from kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchema.scala rename to kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchema.scala index e8e61b19b..bb9efe7e1 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchema.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchema.scala @@ -15,7 +15,9 @@ */ package io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEnum._ +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum._ import io.lenses.streamreactor.connect.config.kcqlprops.BooleanPropsSchema import io.lenses.streamreactor.connect.config.kcqlprops.EnumPropsSchema import io.lenses.streamreactor.connect.config.kcqlprops.IntPropsSchema @@ -23,24 +25,20 @@ import io.lenses.streamreactor.connect.config.kcqlprops.KcqlPropsSchema import io.lenses.streamreactor.connect.config.kcqlprops.PropsSchema import io.lenses.streamreactor.connect.config.kcqlprops.StringPropsSchema -object S3PropsSchema { +object S3SourcePropsSchema { - private val keys = Map[S3PropsKeyEntry, PropsSchema]( - ReadTextMode -> EnumPropsSchema(ReadTextModeEnum), - ReadRegex -> StringPropsSchema, - ReadStartTag -> StringPropsSchema, - ReadEndTag -> StringPropsSchema, - ReadStartLine -> StringPropsSchema, - ReadEndLine -> StringPropsSchema, - BufferSize -> IntPropsSchema, - ReadTrimLine -> BooleanPropsSchema, - StoreEnvelope -> BooleanPropsSchema, - StoreEnvelopeKey -> BooleanPropsSchema, - StoreEnvelopeHeaders -> BooleanPropsSchema, - StoreEnvelopeValue -> BooleanPropsSchema, - StoreEnvelopeMetadata -> BooleanPropsSchema, + private[source] val keys = Map[S3PropsKeyEntry, PropsSchema]( + ReadTextMode -> EnumPropsSchema(ReadTextModeEnum), + ReadRegex -> StringPropsSchema, + ReadStartTag -> StringPropsSchema, + ReadEndTag -> StringPropsSchema, + ReadStartLine -> StringPropsSchema, + ReadEndLine -> StringPropsSchema, + BufferSize -> IntPropsSchema, + ReadTrimLine -> BooleanPropsSchema, + StoreEnvelope -> BooleanPropsSchema, ) - val schema = KcqlPropsSchema(S3PropsKeyEnum, keys) + private[source] val schema = KcqlPropsSchema(S3PropsKeyEnum, keys) } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/distribution/PartitionSearcher.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/distribution/PartitionSearcher.scala index 
567c9badb..da8ce055b 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/distribution/PartitionSearcher.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/distribution/PartitionSearcher.scala @@ -60,6 +60,7 @@ class PartitionSearcher( root, config, originalPartitions, + settings.wildcardExcludes, listS3ObjF, connectorTaskId, ) @@ -81,4 +82,5 @@ class PartitionSearcher( Option.empty, ) } + } diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/ResultReader.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/ResultReader.scala index 504311e11..957fb2efd 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/ResultReader.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/ResultReader.scala @@ -41,17 +41,8 @@ class ResultReader( * Retrieves the results for a particular reader, or None if no further results are available */ def retrieveResults(limit: Int): Option[Vector[SourceRecord]] = { - val results: Vector[SourceRecord] = accumulate(limit, reader, Vector.empty[SourceRecord]) - - if (results.isEmpty) { - None - } else { - Some( - results, - ) - } - + Option.when(results.nonEmpty)(results) } @tailrec diff --git a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryLister.scala b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryLister.scala index ca24c1d25..4f15acccf 100644 --- a/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryLister.scala +++ b/kafka-connect-aws-s3/src/main/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryLister.scala @@ -25,31 +25,46 @@ import software.amazon.awssdk.services.s3.model._ import scala.jdk.CollectionConverters.IteratorHasAsScala object AwsS3DirectoryLister extends LazyLogging { + + /** + * @param wildcardExcludes allows ignoring paths containing certain strings. Mainly it is used to prevent us from reading anything inside the .indexes key prefix, as these should be ignored by the source. + */ def findDirectories( bucketAndPrefix: S3Location, completionConfig: DirectoryFindCompletionConfig, exclude: Set[String], + wildcardExcludes: Set[String], listObjectsF: ListObjectsV2Request => Iterator[ListObjectsV2Response], connectorTaskId: ConnectorTaskId, ): IO[DirectoryFindResults] = for { - iterator <- IO(listObjectsF(createListObjectsRequest(bucketAndPrefix))) - prefixInfo <- extractPrefixesFromResponse(iterator, exclude, connectorTaskId, completionConfig.levelsToRecurse) + iterator <- IO(listObjectsF(createListObjectsRequest(bucketAndPrefix))) + prefixInfo <- extractPrefixesFromResponse(iterator, + exclude, + wildcardExcludes, + connectorTaskId, + completionConfig.levelsToRecurse, + ) flattened <- flattenPrefixes( bucketAndPrefix, prefixInfo.partitions, completionConfig, exclude, + wildcardExcludes, listObjectsF, connectorTaskId, ) } yield DirectoryFindResults(flattened) + /** + * @param wildcardExcludes allows ignoring paths containing certain strings. Mainly it is used to prevent us from reading anything inside the .indexes key prefix, as these should be ignored by the source. 
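+   * For example (an illustrative note; the path below is hypothetical): with wildcardExcludes = Set(".indexes"),
+   * any prefix containing ".indexes" (such as "streams/.indexes/mySink/") is dropped from the returned directory set,
+   * even when the current task owns that directory.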
+ */ private def flattenPrefixes( bucketAndPrefix: S3Location, prefixes: Set[String], completionConfig: DirectoryFindCompletionConfig, exclude: Set[String], + wildcardExcludes: Set[String], listObjectsF: ListObjectsV2Request => Iterator[ListObjectsV2Response], connectorTaskId: ConnectorTaskId, ): IO[Set[String]] = @@ -57,11 +72,13 @@ object AwsS3DirectoryLister extends LazyLogging { else { prefixes.map(bucketAndPrefix.fromRoot).toList .traverse( - findDirectories(_, - completionConfig.copy(levelsToRecurse = completionConfig.levelsToRecurse - 1), - exclude, - listObjectsF, - connectorTaskId, + findDirectories( + _, + completionConfig.copy(levelsToRecurse = completionConfig.levelsToRecurse - 1), + exclude, + wildcardExcludes, + listObjectsF, + connectorTaskId, ).map(_.partitions), ) .map { result => @@ -83,10 +100,11 @@ object AwsS3DirectoryLister extends LazyLogging { } private def extractPrefixesFromResponse( - iterator: Iterator[ListObjectsV2Response], - exclude: Set[String], - connectorTaskId: ConnectorTaskId, - levelsToRecurse: Int, + iterator: Iterator[ListObjectsV2Response], + exclude: Set[String], + wildcardExcludes: Set[String], + connectorTaskId: ConnectorTaskId, + levelsToRecurse: Int, ): IO[DirectoryFindResults] = IO { val paths = iterator.foldLeft(Set.empty[String]) { @@ -98,7 +116,11 @@ object AwsS3DirectoryLister extends LazyLogging { if (levelsToRecurse > 0) { acc + prefix } else { - if (connectorTaskId.ownsDir(prefix) && !exclude.contains(prefix)) acc + prefix + if ( + connectorTaskId.ownsDir(prefix) && !exclude.contains(prefix) && !wildcardExcludes.exists(we => + prefix.contains(we), + ) + ) acc + prefix else acc } } diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/CommonConfigDefTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/CommonConfigDefTest.scala index 92af2101c..456b66d80 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/CommonConfigDefTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/CommonConfigDefTest.scala @@ -15,15 +15,16 @@ */ package io.lenses.streamreactor.connect.aws.s3.config -import cats.implicits._ import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ +import org.scalatest.EitherValues import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import scala.jdk.CollectionConverters.MapHasAsJava import scala.jdk.CollectionConverters.MapHasAsScala +import scala.util.Try -class CommonConfigDefTest extends AnyFlatSpec with Matchers { +class CommonConfigDefTest extends AnyFlatSpec with Matchers with EitherValues { private val DeprecatedProps: Map[String, String] = Map( DEP_AWS_ACCESS_KEY -> "DepAccessKey", @@ -53,21 +54,10 @@ class CommonConfigDefTest extends AnyFlatSpec with Matchers { resultMap.keys should contain allElementsOf DefaultProps.keys } - "CommonConfigDef" should "parse deprecated properties" in { - val resultMap = CommonConfigDef.config.parse(DeprecatedProps.asJava).asScala - resultMap should have size 18 - DeprecatedProps.filterNot { case (k, _) => k == KCQL_CONFIG }.foreach { - case (k, _) => resultMap.get(k) should be(None) - } - resultMap.keys should contain allElementsOf DefaultProps.keys + "CommonConfigDef" should "not parse deprecated properties" in { + Try(CommonConfigDef.config.parse(DeprecatedProps.asJava)).toEither.left.value.getMessage should startWith( + "The following properties have been deprecated", + 
) } - "CommonConfigDef" should "parse merged properties" in { - val mergedProps = DefaultProps.combine(DeprecatedProps) - val resultMap = CommonConfigDef.config.parse(mergedProps.asJava).asScala - DeprecatedProps.filterNot { case (k, _) => k == KCQL_CONFIG }.foreach { - case (k, _) => resultMap.get(k) should be(None) - } - resultMap.keys should contain allElementsOf DefaultProps.keys - } } diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessorTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessorTest.scala new file mode 100644 index 000000000..b3d260f3d --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/config/processors/DeprecationConfigDefProcessorTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.config.processors + +import org.scalatest.EitherValues +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers + +class DeprecationConfigDefProcessorTest extends AnyFunSuite with Matchers with EitherValues { + + private val okProperties = Map( + "connect.s3.aws.access.key" -> "myKey", + "aws.s3.someProperty" -> "value1", + "aws.s3.anotherProp" -> "value2", + ) + + test("process should return Right when no deprecated properties are found") { + val processor = new DeprecationConfigDefProcessor() + + val result = processor.process(okProperties) + result shouldBe Right(okProperties) + } + + test("process should return Left with error message when deprecated properties are found") { + val processor = new DeprecationConfigDefProcessor() + val inputConfig = okProperties ++ Map( + "aws.access.key" -> "value1", + "aws.vhost.bucket" -> "value2", + ) + + val result = processor.process(inputConfig) + val errorMessage = result.left.value.getMessage + errorMessage should include("The following properties have been deprecated:") + errorMessage should include("Change `aws.access.key` to `connect.s3.aws.access.key`") + errorMessage should include("Change `aws.vhost.bucket` to `connect.s3.vhost.bucket`") + } + + test("process should return Right when empty input configuration is provided") { + val processor = new DeprecationConfigDefProcessor() + val result = processor.process(Map.empty) + result shouldBe Right(Map.empty) + } + +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionDisplayTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionDisplayTest.scala deleted file mode 100644 index a74e45039..000000000 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionDisplayTest.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2017-2023 Lenses.io Ltd - * - * Licensed under the Apache License, Version 2.0 (the 
"License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.lenses.streamreactor.connect.aws.s3.model - -import com.datamountaineer.kcql.Kcql -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.KeysAndValues -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values -import org.mockito.MockitoSugar -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.matchers.should.Matchers - -class PartitionDisplayTest extends AnyFlatSpec with MockitoSugar with Matchers { - - val kcql: Kcql = mock[Kcql] - - "apply" should "recognise KeysAndValues from KCQL" in { - when(kcql.getWithPartitioner).thenReturn("KEYSANDVALUES") - - PartitionDisplay(kcql) should be(KeysAndValues) - } - - "apply" should "recognise Keys from KCQL" in { - when(kcql.getWithPartitioner).thenReturn("values") - - PartitionDisplay(kcql) should be(Values) - } - - "apply" should "default to KeysAndValues when no partitioner specified in kcql" in { - when(kcql.getWithPartitioner).thenReturn(null) - - PartitionDisplay(kcql) should be(KeysAndValues) - } -} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionFieldTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionFieldTest.scala index d0fbaf5c1..a58f675f3 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionFieldTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/PartitionFieldTest.scala @@ -44,7 +44,7 @@ class PartitionFieldTest extends AnyFlatSpec with MockitoSugar with Matchers { "partitionField.apply" should "parse partitions by whole key" in { when(kcql.getPartitionBy).thenReturn(Seq("_key").iterator.asJava) - PartitionField(kcql) should be(Seq(WholeKeyPartitionField())) + PartitionField(kcql) should be(Seq(WholeKeyPartitionField)) } "partitionField.apply" should "parse partitions by keys" in { diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFileTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFileTest.scala deleted file mode 100644 index dda06a18c..000000000 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/model/S3StoredFileTest.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2017-2023 Lenses.io Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.lenses.streamreactor.connect.aws.s3.model - -import io.lenses.streamreactor.connect.aws.s3.config.JsonFormatSelection -import io.lenses.streamreactor.connect.aws.s3.sink.HierarchicalS3FileNamingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.NoOpPaddingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.PartitionedS3FileNamingStrategy -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionField -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.matchers.should.Matchers - -class S3StoredFileTest extends AnyFlatSpec with Matchers { - - // aim: we want to support this eventually - //val path = "dragon-test/1.json" - - "apply" should "parse hierarchical scheme" in { - - implicit val hierarchical: HierarchicalS3FileNamingStrategy = - new HierarchicalS3FileNamingStrategy(JsonFormatSelection, NoOpPaddingStrategy) - - S3StoredFile("dragon-test/myTopicName/1/1.json") should be(Some(S3StoredFile( - "dragon-test/myTopicName/1/1.json", - Topic("myTopicName").withPartition(1).withOffset(Offset(1)), - ))) - } - - "apply" should "parse partitioned scheme" in { - - implicit val partitioned: PartitionedS3FileNamingStrategy = new PartitionedS3FileNamingStrategy( - JsonFormatSelection, - NoOpPaddingStrategy, - PartitionSelection(Seq.empty[PartitionField]), - ) - - S3StoredFile("dragon-test/myTopicName(1_2).json") should be(Some(S3StoredFile( - "dragon-test/myTopicName(1_2).json", - Topic("myTopicName").withPartition(1).withOffset(Offset(2)), - ))) - } -} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/CommittedFileNameTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/CommittedFileNameTest.scala deleted file mode 100644 index c26aa6f44..000000000 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/CommittedFileNameTest.scala +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2017-2023 Lenses.io Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.lenses.streamreactor.connect.aws.s3.sink - -import io.lenses.streamreactor.connect.aws.s3.config.AvroFormatSelection -import io.lenses.streamreactor.connect.aws.s3.config.JsonFormatSelection -import io.lenses.streamreactor.connect.aws.s3.model._ -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionNamePath -import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection -import io.lenses.streamreactor.connect.aws.s3.sink.config.ValuePartitionField -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers - -class CommittedFileNameTest extends AnyFlatSpecLike with Matchers { - - class TestContext(fileNamingStrategy: S3FileNamingStrategy) { - implicit val impFileNamingStrategy: S3FileNamingStrategy = fileNamingStrategy - } - - val partitions: PartitionSelection = PartitionSelection(Vector(ValuePartitionField(PartitionNamePath("partition1")), - ValuePartitionField(PartitionNamePath("partition2")), - )) - - class HierarchicalJsonTestContext - extends TestContext(new HierarchicalS3FileNamingStrategy(JsonFormatSelection, NoOpPaddingStrategy)) - - class PartitionedAvroTestContext - extends TestContext(new PartitionedS3FileNamingStrategy(AvroFormatSelection, NoOpPaddingStrategy, partitions)) - - "unapply" should "recognise hierarchical filenames in prefix/topic/927/77.json format" in new HierarchicalJsonTestContext { - CommittedFileName.unapply("prefix/topic/927/77.json") should be(Some((Topic("topic"), 927, Offset(77), "json"))) - } - - "unapply" should "not recognise hierarchical filenames other formats" in new HierarchicalJsonTestContext { - CommittedFileName.unapply("prefix/topic/927/77") should be(None) - } - - "unapply" should "not recognise hierarchical filenames for non-supported file types" in new HierarchicalJsonTestContext { - CommittedFileName.unapply("prefix/topic/927/77.doc") should be(None) - } - - "unapply" should "not recognise hierarchical filenames for a long path" in new HierarchicalJsonTestContext { - CommittedFileName.unapply("extra/long/prefix/topic/927/77.doc") should be(None) - } - - "unapply" should "recognise partitioned filenames in prefix/topic/927/77.json format" in new PartitionedAvroTestContext { - CommittedFileName.unapply("prefix/partition1=something/topic(927_77).json") should be(Some((Topic("topic"), - 927, - Offset(77), - "json", - ))) - CommittedFileName.unapply("prefix/partition1=something/partition2=else/topic(927_77).json") should be( - Some((Topic("topic"), 927, Offset(77), "json")), - ) - CommittedFileName.unapply( - "prefix/partition1=something/partition2=else/partition3=sausages/topic(927_77).json", - ) should be(Some((Topic("topic"), 927, Offset(77), "json"))) - } - - "unapply" should "not recognise partitioned filenames other formats" in new PartitionedAvroTestContext { - CommittedFileName.unapply("prefix/partition1=something/partition2=else/topic(927_77)") should be(None) - } - - "unapply" should "not recognise partitioned filenames for non-supported file types" in new PartitionedAvroTestContext { - CommittedFileName.unapply("prefix/partition1=something/partition2=else/topic(927_77).doc") should be(None) - } - - "unapply" should "not recognise partitioned filenames for a long path" in new PartitionedAvroTestContext { - CommittedFileName.unapply("extra/long/prefix/partition1=something/partition2=else/topic(927_77).doc") should be( - None, - ) - } - - "unapply" should "support valid kafka topic name" in new PartitionedAvroTestContext { - CommittedFileName.unapply( - 
"extra/long/prefix/partition1=something/partition2=else/REAL_val1d-T0PIC.name(927_77).csv", - ) should - be(Some((Topic("REAL_val1d-T0PIC.name"), 927, Offset(77), "csv"))) - } - -} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/PaddingStrategyTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/PaddingStrategyTest.scala index 998b52379..52b5101a3 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/PaddingStrategyTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/PaddingStrategyTest.scala @@ -20,10 +20,6 @@ import org.scalatest.matchers.should.Matchers class PaddingStrategyTest extends AnyFlatSpecLike with Matchers { - "NoOpPaddingStrategy" should "return string as is" in { - NoOpPaddingStrategy.padString("1") should be("1") - } - "LeftPaddingStrategy" should "pad string left" in { LeftPadPaddingStrategy(5, '0').padString("2") should be("00002") } diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/LocalStagingAreaTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/LocalStagingAreaTest.scala index dececc66e..7eac58cdd 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/LocalStagingAreaTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/LocalStagingAreaTest.scala @@ -23,37 +23,29 @@ import org.scalatest.matchers.should.Matchers import java.io.File import java.nio.file.Files -import scala.jdk.CollectionConverters.MapHasAsJava class LocalStagingAreaTest extends AnyFlatSpec with Matchers with EitherValues { private val tmpDir = Files.createTempDirectory("S3OutputStreamOptionsTest") behavior of "LocalStagingArea" - private def adapt(map: Map[String, String]) = { - val newMap = map + { - "connect.s3.kcql" -> "assda" - } - S3SinkConfigDefBuilder(newMap.asJava) - } - it should "create BuildLocalOutputStreamOptions when temp directory has been supplied" in { implicit val connectorTaskId: ConnectorTaskId = ConnectorTaskId("unusedSinkName", 1, 1) - LocalStagingArea(adapt(Map(LOCAL_TMP_DIRECTORY -> s"$tmpDir/my/path"))) should + LocalStagingArea(TestConfigDefBuilder(LOCAL_TMP_DIRECTORY -> s"$tmpDir/my/path")) should be(Right(LocalStagingArea(new File(s"$tmpDir/my/path")))) } it should "create BuildLocalOutputStreamOptions when temp directory and sink name has been supplied" in { implicit val connectorTaskId: ConnectorTaskId = ConnectorTaskId("unusedSinkName", 1, 1) // should ignore the sinkName - LocalStagingArea(adapt(Map(LOCAL_TMP_DIRECTORY -> s"$tmpDir/my/path"))) should + LocalStagingArea(TestConfigDefBuilder((LOCAL_TMP_DIRECTORY -> s"$tmpDir/my/path"))) should be(Right(LocalStagingArea(new File(s"$tmpDir/my/path")))) } it should "create BuildLocalOutputStreamOptions when only sink name has been supplied" in { implicit val connectorTaskId: ConnectorTaskId = ConnectorTaskId("superSleekSinkName", 1, 1) val tempDir = System.getProperty("java.io.tmpdir") - val result = LocalStagingArea(adapt(Map())) + val result = LocalStagingArea(TestConfigDefBuilder()) result.isRight should be(true) result.value match { case LocalStagingArea(file) => file.toString should startWith(s"$tempDir/superSleekSinkName".replace("//", "/")) diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplayTest.scala 
b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplayTest.scala new file mode 100644 index 000000000..53d44cf50 --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/PartitionDisplayTest.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.config + +import com.datamountaineer.kcql.Kcql +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum.PartitionIncludeKeys +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.KeysAndValues +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values +import io.lenses.streamreactor.connect.aws.s3.sink.config.kcqlprops.S3SinkPropsSchema +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties +import org.mockito.MockitoSugar +import org.scalatest.BeforeAndAfter +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class PartitionDisplayTest extends AnyFlatSpec with MockitoSugar with Matchers with BeforeAndAfter { + + private val kcql: Kcql = mock[Kcql] + private val emptyProps: KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type] = + KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type](schema = S3SinkPropsSchema.schema, map = Map.empty) + + before { + reset(kcql) + } + + "apply" should "recognise KeysAndValues from KCQL" in { + when(kcql.getWithPartitioner).thenReturn("KEYSANDVALUES") + + PartitionDisplay(kcql, emptyProps, Values) should be(KeysAndValues) + } + + "apply" should "recognise Keys from KCQL" in { + when(kcql.getWithPartitioner).thenReturn("values") + + PartitionDisplay(kcql, emptyProps, KeysAndValues) should be(Values) + } + + "apply" should "recognise Keys from KCQL props" in { + when(kcql.getWithPartitioner).thenReturn(null) + + def keyValueProp(includeKeys: Boolean): KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type] = + KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type](schema = S3SinkPropsSchema.schema, + map = Map( + PartitionIncludeKeys.entryName -> includeKeys.toString, + ), + ) + PartitionDisplay(kcql, keyValueProp(true), Values) should be(KeysAndValues) + PartitionDisplay(kcql, keyValueProp(false), Values) should be(Values) + } + + "apply" should "default to specified default when no partitioner specified in kcql" in { + when(kcql.getWithPartitioner).thenReturn(null) + + PartitionDisplay(kcql, emptyProps, KeysAndValues) should be(KeysAndValues) + PartitionDisplay(kcql, emptyProps, Values) should be(Values) + } +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/TestConfigDefBuilder.scala 
b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/TestConfigDefBuilder.scala new file mode 100644 index 000000000..1a36d07a9 --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/TestConfigDefBuilder.scala @@ -0,0 +1,30 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.config + +import scala.jdk.CollectionConverters.MapHasAsJava + +object TestConfigDefBuilder { + + def apply(pairs: (String, String)*): S3SinkConfigDefBuilder = { + val map: Map[String, String] = pairs.toMap + val newMap = map + { + "connect.s3.kcql" -> "dummy value" + } + S3SinkConfigDefBuilder(newMap.asJava) + } + +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingServiceTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingServiceTest.scala new file mode 100644 index 000000000..02e84c9d7 --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingServiceTest.scala @@ -0,0 +1,100 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.padding + +import cats.implicits.catsSyntaxOptionId +import cats.implicits.none +import io.lenses.streamreactor.connect.aws.s3.sink.RightPadPaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.config.S3SinkConfigDefBuilder +import io.lenses.streamreactor.connect.aws.s3.sink.config.kcqlprops.S3SinkPropsSchema +import org.mockito.MockitoSugar._ +import org.scalatest.EitherValues +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers + +class PaddingServiceTest extends AnyFunSuite with Matchers with EitherValues { + private val paddingStrategy = PaddingType.LeftPad.toPaddingStrategy(12, '0') + private val fields = Map("offset" -> paddingStrategy) + private val emptyProps = S3SinkPropsSchema.schema.readProps() + + test("PaddingService should return a padder for the specified field") { + val paddingService = new PaddingService(fields) + paddingService.padderFor("offset").padString("123") shouldEqual "000000000123" + } + + test("PaddingService should return an identity function when field not in fields") { + val paddingService = new PaddingService(fields) + paddingService.padderFor("other").padString("123") shouldEqual "123" + } + test("PaddingService should apply different padding strategies to multiple fields") { + val fields = Map( + "partition" -> RightPadPaddingStrategy(10, '0'), + "offset" -> paddingStrategy, + ) + val paddingService = new PaddingService(fields) + paddingService.padderFor("partition").padString("123") shouldEqual "1230000000" + paddingService.padderFor("offset").padString("123") shouldEqual "000000000123" + } + + test("PaddingService should return an error when both KCQL properties and ConfigDef padding configurations are set") { + val rightPadConfigDef: S3SinkConfigDefBuilder = mockConfigDefPadding(RightPadPaddingStrategy(10, '-').some) + val kcqlProps = S3SinkPropsSchema.schema.readProps( + "padding.length.partition" -> "5", + "padding.char" -> "#", + "padding.type" -> "RightPad", + ) + val paddingService = PaddingService.fromConfig(rightPadConfigDef, kcqlProps) + + paddingService.left.value.getMessage should startWith("Unable to process both padding") + } + + test("PaddingService should respect padding defined in KCQL properties") { + val kcqlProps = S3SinkPropsSchema.schema.readProps( + "padding.length.partition" -> "5", + "padding.char" -> "#", + "padding.type" -> "RightPad", + ) + val paddingService = + PaddingService.fromConfig(mockConfigDefPadding(none), kcqlProps).getOrElse(fail("No padding service found")) + + paddingService.padderFor("partition").padString("123") shouldEqual "123##" + + } + + test("fromConfig should create a PaddingService from S3SinkConfigDefBuilder operating only on 'offset'") { + val rightPadConfigDef: S3SinkConfigDefBuilder = mockConfigDefPadding(RightPadPaddingStrategy(10, '-').some) + val paddingService = + PaddingService.fromConfig(rightPadConfigDef, emptyProps).getOrElse(fail("No padding service found")) + + paddingService.padderFor("offset").padString("123") shouldEqual "123-------" + paddingService.padderFor("other").padString("123") shouldEqual "123" + } + + test("fromConfig should return a PaddingService with default values when no config is provided") { + val emptyConfigDef: S3SinkConfigDefBuilder = mockConfigDefPadding(none) + val paddingService = + PaddingService.fromConfig(emptyConfigDef, emptyProps).getOrElse(fail("No padding service found")) + + paddingService.padderFor("offset").padString("123") shouldEqual 
"000000000123" + paddingService.padderFor("other").padString("123") shouldEqual "123" + } + + private def mockConfigDefPadding(maybePaddingStrategy: Option[RightPadPaddingStrategy]) = { + val configDef = mock[S3SinkConfigDefBuilder] + when(configDef.getPaddingStrategy).thenReturn(maybePaddingStrategy) + configDef + } +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettingsTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettingsTest.scala new file mode 100644 index 000000000..a475a0e5c --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/config/padding/PaddingStrategySettingsTest.scala @@ -0,0 +1,49 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.config.padding +import cats.implicits.catsSyntaxOptionId +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_LENGTH +import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings.PADDING_STRATEGY +import io.lenses.streamreactor.connect.aws.s3.sink.LeftPadPaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.config.TestConfigDefBuilder +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers + +class PaddingStrategySettingsTest extends AnyFunSuite with Matchers { + + test("getPaddingStrategy should return None when padding strategy is not provided") { + val settings = TestConfigDefBuilder() + settings.getPaddingStrategy should equal(None) + } + + test("getPaddingStrategy should return None when padding length is less than 0") { + val settings = TestConfigDefBuilder(PADDING_STRATEGY -> "-1") + settings.getPaddingStrategy should equal(None) + } + + test("getPaddingStrategy should return None when padding length is much less than 0") { + val settings = TestConfigDefBuilder(PADDING_LENGTH -> "-199") + settings.getPaddingStrategy should equal(None) + } + + test("getPaddingStrategy should return the appropriate PaddingStrategy") { + val settings = TestConfigDefBuilder( + PADDING_LENGTH -> "12", + PADDING_STRATEGY -> "LeftPad", + ) + settings.getPaddingStrategy should equal(LeftPadPaddingStrategy(12, '0').some) + } +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamerTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamerTest.scala new file mode 100644 index 000000000..d09ea4ef0 --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3FileNamerTest.scala @@ -0,0 +1,42 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.aws.s3.sink.naming + +import io.lenses.streamreactor.connect.aws.s3.model.Topic +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingType.LeftPad +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers +class S3FileNamerTest extends AnyFunSuite with Matchers { + + private val extension = "avro" + private val paddingStrategy = LeftPad.toPaddingStrategy(5, '0') + private val topicPartitionOffset = Topic("topic").withPartition(9).atOffset(81) + + test("OffsetFileNamer.fileName should generate the correct file name") { + + val result = new OffsetS3FileNamer(paddingStrategy, extension).fileName(topicPartitionOffset) + + result shouldEqual "00081.avro" + } + + test("TopicPartitionOffsetFileNamer.fileName should generate the correct file name") { + + val result = + new TopicPartitionOffsetS3FileNamer(paddingStrategy, paddingStrategy, extension).fileName(topicPartitionOffset) + + result shouldEqual "topic(00009_00081).avro" + } +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamerTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamerTest.scala new file mode 100644 index 000000000..39b985bf4 --- /dev/null +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/naming/S3KeyNamerTest.scala @@ -0,0 +1,108 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.lenses.streamreactor.connect.aws.s3.sink.naming + +import cats.implicits.none +import io.lenses.streamreactor.connect.aws.s3.config.FormatSelection +import io.lenses.streamreactor.connect.aws.s3.config.JsonFormatSelection +import io.lenses.streamreactor.connect.aws.s3.model.Topic +import io.lenses.streamreactor.connect.aws.s3.model.location.S3Location +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionDisplay.Values +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection.defaultPartitionSelection +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionField +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionPartitionField +import io.lenses.streamreactor.connect.aws.s3.sink.config.PartitionSelection +import io.lenses.streamreactor.connect.aws.s3.sink.config.TopicPartitionField +import io.lenses.streamreactor.connect.aws.s3.sink.config.padding.PaddingService +import io.lenses.streamreactor.connect.aws.s3.sink.LeftPadPaddingStrategy +import io.lenses.streamreactor.connect.aws.s3.sink.PaddingStrategy +import org.mockito.ArgumentMatchers.anyString +import org.mockito.MockitoSugar +import org.scalatest.EitherValues +import org.scalatest.OptionValues +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers + +import java.nio.file.Files +import java.util.UUID + +class S3KeyNamerTest extends AnyFunSuite with Matchers with OptionValues with EitherValues with MockitoSugar { + + private val formatSelection: FormatSelection = JsonFormatSelection + private val paddingStrategy: PaddingStrategy = LeftPadPaddingStrategy(3, '0') + private val partitionSelection: PartitionSelection = defaultPartitionSelection(Values) + + private val fileNamer: S3FileNamer = + new OffsetS3FileNamer(paddingStrategy, JsonFormatSelection.extension) + + private val bucketAndPrefix = S3Location("my-bucket", Some("prefix")) + private val bucketNoPrefix = S3Location("my-bucket", none) + + private val TopicName = "my-topic" + private val Partition = 9 + private val Offset = 81L + + private val topicPartition = Topic(TopicName).withPartition(Partition).atOffset(Offset) + + private val partitionValues = Map[PartitionField, String]( + TopicPartitionField -> TopicName, + PartitionPartitionField -> Partition.toString, + ) + + private val paddingService = mock[PaddingService] + when(paddingService.padderFor(anyString)).thenReturn(paddingStrategy) + + private val s3KeyNamer = S3KeyNamer(formatSelection, partitionSelection, fileNamer, paddingService) + + test("stagingFile should generate the correct staging file path with no prefix") { + val stagingDirectory = Files.createTempDirectory("myTempDir").toFile + + val result = + s3KeyNamer.stagingFile(stagingDirectory, bucketNoPrefix, topicPartition.toTopicPartition, partitionValues) + + val fullPath = result.value.getPath.replace(stagingDirectory.toString, "") + val (path, uuid) = fullPath.splitAt(fullPath.length - 36) + path shouldEqual s"/$TopicName/00$Partition/json/" + UUID.fromString(uuid) + } + + test("stagingFile should generate the correct staging file path") { + val stagingDirectory = Files.createTempDirectory("myTempDir").toFile + + val result = + s3KeyNamer.stagingFile(stagingDirectory, bucketAndPrefix, topicPartition.toTopicPartition, partitionValues) + + val fullPath = result.value.getPath.replace(stagingDirectory.toString, "") + val (path, uuid) = fullPath.splitAt(fullPath.length - 36) + path shouldEqual s"/prefix/$TopicName/00$Partition/json/" + 
UUID.fromString(uuid) + } + + test("finalFilename should write to the root of the bucket with no prefix") { + + val result = s3KeyNamer.finalFilename(bucketNoPrefix, topicPartition, partitionValues) + + result.value.path.value shouldEqual s"$TopicName/00$Partition/0$Offset.json" + } + + test("finalFilename should generate the correct final S3 location") { + + val result = s3KeyNamer.finalFilename(bucketAndPrefix, topicPartition, partitionValues) + + result.value.path.value shouldEqual s"prefix/$TopicName/00$Partition/0$Offset.json" + } + +} diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManagerTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManagerTest.scala index fac9c7ca9..9c23ef713 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManagerTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/sink/seek/IndexManagerTest.scala @@ -22,7 +22,6 @@ import io.lenses.streamreactor.connect.aws.s3.model.Offset import io.lenses.streamreactor.connect.aws.s3.model.Topic import io.lenses.streamreactor.connect.aws.s3.sink.FatalS3SinkError import io.lenses.streamreactor.connect.aws.s3.sink.NonFatalS3SinkError -import io.lenses.streamreactor.connect.aws.s3.sink.S3FileNamingStrategy import io.lenses.streamreactor.connect.aws.s3.storage._ import org.mockito.ArgumentMatchers.any import org.mockito.ArgumentMatchers.anyString @@ -51,12 +50,10 @@ class IndexManagerTest extends AnyFlatSpec with MockitoSugar with EitherValues w private val maxIndexes = 5 - private val fileNamingStrategy = mock[S3FileNamingStrategy] - private val indexManager = new IndexManager(maxIndexes) after { - reset(storageInterface, fileNamingStrategy) + reset(storageInterface) } "write" should "write an index for a topic/partition/offset" in { @@ -252,7 +249,7 @@ class IndexManagerTest extends AnyFlatSpec with MockitoSugar with EitherValues w ).some.asRight, ) when(storageInterface.deleteFiles(eqTo(bucketName), any[List[String]])).thenReturn(().asRight) - val seekRes = indexManager.seek(topicPartition, fileNamingStrategy, bucketName) + val seekRes = indexManager.seek(topicPartition, bucketName) seekRes.value should be(Some(topicPartition.withOffset(Offset(70)))) val seekInOrder = inOrder(storageInterface) @@ -291,7 +288,7 @@ class IndexManagerTest extends AnyFlatSpec with MockitoSugar with EitherValues w when(storageInterface.pathExists(any[String], any[String])).thenReturn(true.asRight) when(storageInterface.deleteFiles(eqTo(bucketName), any[List[String]])).thenReturn(().asRight) - val seekRes = indexManager.seek(topicPartition, fileNamingStrategy, bucketName) + val seekRes = indexManager.seek(topicPartition, bucketName) val capturedEx = seekRes.left.value capturedEx shouldBe a[FatalS3SinkError] capturedEx.message() should startWith("Too many index files have accumulated") diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextModeTestFormatSelection.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextModeTestFormatSelection.scala index 797ee070d..59e08299a 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextModeTestFormatSelection.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/ReadTextModeTestFormatSelection.scala @@ -15,10 
+15,10 @@ */ package io.lenses.streamreactor.connect.aws.s3.source.config +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEntry +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEnum -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEntry -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEnum -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsSchema +import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3SourcePropsSchema import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -137,6 +137,6 @@ class ReadTextModeTestFormatSelection extends AnyFlatSpec with Matchers { } private def readProps(propsMap: Map[String, String]): KcqlProperties[S3PropsKeyEntry, S3PropsKeyEnum.type] = - S3PropsSchema.schema.readProps(propsMap) + S3SourcePropsSchema.schema.readPropsMap(propsMap) } diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigTests.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigTests.scala index 674347a0a..5e59c3495 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigTests.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/S3SourceConfigTests.scala @@ -16,6 +16,7 @@ package io.lenses.streamreactor.connect.aws.s3.source.config import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._ +import io.lenses.streamreactor.connect.aws.s3.source.config.PartitionSearcherOptions.ExcludeIndexes import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers @@ -32,8 +33,9 @@ class S3SourceConfigTests extends AnyFunSuite with Matchers { KCQL_CONFIG -> "INSERT INTO topic SELECT * FROM bucket:/a/b/c", ).asJava, ) match { - case Left(value) => fail(value.toString) - case Right(value) => value.partitionSearcher shouldBe PartitionSearcherOptions(0, false, 1.seconds) + case Left(value) => fail(value.toString) + case Right(value) => + value.partitionSearcher shouldBe PartitionSearcherOptions(0, continuous = false, 1.seconds, ExcludeIndexes) } } test("partition search options disables the continuous search") { @@ -46,8 +48,9 @@ class S3SourceConfigTests extends AnyFunSuite with Matchers { KCQL_CONFIG -> "INSERT INTO topic SELECT * FROM bucket:/a/b/c", ).asJava, ) match { - case Left(value) => fail(value.toString) - case Right(value) => value.partitionSearcher shouldBe PartitionSearcherOptions(1, false, 1.seconds) + case Left(value) => fail(value.toString) + case Right(value) => + value.partitionSearcher shouldBe PartitionSearcherOptions(1, continuous = false, 1.seconds, ExcludeIndexes) } } test("enable continuous partitions polling") { @@ -60,8 +63,9 @@ class S3SourceConfigTests extends AnyFunSuite with Matchers { KCQL_CONFIG -> "INSERT INTO topic SELECT * FROM bucket:/a/b/c", ).asJava, ) match { - case Left(value) => fail(value.toString) - case Right(value) => value.partitionSearcher shouldBe PartitionSearcherOptions(1, true, 1.seconds) + case Left(value) => fail(value.toString) + case Right(value) => + value.partitionSearcher shouldBe PartitionSearcherOptions(1, continuous = true, 1.seconds, 
ExcludeIndexes) } } test("not specifying the SOURCE_PARTITION_SEARCH_MODE defaults to true") { @@ -73,8 +77,9 @@ class S3SourceConfigTests extends AnyFunSuite with Matchers { KCQL_CONFIG -> "INSERT INTO topic SELECT * FROM bucket:/a/b/c", ).asJava, ) match { - case Left(value) => fail(value.toString) - case Right(value) => value.partitionSearcher shouldBe PartitionSearcherOptions(1, true, 1.seconds) + case Left(value) => fail(value.toString) + case Right(value) => + value.partitionSearcher shouldBe PartitionSearcherOptions(1, continuous = true, 1.seconds, ExcludeIndexes) } } } diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchemaTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchemaTest.scala similarity index 88% rename from kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchemaTest.scala rename to kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchemaTest.scala index 4814b3c77..2b689df93 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3PropsSchemaTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/config/kcqlprops/S3SourcePropsSchemaTest.scala @@ -15,15 +15,15 @@ */ package io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops +import io.lenses.streamreactor.connect.aws.s3.config.kcqlprops.S3PropsKeyEnum import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEntry import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEnum -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsKeyEnum -import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3PropsSchema +import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.S3SourcePropsSchema import io.lenses.streamreactor.connect.aws.s3.source.config.kcqlprops.ReadTextModeEnum.Regex import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -class S3PropsSchemaTest extends AnyFlatSpec with Matchers { +class S3SourcePropsSchemaTest extends AnyFlatSpec with Matchers { "S3PropsSchema" should "parse expected configs" in { val config = Map[String, String]( @@ -32,7 +32,7 @@ class S3PropsSchemaTest extends AnyFlatSpec with Matchers { "read.text.start.tag" -> "", "read.text.end.tag" -> "", ) - val props = S3PropsSchema.schema.readProps(config) + val props = S3SourcePropsSchema.schema.readPropsMap(config) props.getEnumValue[ReadTextModeEntry, ReadTextModeEnum.type](ReadTextModeEnum, S3PropsKeyEnum.ReadTextMode, ) should be(Some(Regex)) diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/PartitionDiscoveryTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/PartitionDiscoveryTest.scala index df9fe5a71..8f39567bb 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/PartitionDiscoveryTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/source/reader/PartitionDiscoveryTest.scala @@ -14,7 +14,7 @@ * limitations under the License. 
*/ package io.lenses.streamreactor.connect.aws.s3.source.reader - +import io.lenses.streamreactor.connect.aws.s3.source.config.PartitionSearcherOptions.ExcludeIndexes import cats.effect.IO import cats.effect.kernel.Ref import cats.effect.unsafe.implicits.global @@ -40,7 +40,7 @@ class PartitionDiscoveryTest extends AnyFlatSpecLike with Matchers with MockitoS "PartitionDiscovery" should "handle failure on PartitionSearcher and resume" in { val fileQueueProcessor: SourceFileQueue = mock[SourceFileQueue] val limit = 10 - val options = PartitionSearcherOptions(1, true, 100.millis) + val options = PartitionSearcherOptions(1, continuous = true, 100.millis, ExcludeIndexes) trait Count { def getCount: IO[Int] @@ -119,7 +119,7 @@ class PartitionDiscoveryTest extends AnyFlatSpecLike with Matchers with MockitoS "prefix2/4.txt", ), ) - val options = PartitionSearcherOptions(1, true, 100.millis) + val options = PartitionSearcherOptions(1, true, 100.millis, ExcludeIndexes) val io = for { cancelledRef <- Ref[IO].of(false) readerRef <- Ref[IO].of(Option.empty[ResultReader]) @@ -174,7 +174,7 @@ class PartitionDiscoveryTest extends AnyFlatSpecLike with Matchers with MockitoS "prefix2/4.txt", ), ) - val options = PartitionSearcherOptions(1, true, 100.millis) + val options = PartitionSearcherOptions(1, true, 100.millis, ExcludeIndexes) val io = for { cancelledRef <- Ref[IO].of(false) readerRef <- Ref[IO].of(Option.empty[ResultReader]) @@ -241,7 +241,7 @@ class PartitionDiscoveryTest extends AnyFlatSpecLike with Matchers with MockitoS "prefix1/three/2.txt", ), ) - val options = PartitionSearcherOptions(1, true, 100.millis) + val options = PartitionSearcherOptions(1, true, 100.millis, ExcludeIndexes) val io = for { cancelledRef <- Ref[IO].of(false) readerRef <- Ref[IO].of(Option.empty[ResultReader]) @@ -296,7 +296,7 @@ class PartitionDiscoveryTest extends AnyFlatSpecLike with Matchers with MockitoS "prefix1/subprefix_untitled/3.txt", ), ) - val options = PartitionSearcherOptions(1, true, 100.millis) + val options = PartitionSearcherOptions(1, true, 100.millis, ExcludeIndexes) List(0 -> "prefix1/subprefix_abc/", 1 -> "prefix1/subprefix_untitled/", 2 -> "prefix1/subprefix_xyz01/").foreach { case (i, partition) => val taskId = ConnectorTaskId("sinkName", 3, i) diff --git a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryListerTest.scala b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryListerTest.scala index 217dea601..7d49b3a5c 100644 --- a/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryListerTest.scala +++ b/kafka-connect-aws-s3/src/test/scala/io/lenses/streamreactor/connect/aws/s3/storage/AwsS3DirectoryListerTest.scala @@ -40,11 +40,11 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { ), ) - check(S3Location("bucket", "prefix1/".some), Set.empty, Set("prefix1/"), s3Client) + check(S3Location("bucket", "prefix1/".some), Set.empty, Set.empty, Set("prefix1/"), s3Client) - check(S3Location("bucket", "prefix2/".some), Set.empty, Set("prefix2/"), s3Client) - check(S3Location("bucket", "prefix3/".some), Set.empty, Set.empty, s3Client) - check(S3Location("bucket", None), Set.empty, Set("prefix1/", "prefix2/"), s3Client) + check(S3Location("bucket", "prefix2/".some), Set.empty, Set.empty, Set("prefix2/"), s3Client) + check(S3Location("bucket", "prefix3/".some), Set.empty, Set.empty, Set.empty, s3Client) + check(S3Location("bucket", None), Set.empty, 
Set.empty, Set("prefix1/", "prefix2/"), s3Client) } "lister" should "list multiple pages" in { @@ -68,6 +68,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { S3Location("bucket", none), DirectoryFindCompletionConfig(1), Set.empty, + Set.empty, s3Client.listObjectsV2Paginator(_).iterator().asScala, connectorTaskId, ).unsafeRunSync() should be(DirectoryFindResults( @@ -95,6 +96,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { check( S3Location("bucket", none), Set("prefix1/", "prefix4/"), + Set.empty, Set("prefix2/", "prefix3/"), s3Client, 0, @@ -123,6 +125,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { check( S3Location("bucket", none), Set.empty, + Set.empty, Set("prefix2/", "prefix4/"), s3Client, 1, @@ -132,6 +135,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { check( S3Location("bucket", none), Set.empty, + Set.empty, Set("prefix1/", "prefix3/"), s3Client, 1, @@ -142,6 +146,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { S3Location("bucket", none), Set("prefix2/", "prefix4/"), Set.empty, + Set.empty, s3Client, 0, taskId1, @@ -151,6 +156,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { S3Location("bucket", none), Set("prefix1/", "prefix3/"), Set.empty, + Set.empty, s3Client, 0, taskId2, @@ -159,6 +165,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { check( S3Location("bucket", none), Set("prefix2/"), + Set.empty, Set("prefix4/"), s3Client, 1, @@ -168,6 +175,7 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { check( S3Location("bucket", none), Set("prefix1/"), + Set.empty, Set("prefix3/"), s3Client, 1, @@ -176,21 +184,45 @@ class AwsS3DirectoryListerTest extends AnyFlatSpecLike with Matchers { } private def check( - location: S3Location, - exclude: Set[String], - expected: Set[String], - s3Client: S3Client, - recursiveLevel: Int = 1, - connectorTaskId: ConnectorTaskId = connectorTaskId, + location: S3Location, + exclude: Set[String], + wildcardExcludes: Set[String], + expected: Set[String], + s3Client: S3Client, + recursiveLevel: Int = 1, + connectorTaskId: ConnectorTaskId = connectorTaskId, ): Unit = { val actual = AwsS3DirectoryLister.findDirectories( location, DirectoryFindCompletionConfig(recursiveLevel), exclude, + wildcardExcludes, s3Client.listObjectsV2Paginator(_).iterator().asScala, connectorTaskId, ).unsafeRunSync() actual should be(DirectoryFindResults(expected)) () } + + "lister" should "exclude indexes directory when configured as wildcard exclude" in { + + val s3Client: S3Client = new MockS3Client( + S3Page( + ".indexes/sinkName/myTopic/00005/00000000000000000050", + ".indexes/sinkName/myTopic/00005/00000000000000000070", + ".indexes/sinkName/myTopic/00005/00000000000000000100", + "prefix1/1.txt", + "prefix1/2.txt", + "prefix2/3.txt", + "prefix2/4.txt", + ), + ) + + check(S3Location("bucket", "prefix1/".some), Set.empty, Set(".indexes"), Set("prefix1/"), s3Client) + + check(S3Location("bucket", "prefix2/".some), Set.empty, Set(".indexes"), Set("prefix2/"), s3Client) + check(S3Location("bucket", "prefix3/".some), Set.empty, Set(".indexes"), Set.empty, s3Client) + check(S3Location("bucket", None), Set.empty, Set(".indexes"), Set("prefix1/", "prefix2/"), s3Client) + } + } diff --git a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlProperties.scala 
b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlProperties.scala index e1acca2a9..750687bec 100644 --- a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlProperties.scala +++ b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlProperties.scala @@ -18,29 +18,36 @@ package io.lenses.streamreactor.connect.config.kcqlprops import cats.implicits.catsSyntaxOptionId import enumeratum._ +import scala.reflect.ClassTag import scala.util.Try object KcqlProperties { - def fromStringMap[U <: EnumEntry, T <: Enum[U]]( + + def stringToString: String => String = identity[String] + + def stringToInt: String => Int = _.toInt + + def normaliseCase[U <: EnumEntry, T <: Enum[U]]( schema: KcqlPropsSchema[U, T], map: Map[String, String], ): KcqlProperties[U, T] = new KcqlProperties( schema, map.map { - case (k: String, v: String) => schema.keys.withNameInsensitive(k) -> v + case (k: String, v: String) => k.toLowerCase -> v }, ) - } case class KcqlProperties[U <: EnumEntry, T <: Enum[U]]( schema: KcqlPropsSchema[U, T], - map: Map[U, String], + map: Map[String, String], ) { + def containsKeyStartingWith(str: String): Boolean = map.keys.exists(k => k.startsWith(str)) + def getOptionalInt(key: U): Option[Int] = for { - value: String <- map.get(key) + value: String <- map.get(key.entryName) schema: PropsSchema <- schema.schema.get(key) _ <- schema match { case IntPropsSchema => value.some @@ -49,9 +56,15 @@ case class KcqlProperties[U <: EnumEntry, T <: Enum[U]]( i <- Try(value.toInt).toOption } yield i + def getOptionalChar(key: U): Option[Char] = + for { + value: Char <- map.get(key.entryName).filter(_.length == 1).flatMap(_.toCharArray.headOption) + _: PropsSchema <- schema.schema.get(key).filter(_ == CharPropsSchema) + } yield value + def getOptionalBoolean(key: U): Option[Boolean] = for { - value: String <- map.get(key) + value: String <- map.get(key.entryName) schema: PropsSchema <- schema.schema.get(key) _ <- schema match { case BooleanPropsSchema => value.some @@ -60,20 +73,39 @@ case class KcqlProperties[U <: EnumEntry, T <: Enum[U]]( b <- Try(value.toBoolean).toOption } yield b + def getOptionalSet[V](key: U)(implicit converter: String => V, ct: ClassTag[V]): Option[Set[V]] = + map.get(key.entryName) match { + case Some(value) if schema.schema.get(key).contains(SetPropsSchema()) => + val elements = value.split(',').map(converter).toSet + Some(elements) + case _ => None + } + + def getOptionalMap[K, V](keyPrefix: U, keyConverter: String => K, valueConverter: String => V): Option[Map[K, V]] = { + val mapKeyPrefix = keyPrefix.entryName + "." 
+ val applicableEntries = map.collect { + case (k, v) + if k.startsWith(mapKeyPrefix) && + schema.schema.get(keyPrefix).contains(MapPropsSchema()) => + keyConverter(k.replace(mapKeyPrefix, "")) -> valueConverter(v) + } + Option.when(applicableEntries.nonEmpty)(applicableEntries) + } + /* PKE - props enum (contains the prop keys) VE - target enum */ def getEnumValue[VU <: EnumEntry, VT <: Enum[VU]](e: VT, key: U): Option[VU] = for { - enumString: String <- map.get(key) + enumString: String <- map.get(key.entryName) enu <- e.withNameInsensitiveOption(enumString) //value <- schema.schema } yield enu def getString(key: U): Option[String] = for { - value: String <- map.get(key) + value: String <- map.get(key.entryName) schema: PropsSchema <- schema.schema.get(key) _ <- schema match { case StringPropsSchema => value.some diff --git a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropsSchema.scala b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropsSchema.scala index 455afcf74..30b13e80f 100644 --- a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropsSchema.scala +++ b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropsSchema.scala @@ -21,6 +21,7 @@ case class KcqlPropsSchema[U <: EnumEntry, T <: Enum[U]]( keys: T, schema: Map[U, PropsSchema], ) { - def readProps(props: Map[String, String]): KcqlProperties[U, T] = KcqlProperties.fromStringMap[U, T](this, props) + def readPropsMap(props: Map[String, String]): KcqlProperties[U, T] = KcqlProperties.normaliseCase[U, T](this, props) + def readProps(props: (String, String)*): KcqlProperties[U, T] = readPropsMap(props.toMap) } diff --git a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/PropsSchema.scala b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/PropsSchema.scala index ceb06f465..37e402e02 100644 --- a/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/PropsSchema.scala +++ b/kafka-connect-common/src/main/scala/io/lenses/streamreactor/connect/config/kcqlprops/PropsSchema.scala @@ -26,3 +26,9 @@ object StringPropsSchema extends PropsSchema object IntPropsSchema extends PropsSchema object BooleanPropsSchema extends PropsSchema + +object CharPropsSchema extends PropsSchema + +case class SetPropsSchema[T]() extends PropsSchema + +case class MapPropsSchema[K, V]() extends PropsSchema diff --git a/kafka-connect-common/src/test/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropertiesTest.scala b/kafka-connect-common/src/test/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropertiesTest.scala new file mode 100644 index 000000000..f589734b7 --- /dev/null +++ b/kafka-connect-common/src/test/scala/io/lenses/streamreactor/connect/config/kcqlprops/KcqlPropertiesTest.scala @@ -0,0 +1,95 @@ +/* + * Copyright 2017-2023 Lenses.io Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.lenses.streamreactor.connect.config.kcqlprops + +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers +import enumeratum.Enum +import enumeratum.EnumEntry +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties.stringToInt +import io.lenses.streamreactor.connect.config.kcqlprops.KcqlProperties.stringToString +import org.scalatest.OptionValues + +class KcqlPropertiesTest extends AnyFunSuite with Matchers with OptionValues { + + private sealed trait MyEnum extends EnumEntry + + private object MyEnum extends Enum[MyEnum] { + case object Key1_Happy extends MyEnum + + case object Key2_Mismatch extends MyEnum + + case object Key3_Missing extends MyEnum + case object Key4_Missing extends MyEnum + + case object Key5_Multi extends MyEnum + case object Key6_Empty_Map extends MyEnum + case object Key7_Map extends MyEnum + + val values: IndexedSeq[MyEnum] = findValues + } + + private val sampleMap: Map[String, String] = Map( + MyEnum.Key1_Happy.entryName -> "0", + MyEnum.Key2_Mismatch.entryName -> "1", + MyEnum.Key3_Missing.entryName -> "", + MyEnum.Key5_Multi.entryName -> "AB", + MyEnum.Key7_Map.entryName + ".offset" -> "1", + MyEnum.Key7_Map.entryName + ".partition" -> "2", + ) + + private val kcqlPropsSchema: KcqlPropsSchema[MyEnum, MyEnum.type] = KcqlPropsSchema( + MyEnum, + Map[MyEnum, PropsSchema]( + MyEnum.Key1_Happy -> CharPropsSchema, + MyEnum.Key2_Mismatch -> IntPropsSchema, + MyEnum.Key3_Missing -> CharPropsSchema, + MyEnum.Key4_Missing -> CharPropsSchema, + MyEnum.Key5_Multi -> CharPropsSchema, + MyEnum.Key6_Empty_Map -> MapPropsSchema[String, Int](), + MyEnum.Key7_Map -> MapPropsSchema[String, Int](), + ), + ) + private val kcqlProps = KcqlProperties(kcqlPropsSchema, sampleMap) + + test("getOptionalChar should return Some(Char) when schema matches") { + kcqlProps.getOptionalChar(MyEnum.Key1_Happy) shouldEqual Some('0') + } + + test("getOptionalChar should return None when schema doesn't match") { + kcqlProps.getOptionalChar(MyEnum.Key2_Mismatch) shouldEqual None + } + + test("getOptionalChar should return None when key is not found in the map") { + kcqlProps.getOptionalChar(MyEnum.Key3_Missing) shouldEqual None + kcqlProps.getOptionalChar(MyEnum.Key4_Missing) shouldEqual None + } + + test("getOptionalChar should return empty when >1 chars in String") { + kcqlProps.getOptionalChar(MyEnum.Key5_Multi) shouldEqual None + } + + test("getOptionalMap should return empty when no properties specified") { + kcqlProps.getOptionalMap(MyEnum.Key6_Empty_Map, stringToString, stringToInt) shouldEqual None + } + + test("getOptionalMap should return values when multiple properties specified") { + kcqlProps.getOptionalMap(MyEnum.Key7_Map, stringToString, stringToInt).value shouldEqual Map[String, Int]( + "offset" -> 1, + "partition" -> 2, + ) + } +}