From e6ac79839685a7445d5423ccff63d9b56334a402 Mon Sep 17 00:00:00 2001 From: spenes Date: Wed, 13 Nov 2024 15:08:35 +0300 Subject: [PATCH] Add slash at the end of the load path (#1366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shredder puts together entities with the same schema model-revision-addition in the same batch under same folder. Let’s say you have events with `1-0-0`, `1-0-1` and `1-0-2` version of the `com.acme.test` in the same batch. In that case, resulting run folder will have following subfolders: ``` output=good/vendor=com.acme/name=test/format=tsv/model=1/revision=0/addition=0 output=good/vendor=com.acme/name=test/format=tsv/model=1/revision=0/addition=1 output=good/vendor=com.acme/name=test/format=tsv/model=1/revision=0/addition=2 ``` Before the fix, Loader was using the s3 paths without slash (/) at the end in the created copy statements. This works fine in most cases. However, when same batch contains events with `1-0-1` and `1-0-11`, then problem starts. In that case, run folder will have following subfolders: ``` output=good/vendor=com.acme/name=test/format=tsv/model=1/revision=0/addition=1 output=good/vendor=com.acme/name=test/format=tsv/model=1/revision=0/addition=11 ``` When entities in the `/model=1/revision=0/addition=1` are tried to be copied to respective table with copy statement, Redshift tries to copy the entities under `/model=1/revision=0/addition=11` as well since they have same prefix and it gives error during the copy since data under `/model=1/revision=0/addition=11` doesn’t have same structure with `1-0-1`. Putting slash at the end of the path solved the problem. After that change, only entities under `model=1/revision=0/addition=1` are copied as expected. --- .../snowplow/rdbloader/discovery/ShreddedType.scala | 4 ++-- .../snowplow/loader/redshift/RedshiftSpec.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala index 733022e65..b3839bc41 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/rdbloader/discovery/ShreddedType.scala @@ -64,7 +64,7 @@ object ShreddedType { */ final case class Json(info: Info, jsonPaths: BlobStorage.Key) extends ShreddedType { def getLoadPath: String = - s"${info.base}${Common.GoodPrefix}/vendor=${info.vendor}/name=${info.name}/format=json/model=${info.version.model}/revision=${info.version.revision}/addition=${info.version.addition}" + s"${info.base}${Common.GoodPrefix}/vendor=${info.vendor}/name=${info.name}/format=json/model=${info.version.model}/revision=${info.version.revision}/addition=${info.version.addition}/" def show: String = s"${info.toCriterion.asString} ($jsonPaths)" } @@ -78,7 +78,7 @@ object ShreddedType { */ final case class Tabular(info: Info) extends ShreddedType { def getLoadPath: String = - s"${info.base}${Common.GoodPrefix}/vendor=${info.vendor}/name=${info.name}/format=tsv/model=${info.version.model}/revision=${info.version.revision}/addition=${info.version.addition}" + s"${info.base}${Common.GoodPrefix}/vendor=${info.vendor}/name=${info.name}/format=tsv/model=${info.version.model}/revision=${info.version.revision}/addition=${info.version.addition}/" def show: String = s"${info.toCriterion.asString} TSV" } diff --git a/modules/redshift-loader/src/test/scala/com/snowplowanalytics/snowplow/loader/redshift/RedshiftSpec.scala b/modules/redshift-loader/src/test/scala/com/snowplowanalytics/snowplow/loader/redshift/RedshiftSpec.scala index 6b37db250..eb1094610 100644 --- a/modules/redshift-loader/src/test/scala/com/snowplowanalytics/snowplow/loader/redshift/RedshiftSpec.scala +++ b/modules/redshift-loader/src/test/scala/com/snowplowanalytics/snowplow/loader/redshift/RedshiftSpec.scala @@ -144,8 +144,8 @@ class RedshiftSpec extends Specification { result.toList must containTheSameElementsAs( List( "COPY events FROM s3://my-bucket/my-path/", // atomic - "COPY com_acme_event_2 FROM s3://my-bucket/my-path/output=good/vendor=com.acme/name=event/format=tsv/model=2/revision=0/addition=0", - "COPY com_acme_event_3 FROM s3://my-bucket/my-path/output=good/vendor=com.acme/name=event/format=tsv/model=3/revision=0/addition=0" + "COPY com_acme_event_2 FROM s3://my-bucket/my-path/output=good/vendor=com.acme/name=event/format=tsv/model=2/revision=0/addition=0/", + "COPY com_acme_event_3 FROM s3://my-bucket/my-path/output=good/vendor=com.acme/name=event/format=tsv/model=3/revision=0/addition=0/" ) ) }