
Commit

Merge branch 'develop' into feature/#2175-Add-ability-to-configure-how-Spark-handles-dates-in-parquet-files
TebaleloS authored Mar 27, 2023
2 parents 8df173e + cd7f266 commit dbba2b9
Showing 28 changed files with 65 additions and 83 deletions.
2 changes: 1 addition & 1 deletion dao/pom.xml
@@ -22,7 +22,7 @@
   <parent>
     <groupId>za.co.absa.enceladus</groupId>
     <artifactId>parent</artifactId>
-    <version>3.0.0-SNAPSHOT</version>
+    <version>3.1.0-SNAPSHOT</version>
   </parent>

   <properties>
@@ -83,7 +83,7 @@ protected class RestClient(authClient: AuthClient,

   val statusCode = response.getStatusCode

-  /**
+  /*
   * This function handles unauthorized response by trying to authenticate
   * (if there are still some retries attempt left) - this might be due to an expired session.
   *
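
A note on the one-character change above: "/**" opens a Scaladoc comment, which documentation tooling attaches to the next definition, whereas "/*" is a plain block comment. Because this comment sits inside a method body rather than above a definition, the plain form is the appropriate one. A minimal illustrative sketch (the names are hypothetical, not from this file):

    /** Scaladoc: picked up by the scaladoc tool for the definition below. */
    def handleUnauthorized(retriesLeft: Int): Unit = {
      /* Plain block comment: free-form explanation inside a method body. */
      if (retriesLeft > 0) println(s"retrying, attempts left: $retriesLeft")
    }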
2 changes: 1 addition & 1 deletion data-model/pom.xml
@@ -24,7 +24,7 @@
   <parent>
     <groupId>za.co.absa.enceladus</groupId>
     <artifactId>parent</artifactId>
-    <version>3.0.0-SNAPSHOT</version>
+    <version>3.1.0-SNAPSHOT</version>
   </parent>

   <properties>
@@ -84,10 +84,8 @@ case class Dataset(
   @(AosSchema@field)(implementation = classOf[OozieSchedule])
   @BeanProperty schedule: Option[OozieSchedule] = None, //To be used for backward versioning compatibility

-  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]], example = "{" +
-    "\"field1\": \"value1\"," +
-    "\"field2\": \"value2\"" +
-    "}")
+  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]],
+    example = "{\"field1\": \"value1\", \"field2\": \"value2\"}")
   @BeanProperty properties: Option[Map[String, String]] = Some(Map.empty),

   @(AosSchema@field)(implementation = classOf[Validation])
@@ -78,12 +78,8 @@ case class MappingTable(
   @(AosSchema@field)(implementation = classOf[Reference])
   @BeanProperty parent: Option[Reference] = None,

-  @(AosSchema@field)(implementation = classOf[DataFrameFilter], example = "{" +
-    "\"_t\": \"EqualsFilter\"," +
-    "\"columnName\": \"exampleColumn1\"," +
-    "\"value\":\"wantedValue1\"," +
-    "\"valueType\": \"string\"" +
-    "}"
+  @(AosSchema@field)(implementation = classOf[DataFrameFilter],
+    example = "{\"_t\": \"EqualsFilter\", \"columnName\": \"exampleColumn1\", \"value\":\"wantedValue1\", \"valueType\": \"string\"}"
   )
   @BeanProperty filter: Option[DataFrameFilter] = None
 ) extends VersionedModel with Auditable[MappingTable] {
@@ -21,11 +21,7 @@ import scala.annotation.meta.field
 import scala.beans.BeanProperty

 @AosSchema(implementation = classOf[Reference],
-  example = "{" +
-    "\"collection\": \"dataset\"," +
-    "\"name\": \"Test\"," +
-    "\"version\": 4" +
-    "}"
+  example = "{\"collection\": \"dataset\", \"name\": \"Test\", \"version\": 4}"
 )
 case class Reference(
   @(AosSchema@field)(implementation = classOf[String], example = "collection1")
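
The model changes above (Dataset, MappingTable, Reference) and the conformance rules below all make the same refactor: OpenAPI "example" strings assembled by string concatenation are collapsed into single-line JSON literals, which render identically in the generated spec but are easier to read. A minimal sketch of the annotation pattern, assuming AosSchema is an import alias for Swagger's Schema annotation (the alias target is an assumption, not shown in this diff):

    import io.swagger.v3.oas.annotations.media.{Schema => AosSchema}
    import scala.annotation.meta.field
    import scala.beans.BeanProperty

    // Hypothetical model, for illustration only. The @field meta-annotation
    // moves the Schema annotation onto the underlying field so Swagger sees it.
    case class Sample(
      @(AosSchema@field)(implementation = classOf[String], example = "{\"field1\": \"value1\"}")
      @BeanProperty field1: String
    )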
@@ -116,27 +116,19 @@ package object conformanceRule {
   @(AosSchema@field)(example = "3")
   @BeanProperty mappingTableVersion: Int,

-  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]], example = "{" +
-    "\"field1\": \"mappedField1\"" +
-    "}")
+  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]], example = "{\"field1\": \"mappedField1\"}")
   @BeanProperty attributeMappings: Map[String, String], // key = mapping table column, value = input df column
   @(AosSchema@field)(example = "CCC")
   @BeanProperty targetAttribute: String,
   @(AosSchema@field)(example = "ConformedCurrencyX")
   @BeanProperty outputColumn: String,

-  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]], example = "{" +
-    "\"newCol\": \"mappedCol1\"" +
-    "}")
+  @(AosSchema@field)(implementation = classOf[java.util.Map[String, String]], example = "{\"newCol\": \"mappedCol1\"}")
   @BeanProperty additionalColumns: Option[Map[String, String]] = None,
   @BeanProperty isNullSafe: Boolean = false,

-  @(AosSchema@field)(implementation = classOf[DataFrameFilter], example = "{" +
-    "\"_t\": \"EqualsFilter\"," +
-    "\"columnName\": \"column1\"," +
-    "\"value\": \"soughtAfterValue\"," +
-    "\"valueType\": \"string\"" +
-    "}")
+  @(AosSchema@field)(implementation = classOf[DataFrameFilter],
+    example = "{\"_t\": \"EqualsFilter\", \"columnName\": \"column1\", \"value\": \"soughtAfterValue\", \"valueType\": \"string\"}")
   @BeanProperty mappingTableFilter: Option[DataFrameFilter] = None,
   overrideMappingTableOwnFilter: Option[Boolean] = Some(DefaultOverrideMappingTableOwnFilter)
 ) extends ConformanceRule {
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/castingConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-castingConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/castingConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/coalesceConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-coalesceConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/coalesceConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/concatenationConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-concatenationConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/concatenationConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/literalConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-literalConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/literalConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/mappingConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-mappingConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/mappingConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/negationConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-negationConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/negationConformanceRule/publish",
@@ -2,9 +2,9 @@
 {
   "vars": {
     "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
-    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Denceladus.recordId.generation.strategy=stableHashId'",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/singleColumnConformanceRule/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-singleColumnConformanceRule-1-2020-03-23-1",
     "ref-publish-data-path": "/ref/singleColumnConformanceRule/publish",
10 changes: 5 additions & 5 deletions examples/data/e2e_tests/test_jsons/stdNfDnTest.json
@@ -1,10 +1,10 @@

 {
   "vars": {
-    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
+    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client --conf spark.sql.parquet.datetimeRebaseModeInRead=LEGACY --conf spark.sql.parquet.datetimeRebaseModeInWrite=LEGACY",
     "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/std_nf_dn/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-std_nf_dn-1-2019-11-27-1",
     "results-log-path": "/std/std_nf_dn/results",
@@ -15,7 +15,7 @@
       "pluginName" : "BashPlugin",
       "name": "Standardization",
       "order" : 0,
-      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format csv --header true "],
+      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format json"],
       "writeArgs": []
     },
     {
@@ -30,7 +30,7 @@
       "pluginName" : "DatasetComparison",
       "name": "DatasetComparison",
       "order" : 1,
-      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "name" ],
+      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "property"],
       "writeArgs": ["--out-path", "#{results-log-path}#/stdDataDiff"],
       "dependsOn": "Standardization"
     }
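
The new "--conf spark.sql.parquet.datetimeRebaseModeInRead=LEGACY --conf spark.sql.parquet.datetimeRebaseModeInWrite=LEGACY" flags in the spark-submit line above are the core of feature #2175: Spark 3.x switched from the hybrid Julian+Gregorian calendar to the Proleptic Gregorian calendar, and LEGACY mode rebases dates and timestamps between the two calendars when reading and writing Parquet, keeping old dates compatible with data written by Spark 2.x. A sketch of setting the same options programmatically (the app name is illustrative; the path is reused from the test vars above):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .appName("legacy-rebase-sketch")
      // Rebase Parquet dates/timestamps to/from the legacy hybrid calendar.
      .config("spark.sql.parquet.datetimeRebaseModeInRead", "LEGACY")
      .config("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")
      .getOrCreate()

    spark.read.parquet("/ref/std_nf_dn/std").show()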
10 changes: 5 additions & 5 deletions examples/data/e2e_tests/test_jsons/stdNfDyTest.json
@@ -1,10 +1,10 @@

 {
   "vars": {
-    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
+    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client --conf spark.sql.parquet.datetimeRebaseModeInRead=LEGACY --conf spark.sql.parquet.datetimeRebaseModeInWrite=LEGACY",
     "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/std_nf_dy/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-std_nf_dy-1-2019-11-27-1",
     "results-log-path": "/std/std_nf_dy/results",
@@ -15,7 +15,7 @@
       "pluginName" : "BashPlugin",
       "name": "Standardization",
       "order" : 0,
-      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format csv --header true "],
+      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format json"],
       "writeArgs": []
     },
     {
@@ -30,7 +30,7 @@
       "pluginName" : "DatasetComparison",
       "name": "DatasetComparison",
       "order" : 1,
-      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "name" ],
+      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "property"],
       "writeArgs": ["--out-path", "#{results-log-path}#/stdDataDiff"],
       "dependsOn": "Standardization"
     }
10 changes: 5 additions & 5 deletions examples/data/e2e_tests/test_jsons/stdNtDnTest.json
@@ -1,10 +1,10 @@

 {
   "vars": {
-    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client",
+    "spark-submit": "spark-submit --num-executors 2 --executor-memory 2G --deploy-mode client --conf spark.sql.parquet.datetimeRebaseModeInRead=LEGACY --conf spark.sql.parquet.datetimeRebaseModeInWrite=LEGACY",
     "spark-conf": "--conf 'spark.driver.extraJavaOptions=-Denceladus.rest.uri=http://localhost:8080/rest_api/api -Dspline.mongodb.name=spline -Dspline.mongodb.url=mongodb://127.0.0.1:27017/ -Denceladus.recordId.generation.strategy=stableHashId'",
-    "enceladus-job-jar": "spark-jobs/target/spark-jobs-2.19.0-SNAPSHOT.jar",
-    "credentials": "--rest-api-credentials-file ~/.ssh/menasCredential.properties",
+    "enceladus-job-jar": "spark-jobs/target/spark-jobs-3.0.0-SNAPSHOT.jar",
+    "credentials": "--rest-api-credentials-file ~/.ssh/menas-credential.properties",
     "ref-std-data-path": "/ref/std_nt_dn/std",
     "new-std-data-path": "/tmp/conformance-output/standardized-std_nt_dn-1-2019-11-27-1",
     "results-log-path": "/std/std_nt_dn/results",
@@ -15,7 +15,7 @@
       "pluginName" : "BashPlugin",
       "name": "Standardization",
       "order" : 0,
-      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format csv --header true "],
+      "args" : ["#{spark-submit}# #{spark-conf}# --class za.co.absa.enceladus.standardization.StandardizationJob #{enceladus-job-jar}# #{credentials}# #{dataset}# --raw-format json"],
       "writeArgs": []
     },
     {
@@ -30,7 +30,7 @@
       "pluginName" : "DatasetComparison",
      "name": "DatasetComparison",
       "order" : 1,
-      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "name" ],
+      "args" : ["--format", "parquet", "--new-path", "#{new-std-data-path}#", "--ref-path", "#{ref-std-data-path}#", "--keys", "property"],
       "writeArgs": ["--out-path", "#{results-log-path}#/stdDataDiff"],
       "dependsOn": "Standardization"
     }
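
Throughout these test definitions, the "args" strings reference "vars" entries via #{name}# placeholders. A minimal sketch of how such a substitution step could work (an assumption about the test harness, not its actual implementation):

    // Expand "#{name}#" placeholders in a template using the vars map.
    def expandVars(template: String, vars: Map[String, String]): String =
      vars.foldLeft(template) { case (acc, (key, value)) =>
        acc.replace(s"#{$key}#", value)
      }

    val vars = Map(
      "credentials" -> "--rest-api-credentials-file ~/.ssh/menas-credential.properties"
    )
    expandVars("spark-submit #{credentials}#", vars)
    // => "spark-submit --rest-api-credentials-file ~/.ssh/menas-credential.properties"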
