From c6e75b0e7eb600b2fa4f08bed2f7482944a7019c Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Thu, 17 Oct 2024 17:23:36 +0800 Subject: [PATCH 1/2] Split up UI mapper into separate files for clarity --- .../delete/DeleteRecordProcessor.scala | 6 +- .../core/ui/mapper/ConfigurationMapper.scala | 142 ++++ .../core/ui/mapper/ConnectionMapper.scala | 108 +++ .../core/ui/mapper/CountMapper.scala | 34 + .../core/ui/mapper/FieldMapper.scala | 82 ++ .../core/ui/mapper/ForeignKeyMapper.scala | 49 ++ .../datacaterer/core/ui/mapper/UiMapper.scala | 565 +------------- .../core/ui/mapper/ValidationMapper.scala | 207 +++++ .../ui/mapper/ConfigurationMapperTest.scala | 141 ++++ .../core/ui/mapper/ConnectionMapperTest.scala | 179 +++++ .../core/ui/mapper/CountMapperTest.scala | 82 ++ .../core/ui/mapper/FieldMapperTest.scala | 57 ++ .../core/ui/mapper/ForeignKeyMapperTest.scala | 29 + .../core/ui/mapper/UiMapperTest.scala | 730 +----------------- .../core/ui/mapper/ValidationMapperTest.scala | 324 ++++++++ gradle.properties | 2 +- 16 files changed, 1457 insertions(+), 1280 deletions(-) create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapper.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapper.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapper.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapper.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapper.scala create mode 100644 app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapper.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapperTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapperTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapperTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapperTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapperTest.scala create mode 100644 app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapperTest.scala diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/delete/DeleteRecordProcessor.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/delete/DeleteRecordProcessor.scala index f3ee0c1..908eac7 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/delete/DeleteRecordProcessor.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/delete/DeleteRecordProcessor.scala @@ -81,7 +81,7 @@ class DeleteRecordProcessor(connectionConfigsByName: Map[String, Map[String, Str }) } - def deleteRecords(dataSourceName: String, plan: Plan, step: Step, stepsByName: Map[String, Step] = Map(), + private def deleteRecords(dataSourceName: String, plan: Plan, step: Step, stepsByName: Map[String, Step] = Map(), optSourceForeignKey: Option[String] = None, optFullForeignKey: Option[(ForeignKeyRelation, String)] = None): Unit = { val format = step.options(FORMAT) val subDataSourcePath = getSubDataSourcePath(dataSourceName, plan.name, step, recordTrackingFolderPath) @@ -147,13 +147,13 @@ class 
DeleteRecordProcessor(connectionConfigsByName: Map[String, Map[String, Str } } - def getTrackedRecords(dataSourcePath: String): DataFrame = { + private def getTrackedRecords(dataSourcePath: String): DataFrame = { sparkSession.read.format(RECORD_TRACKING_VALIDATION_FORMAT) .option(PATH, dataSourcePath) .load() } - def deleteTrackedRecordsFile(dataSourcePath: String): Unit = { + private def deleteTrackedRecordsFile(dataSourcePath: String): Unit = { new Directory(new File(dataSourcePath)).deleteRecursively() } diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapper.scala new file mode 100644 index 0000000..29901c5 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapper.scala @@ -0,0 +1,142 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.DataCatererConfigurationBuilder +import io.github.datacatering.datacaterer.api.connection.ConnectionTaskBuilder +import io.github.datacatering.datacaterer.api.model.Constants._ +import io.github.datacatering.datacaterer.core.ui.model.ConfigurationRequest +import org.apache.log4j.Logger + +object ConfigurationMapper { + + private val LOGGER = Logger.getLogger(getClass.getName) + + def configurationMapping( + configurationRequest: ConfigurationRequest, + installDirectory: String, + connections: List[ConnectionTaskBuilder[_]] + ): DataCatererConfigurationBuilder = { + val isConnectionContainsMetadataSource = connections.exists(conn => conn.connectionConfigWithTaskBuilder.options.contains(METADATA_SOURCE_TYPE)) + val configUpdatedFromConnections = if (isConnectionContainsMetadataSource) { + configurationRequest.copy(flag = configurationRequest.flag ++ Map(CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS -> isConnectionContainsMetadataSource.toString)) + } else configurationRequest + + val baseConfig = DataCatererConfigurationBuilder() + val withFlagConf = mapFlagsConfiguration(configUpdatedFromConnections, baseConfig) + val withFolderConf = mapFolderConfiguration(configUpdatedFromConnections, installDirectory, withFlagConf) + val withMetadataConf = mapMetadataConfiguration(configUpdatedFromConnections, withFolderConf) + val withGenerationConf = mapGenerationConfiguration(configUpdatedFromConnections, withMetadataConf) + val withValidationConf = mapValidationConfiguration(configUpdatedFromConnections, withGenerationConf) + val withAlertConf = mapAlertConfiguration(configUpdatedFromConnections, withValidationConf) + + withAlertConf + } + + def mapFlagsConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { + configurationRequest.flag.foldLeft(baseConfig)((conf, c) => { + val boolVal = c._2.toBoolean + c._1 match { + case CONFIG_FLAGS_COUNT => conf.enableCount(boolVal) + case CONFIG_FLAGS_GENERATE_DATA => conf.enableGenerateData(boolVal) + case CONFIG_FLAGS_RECORD_TRACKING => conf.enableRecordTracking(boolVal) + case CONFIG_FLAGS_DELETE_GENERATED_RECORDS => conf.enableDeleteGeneratedRecords(boolVal) + case CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS => conf.enableGeneratePlanAndTasks(boolVal) + case CONFIG_FLAGS_FAIL_ON_ERROR => conf.enableFailOnError(boolVal) + case CONFIG_FLAGS_UNIQUE_CHECK => conf.enableUniqueCheck(boolVal) + case CONFIG_FLAGS_SINK_METADATA => conf.enableSinkMetadata(boolVal) + case CONFIG_FLAGS_SAVE_REPORTS => 
conf.enableSaveReports(boolVal)
+        case CONFIG_FLAGS_VALIDATION => conf.enableValidation(boolVal)
+        case CONFIG_FLAGS_GENERATE_VALIDATIONS => conf.enableGenerateValidations(boolVal)
+        case CONFIG_FLAGS_ALERTS => conf.enableAlerts(boolVal)
+        case _ =>
+          LOGGER.warn(s"Unexpected flags configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+
+  def mapAlertConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = {
+    configurationRequest.alert.foldLeft(baseConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_ALERT_TRIGGER_ON => conf.alertTriggerOn(c._2)
+        case CONFIG_ALERT_SLACK_TOKEN => conf.slackAlertToken(c._2)
+        case CONFIG_ALERT_SLACK_CHANNELS => conf.slackAlertChannels(c._2.split(",").map(_.trim): _*)
+        case _ =>
+          LOGGER.warn(s"Unexpected alert configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+
+  def mapValidationConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = {
+    configurationRequest.validation.foldLeft(baseConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_VALIDATION_NUM_SAMPLE_ERROR_RECORDS => conf.numErrorSampleRecords(c._2.toInt)
+        case CONFIG_VALIDATION_ENABLE_DELETE_RECORD_TRACKING_FILES => conf.enableDeleteRecordTrackingFiles(c._2.toBoolean)
+        case _ =>
+          LOGGER.warn(s"Unexpected validation configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+
+  def mapGenerationConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = {
+    configurationRequest.generation.foldLeft(baseConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_GENERATION_NUM_RECORDS_PER_BATCH => conf.numRecordsPerBatch(c._2.toLong)
+        case CONFIG_GENERATION_NUM_RECORDS_PER_STEP =>
+          val parsedNum = c._2.toLong
+          if (parsedNum != -1) conf.numRecordsPerStep(parsedNum) else conf
+        case _ =>
+          LOGGER.warn(s"Unexpected generation configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+
+  def mapMetadataConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = {
+    configurationRequest.metadata.foldLeft(baseConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_METADATA_NUM_RECORDS_FROM_DATA_SOURCE => conf.numRecordsFromDataSourceForDataProfiling(c._2.toInt)
+        case CONFIG_METADATA_NUM_RECORDS_FOR_ANALYSIS => conf.numRecordsForAnalysisForDataProfiling(c._2.toInt)
+        case CONFIG_METADATA_ONE_OF_DISTINCT_COUNT_VS_COUNT_THRESHOLD => conf.oneOfDistinctCountVsCountThreshold(c._2.toDouble)
+        case CONFIG_METADATA_ONE_OF_MIN_COUNT => conf.oneOfMinCount(c._2.toLong)
+        case CONFIG_METADATA_NUM_GENERATED_SAMPLES => conf.numGeneratedSamples(c._2.toInt)
+        case _ =>
+          LOGGER.warn(s"Unexpected metadata configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+
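+  // Illustrative sketch of the folder mapping below (hypothetical values, not from the request schema):
+  //   folder = Map(CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH -> "/opt/report", CONFIG_FOLDER_TASK_FOLDER_PATH -> "")
+  // keeps the explicit "/opt/report" path from the first fold and, since the task folder entry is empty,
+  // falls back to s"$installDirectory/task" in the second fold over the empty-valued entries.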
+  def mapFolderConfiguration(configurationRequest: ConfigurationRequest, installDirectory: String, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = {
+    val nonEmptyFolderConfig = configurationRequest.folder.filter(_._2.nonEmpty).foldLeft(baseConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_FOLDER_PLAN_FILE_PATH => conf.planFilePath(c._2)
+        case CONFIG_FOLDER_TASK_FOLDER_PATH => conf.taskFolderPath(c._2)
+        case CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH => conf.generatedPlanAndTaskFolderPath(c._2)
+        case CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH => conf.generatedReportsFolderPath(c._2)
+        case CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH => conf.recordTrackingFolderPath(c._2)
+        case CONFIG_FOLDER_VALIDATION_FOLDER_PATH => conf.validationFolderPath(c._2)
+        case CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH => conf.recordTrackingForValidationFolderPath(c._2)
+        case _ =>
+          LOGGER.warn(s"Unexpected folder configuration key, key=${c._1}")
+          conf
+      }
+    })
+    // set the base directory to the install directory for any folder that has not been overridden
+    configurationRequest.folder.filter(_._2.isEmpty).foldLeft(nonEmptyFolderConfig)((conf, c) => {
+      c._1 match {
+        case CONFIG_FOLDER_PLAN_FILE_PATH => conf
+        case CONFIG_FOLDER_TASK_FOLDER_PATH => conf.taskFolderPath(s"$installDirectory/task")
+        case CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH => conf.generatedPlanAndTaskFolderPath(s"$installDirectory/generated-plan-task")
+        case CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH => conf.generatedReportsFolderPath(s"$installDirectory/report")
+        case CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH => conf.recordTrackingFolderPath(s"$installDirectory/record-tracking")
+        case CONFIG_FOLDER_VALIDATION_FOLDER_PATH => conf.validationFolderPath(s"$installDirectory/validation")
+        case CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH => conf.recordTrackingForValidationFolderPath(s"$installDirectory/record-tracking-validation")
+        case _ =>
+          LOGGER.warn(s"Unexpected folder configuration key, key=${c._1}")
+          conf
+      }
+    })
+  }
+}
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapper.scala
new file mode 100644
index 0000000..c1da662
--- /dev/null
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapper.scala
@@ -0,0 +1,108 @@
+package io.github.datacatering.datacaterer.core.ui.mapper
+
+import io.github.datacatering.datacaterer.api.ConnectionConfigWithTaskBuilder
+import io.github.datacatering.datacaterer.api.connection.{ConnectionTaskBuilder, FileBuilder, JdbcBuilder}
+import io.github.datacatering.datacaterer.api.model.Constants.{CASSANDRA_KEYSPACE, CASSANDRA_NAME, CASSANDRA_TABLE, CSV, DELTA, HTTP, ICEBERG, ICEBERG_CATALOG_GLUE, ICEBERG_CATALOG_HADOOP, ICEBERG_CATALOG_HIVE, ICEBERG_CATALOG_REST, ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_URI, JMS_CONNECTION_FACTORY, JMS_DESTINATION_NAME, JMS_INITIAL_CONTEXT_FACTORY, JMS_VPN_NAME, JSON, KAFKA, KAFKA_TOPIC, MYSQL, ORC, PARQUET, PASSWORD, PATH, POSTGRES, SCHEMA, SOLACE, SPARK_ICEBERG_CATALOG_TYPE, SPARK_ICEBERG_CATALOG_URI, SPARK_ICEBERG_CATALOG_WAREHOUSE, TABLE, URL, USERNAME}
+import io.github.datacatering.datacaterer.core.ui.mapper.UiMapper.checkOptions
+import io.github.datacatering.datacaterer.core.ui.model.DataSourceRequest
+
+object ConnectionMapper {
+
+  def connectionMapping(dataSourceRequest: DataSourceRequest): ConnectionTaskBuilder[_] = {
+    dataSourceRequest.`type` match {
+      case Some(CASSANDRA_NAME) => createCassandraConnection(dataSourceRequest)
+      case Some(POSTGRES) => createJdbcConnection(dataSourceRequest, POSTGRES)
+      case Some(MYSQL) => createJdbcConnection(dataSourceRequest, MYSQL)
+      case Some(CSV) => createFileConnection(dataSourceRequest, CSV)
+      case Some(JSON) => createFileConnection(dataSourceRequest, JSON)
+      case Some(PARQUET) => createFileConnection(dataSourceRequest, PARQUET)
+      case Some(ORC) => createFileConnection(dataSourceRequest, ORC)
+      case Some(DELTA) => createFileConnection(dataSourceRequest, DELTA)
+      case Some(ICEBERG) =>
createIcebergConnection(dataSourceRequest) + case Some(SOLACE) => + val opt = dataSourceRequest.options.getOrElse(Map()) + checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD, JMS_DESTINATION_NAME, JMS_VPN_NAME, JMS_CONNECTION_FACTORY, JMS_INITIAL_CONTEXT_FACTORY), opt) + ConnectionConfigWithTaskBuilder().solace(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), + opt(JMS_VPN_NAME), opt(JMS_CONNECTION_FACTORY), opt(JMS_INITIAL_CONTEXT_FACTORY), opt) + case Some(KAFKA) => + val opt = dataSourceRequest.options.getOrElse(Map()) + checkOptions(dataSourceRequest.name, List(URL, KAFKA_TOPIC), opt) + ConnectionConfigWithTaskBuilder().kafka(dataSourceRequest.name, opt(URL), opt) + case Some(HTTP) => + val opt = dataSourceRequest.options.getOrElse(Map()) + ConnectionConfigWithTaskBuilder().http(dataSourceRequest.name, opt.getOrElse(USERNAME, ""), opt.getOrElse(PASSWORD, ""), opt) + case Some(x) => + throw new IllegalArgumentException(s"Unsupported data source from UI, data-source-type=$x") + case _ => + throw new IllegalArgumentException(s"No data source type defined, unable to create connections, " + + s"data-source-name=${dataSourceRequest.name}, task-name=${dataSourceRequest.taskName}") + } + } + + private def createFileConnection(dataSourceRequest: DataSourceRequest, format: String): FileBuilder = { + val opt = dataSourceRequest.options.getOrElse(Map()) + checkOptions(dataSourceRequest.name, List(PATH), opt) + ConnectionConfigWithTaskBuilder().file(dataSourceRequest.name, format, opt(PATH), opt) + } + + private def createIcebergConnection(dataSourceRequest: DataSourceRequest): FileBuilder = { + val opt = dataSourceRequest.options.getOrElse(Map()) + val name = dataSourceRequest.name + checkOptions(name, List(ICEBERG_CATALOG_TYPE, TABLE), opt) + val baseSparkOpts = Map( + SPARK_ICEBERG_CATALOG_TYPE -> opt(ICEBERG_CATALOG_TYPE), + TABLE -> opt(TABLE) + ) + val sparkOpts = opt(ICEBERG_CATALOG_TYPE) match { + case ICEBERG_CATALOG_HADOOP | ICEBERG_CATALOG_GLUE => + checkOptions(name, List(PATH), opt) + Map(SPARK_ICEBERG_CATALOG_WAREHOUSE -> opt(PATH)) + case ICEBERG_CATALOG_HIVE | ICEBERG_CATALOG_REST => + checkOptions(name, List(ICEBERG_CATALOG_URI), opt) + Map(SPARK_ICEBERG_CATALOG_URI -> opt(ICEBERG_CATALOG_URI)) + case _ => Map() + } + ConnectionConfigWithTaskBuilder().file(name, ICEBERG, opt.getOrElse(PATH, ""), baseSparkOpts ++ sparkOpts) + } + + private def createJdbcConnection(dataSourceRequest: DataSourceRequest, format: String): JdbcBuilder[_] = { + val opt = dataSourceRequest.options.getOrElse(Map()) + checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD), opt) + val connectionConfigWithTaskBuilder = ConnectionConfigWithTaskBuilder() + + val baseConnection = format match { + case POSTGRES => connectionConfigWithTaskBuilder.postgres(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) + case MYSQL => connectionConfigWithTaskBuilder.mysql(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) + case x => throw new IllegalArgumentException(s"Unsupported connection format, format=$x") + } + + (opt.get(SCHEMA), opt.get(TABLE)) match { + case (Some(schema), Some(table)) => baseConnection.table(schema, table) + case (Some(schema), None) => + assert(schema.nonEmpty, s"Empty schema name for $format connection, data-source-name=${dataSourceRequest.name}") + throw new IllegalArgumentException(s"Missing table name for $format connection, data-source-name=${dataSourceRequest.name}, schema=$schema") + case (None, Some(table)) => + 
assert(table.nonEmpty, s"Empty table name for $format connection, data-source-name=${dataSourceRequest.name}") + throw new IllegalArgumentException(s"Missing schema name for $format connection, data-source-name=${dataSourceRequest.name}, table=$table") + case (None, None) => baseConnection // TODO this is allowed only when there is metadata collection enabled + } + } + + private def createCassandraConnection(dataSourceRequest: DataSourceRequest) = { + val opt = dataSourceRequest.options.getOrElse(Map()) + checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD), opt) + + val cassandraConnection = ConnectionConfigWithTaskBuilder().cassandra(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) + (opt.get(CASSANDRA_KEYSPACE), opt.get(CASSANDRA_TABLE)) match { + case (Some(keyspace), Some(table)) => cassandraConnection.table(keyspace, table) + case (Some(keyspace), None) => + assert(keyspace.nonEmpty, s"Empty keyspace name for Cassandra connection, data-source-name=${dataSourceRequest.name}") + throw new IllegalArgumentException(s"Missing table name for Cassandra connection, data-source-name=${dataSourceRequest.name}, keyspace=$keyspace") + case (None, Some(table)) => + assert(table.nonEmpty, s"Empty table name for Cassandra connection, data-source-name=${dataSourceRequest.name}") + throw new IllegalArgumentException(s"Missing keyspace name for Cassandra connection, data-source-name=${dataSourceRequest.name}, table=$table") + case (None, None) => cassandraConnection // TODO this is allowed only when there is metadata collection enabled + } + } + +} diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapper.scala new file mode 100644 index 0000000..3ae43b7 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapper.scala @@ -0,0 +1,34 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.{CountBuilder, GeneratorBuilder} +import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_COUNT_RECORDS, DEFAULT_PER_COLUMN_COUNT_RECORDS, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL} +import io.github.datacatering.datacaterer.core.ui.model.DataSourceRequest + +object CountMapper { + + def countMapping(dataSourceRequest: DataSourceRequest): CountBuilder = { + dataSourceRequest.count.map(recordCountRequest => { + val baseRecordCount = (recordCountRequest.records, recordCountRequest.recordsMin, recordCountRequest.recordsMax) match { + case (Some(records), None, None) => CountBuilder().records(records) + case (None, Some(min), Some(max)) => CountBuilder().generator(GeneratorBuilder().min(min).max(max)) + case _ => CountBuilder().records(DEFAULT_COUNT_RECORDS) + } + + val perColumnNames = recordCountRequest.perColumnNames.getOrElse(List()) + if (perColumnNames.nonEmpty) { + (recordCountRequest.perColumnRecords, recordCountRequest.perColumnRecordsMin, recordCountRequest.perColumnRecordsMax, + recordCountRequest.perColumnRecordsDistribution, recordCountRequest.perColumnRecordsDistributionRateParam) match { + case (Some(records), None, None, None, None) => baseRecordCount.recordsPerColumn(records, perColumnNames: _*) + case (None, Some(min), Some(max), None, None) => baseRecordCount.recordsPerColumnGenerator(GeneratorBuilder().min(min).max(max), perColumnNames: _*) + case (None, Some(min), Some(max), Some(DISTRIBUTION_EXPONENTIAL), Some(rate)) => 
baseRecordCount.recordsPerColumnExponentialDistribution(min, max, rate.toDouble, perColumnNames: _*) + case (None, None, None, Some(DISTRIBUTION_EXPONENTIAL), Some(rate)) => baseRecordCount.recordsPerColumnExponentialDistribution(rate.toDouble, perColumnNames: _*) + case (None, Some(min), Some(max), Some(DISTRIBUTION_NORMAL), None) => baseRecordCount.recordsPerColumnNormalDistribution(min, max, perColumnNames: _*) + case _ => baseRecordCount.recordsPerColumn(DEFAULT_PER_COLUMN_COUNT_RECORDS, perColumnNames: _*) + } + } else { + baseRecordCount + } + }).getOrElse(CountBuilder()) + } + +} diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapper.scala new file mode 100644 index 0000000..be9a4d4 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapper.scala @@ -0,0 +1,82 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.model.Constants.{DATA_CONTRACT_CLI, MARQUEZ, ONE_OF_GENERATOR, ONE_OF_GENERATOR_DELIMITER, OPEN_API, OPEN_DATA_CONTRACT_STANDARD, OPEN_LINEAGE_NAMESPACE, OPEN_METADATA, OPEN_METADATA_AUTH_TYPE, PATH, REGEX_GENERATOR, SCHEMA_LOCATION, URL} +import io.github.datacatering.datacaterer.api.model.DataType +import io.github.datacatering.datacaterer.api.{FieldBuilder, MetadataSourceBuilder} +import io.github.datacatering.datacaterer.core.exception.InvalidMetadataDataSourceOptionsException +import io.github.datacatering.datacaterer.core.model.Constants.CONNECTION_TYPE +import io.github.datacatering.datacaterer.core.ui.mapper.UiMapper.checkOptions +import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, FieldRequest, MetadataSourceRequest} +import org.apache.log4j.Logger + +object FieldMapper { + + private val LOGGER = Logger.getLogger(getClass.getName) + + def fieldMapping(dataSourceRequest: DataSourceRequest): (Option[MetadataSourceBuilder], Option[List[FieldBuilder]]) = { + dataSourceRequest.fields.map(fields => { + fields.optMetadataSource.map(metadataSource => + (Some(dataGenerationMetadataSourceMapping(metadataSource)), None) + ).getOrElse( + (None, Some(fieldMapping(dataSourceRequest.name, fields.optFields))) + ) + }).getOrElse((None, None)) + } + + private def dataGenerationMetadataSourceMapping(metadataSource: MetadataSourceRequest): MetadataSourceBuilder = { + // metadata source exists and should have options defined (at least type and groupType) + if (metadataSource.overrideOptions.isEmpty) { + throw InvalidMetadataDataSourceOptionsException(metadataSource.name) + } + val builder = MetadataSourceBuilder() + val baseOptions = metadataSource.overrideOptions.getOrElse(Map()) + if (!baseOptions.contains(CONNECTION_TYPE)) { + throw new IllegalArgumentException("Unable to determine metadata source type") + } + val builderWithOptions = baseOptions(CONNECTION_TYPE) match { + case DATA_CONTRACT_CLI => + checkOptions(metadataSource.name, List(PATH), baseOptions) + builder.dataContractCli(baseOptions(PATH)) + case OPEN_METADATA => + checkOptions(metadataSource.name, List(URL, OPEN_METADATA_AUTH_TYPE), baseOptions) + builder.openMetadata(baseOptions(URL), baseOptions(OPEN_METADATA_AUTH_TYPE), baseOptions) + case OPEN_API => + checkOptions(metadataSource.name, List(SCHEMA_LOCATION), baseOptions) + builder.openApi(baseOptions(SCHEMA_LOCATION)) + case MARQUEZ => + checkOptions(metadataSource.name, List(URL, OPEN_LINEAGE_NAMESPACE), baseOptions) 
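+        // hypothetical example: overrideOptions of Map(URL -> "http://localhost:5001", OPEN_LINEAGE_NAMESPACE -> "food_delivery")
+        // would pass the check above and resolve to builder.marquez("http://localhost:5001", "food_delivery")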
+        builder.marquez(baseOptions(URL), baseOptions(OPEN_LINEAGE_NAMESPACE))
+      case OPEN_DATA_CONTRACT_STANDARD =>
+        checkOptions(metadataSource.name, List(PATH), baseOptions)
+        builder.openDataContractStandard(baseOptions(PATH))
+      case x =>
+        LOGGER.warn(s"Unsupported metadata source for data generation, metadata-connection-type=$x")
+        builder
+    }
+    builderWithOptions
+  }
+
+  /*
+   * Get the field mapping based on manually defined fields or a metadata source that provides the schema details.
+   * The metadata source details are tried first. If they don't exist, fall back to the manually defined fields.
+   * If neither manual fields nor a metadata source exists, metadata will be gathered from the data source itself.
+   */
+  private def fieldMapping(
+                            dataSourceName: String,
+                            optFields: Option[List[FieldRequest]] = None
+                          ): List[FieldBuilder] = {
+    optFields.map(fields => {
+      fields.map(field => {
+        assert(field.name.nonEmpty, s"Field name cannot be empty, data-source-name=$dataSourceName")
+        assert(field.`type`.nonEmpty, s"Field type cannot be empty, data-source-name=$dataSourceName, field-name=${field.name}")
+        val options = field.options.getOrElse(Map())
+        val baseBuild = FieldBuilder().name(field.name).`type`(DataType.fromString(field.`type`)).options(options)
+        val withRegex = options.get(REGEX_GENERATOR).map(regex => baseBuild.regex(regex)).getOrElse(baseBuild)
+        val withOneOf = options.get(ONE_OF_GENERATOR).map(oneOf => withRegex.oneOf(oneOf.split(ONE_OF_GENERATOR_DELIMITER).map(_.trim): _*)).getOrElse(withRegex)
+        val optNested = field.nested.map(nestedFields => fieldMapping(dataSourceName, nestedFields.optFields))
+        optNested.map(nested => withOneOf.schema(nested: _*)).getOrElse(withOneOf)
+      })
+    }).getOrElse(List())
+  }
+
+}
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapper.scala
new file mode 100644
index 0000000..069ce04
--- /dev/null
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapper.scala
@@ -0,0 +1,49 @@
+package io.github.datacatering.datacaterer.core.ui.mapper
+
+import io.github.datacatering.datacaterer.api.PlanBuilder
+import io.github.datacatering.datacaterer.api.connection.ConnectionTaskBuilder
+import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_STEP_NAME, VALIDATION_OPTION_DELIMITER}
+import io.github.datacatering.datacaterer.api.model.ForeignKeyRelation
+import io.github.datacatering.datacaterer.core.exception.MissingConnectionForForeignKeyException
+import io.github.datacatering.datacaterer.core.generator.metadata.StepNameProvider
+import io.github.datacatering.datacaterer.core.ui.model.{ForeignKeyRequest, ForeignKeyRequestItem}
+
+object ForeignKeyMapper {
+
+  def foreignKeyMapping(foreignKeyRequests: List[ForeignKeyRequest], connections: List[ConnectionTaskBuilder[_]], planBuilder: PlanBuilder): PlanBuilder = {
+    val mappedWithConnections = foreignKeyRequests.map(fkr => {
+      val sourceFk = mapToForeignKeyRelation(connections, fkr.source.get)
+      val generationLinkConnections = mapForeignKeyLinksToRelations(connections, fkr.generationLinks)
+      val deleteLinkConnections = mapForeignKeyLinksToRelations(connections, fkr.deleteLinks)
+
+      (sourceFk, generationLinkConnections, deleteLinkConnections)
+    })
+    mappedWithConnections.foldLeft(planBuilder)((pb, fk) => pb.addForeignKeyRelationship(fk._1, fk._2, fk._3))
+  }
+
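+  // Illustrative sketch (hypothetical task, data source and column names, not from the request schema):
+  // a ForeignKeyRequest whose source points at task "accounts_task" with columns "account_id", and with one
+  // generation link on task "transactions_task" for the same column, folds into the plan roughly as:
+  //   planBuilder.addForeignKeyRelationship(
+  //     ForeignKeyRelation("accounts_json", "accounts_step", List("account_id")),
+  //     List(ForeignKeyRelation("transactions_json", "transactions_step", List("account_id"))),
+  //     List())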
+  private def mapToForeignKeyRelation(connections: List[ConnectionTaskBuilder[_]], foreignKeyRequestItem: ForeignKeyRequestItem): ForeignKeyRelation = {
+    val connection = getConnectionByTaskName(connections, foreignKeyRequestItem.taskName)
+    val columns = foreignKeyRequestItem.columns.split(VALIDATION_OPTION_DELIMITER).toList
+    val dataSourceName = connection.connectionConfigWithTaskBuilder.dataSourceName
+    //if there are options defined in the foreign key request item, they need to be used to define the step name
+    //since if a metadata source is used to generate sub data sources, the options are used to define the step name
+    //i.e. read schema from OpenAPI doc and step name becomes 'GET/my-path'
+    val overrideOptions = foreignKeyRequestItem.options.getOrElse(Map())
+    val baseOptions = connection.connectionConfigWithTaskBuilder.options
+    val stepName = StepNameProvider.fromOptions(baseOptions ++ overrideOptions).getOrElse(connection.step.map(_.step.name).getOrElse(DEFAULT_STEP_NAME))
+    ForeignKeyRelation(dataSourceName, stepName, columns)
+  }
+
+  private def mapForeignKeyLinksToRelations(connections: List[ConnectionTaskBuilder[_]], links: List[ForeignKeyRequestItem]): List[ForeignKeyRelation] = {
+    links.map(link => mapToForeignKeyRelation(connections, link))
+  }
+
+  private def getConnectionByTaskName(connections: List[ConnectionTaskBuilder[_]], taskName: String): ConnectionTaskBuilder[_] = {
+    val matchingConnection = connections.find(c => c.task.exists(taskBuilder => taskBuilder.task.name == taskName))
+    matchingConnection match {
+      case Some(value) => value
+      case None => throw MissingConnectionForForeignKeyException(taskName)
+    }
+  }
+
+}
diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapper.scala
index 82f0327..757a417 100644
--- a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapper.scala
+++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapper.scala
@@ -1,19 +1,18 @@
 package io.github.datacatering.datacaterer.core.ui.mapper
 
-import io.github.datacatering.datacaterer.api.connection.{ConnectionTaskBuilder, FileBuilder, JdbcBuilder}
+import io.github.datacatering.datacaterer.api.connection.ConnectionTaskBuilder
 import io.github.datacatering.datacaterer.api.model.Constants._
-import io.github.datacatering.datacaterer.api.model.{DataType, ForeignKeyRelation}
-import io.github.datacatering.datacaterer.api.{BasePlanRun, ColumnNamesValidationBuilder, ColumnValidationBuilder, ConnectionConfigWithTaskBuilder, CountBuilder, DataCatererConfigurationBuilder, FieldBuilder, GeneratorBuilder, GroupByValidationBuilder, MetadataSourceBuilder, PlanBuilder, PlanRun, ValidationBuilder}
-import io.github.datacatering.datacaterer.core.exception.{InvalidMetadataDataSourceOptionsException, MissingConnectionForForeignKeyException}
-import io.github.datacatering.datacaterer.core.generator.metadata.StepNameProvider
-import io.github.datacatering.datacaterer.core.model.Constants.CONNECTION_TYPE
-import io.github.datacatering.datacaterer.core.ui.model.{ConfigurationRequest, DataSourceRequest, FieldRequest, ForeignKeyRequest, ForeignKeyRequestItem, MetadataSourceRequest, PlanRunRequest, ValidationItemRequest, ValidationItemRequests}
-import org.apache.log4j.Logger
+import io.github.datacatering.datacaterer.api.{BasePlanRun, DataCatererConfigurationBuilder, PlanRun}
+import io.github.datacatering.datacaterer.core.ui.mapper.ConfigurationMapper.configurationMapping
+import 
io.github.datacatering.datacaterer.core.ui.mapper.ConnectionMapper.connectionMapping +import io.github.datacatering.datacaterer.core.ui.mapper.CountMapper.countMapping +import io.github.datacatering.datacaterer.core.ui.mapper.FieldMapper.fieldMapping +import io.github.datacatering.datacaterer.core.ui.mapper.ForeignKeyMapper.foreignKeyMapping +import io.github.datacatering.datacaterer.core.ui.mapper.ValidationMapper.{connectionsWithUpstreamValidationMapping, validationMapping} +import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, PlanRunRequest} object UiMapper { - private val LOGGER = Logger.getLogger(getClass.getName) - def mapToPlanRun(planRunRequest: PlanRunRequest, installDirectory: String): PlanRun = { val plan = new BasePlanRun() val planBuilder = plan.plan.name(planRunRequest.name).runId(planRunRequest.id) @@ -32,7 +31,7 @@ object UiMapper { plan } - def dataSourceToConnection(dataSourceRequest: DataSourceRequest): ConnectionTaskBuilder[_] = { + private def dataSourceToConnection(dataSourceRequest: DataSourceRequest): ConnectionTaskBuilder[_] = { val baseConnection = connectionMapping(dataSourceRequest) val mappedFields = fieldMapping(dataSourceRequest) val countBuilder = countMapping(dataSourceRequest) @@ -49,548 +48,7 @@ object UiMapper { .enableDataGeneration(enableDataGenFromOpts || mappedFields._1.nonEmpty || mappedFields._2.nonEmpty) } - def foreignKeyMapping(foreignKeyRequests: List[ForeignKeyRequest], connections: List[ConnectionTaskBuilder[_]], planBuilder: PlanBuilder): PlanBuilder = { - val mappedWithConnections = foreignKeyRequests.map(fkr => { - val sourceFk = mapToForeignKeyRelation(connections, fkr.source.get) - val generationLinkConnections = mapForeignKeyLinksToRelations(connections, fkr.generationLinks) - val deleteLinkConnections = mapForeignKeyLinksToRelations(connections, fkr.deleteLinks) - - (sourceFk, generationLinkConnections, deleteLinkConnections) - }) - mappedWithConnections.foldLeft(planBuilder)((pb, fk) => pb.addForeignKeyRelationship(fk._1, fk._2, fk._3)) - } - - private def mapToForeignKeyRelation(connections: List[ConnectionTaskBuilder[_]], foreignKeyRequestItem: ForeignKeyRequestItem): ForeignKeyRelation = { - val connection = getConnectionByTaskName(connections, foreignKeyRequestItem.taskName) - val columns = foreignKeyRequestItem.columns.split(VALIDATION_OPTION_DELIMITER).toList - val dataSourceName = connection.connectionConfigWithTaskBuilder.dataSourceName - //if there are options defined in the foreign key request item, it needs to be used to define the step name - //since if a metadata source is used to generate sub data sources, the options are used to define the step name - //i.e. 
read schema from OpenAPI doc and step name becomes 'GET/my-path' - val overrideOptions = foreignKeyRequestItem.options.getOrElse(Map()) - val baseOptions = connection.connectionConfigWithTaskBuilder.options - val stepName = StepNameProvider.fromOptions(baseOptions ++ overrideOptions).getOrElse(connection.step.map(_.step.name).getOrElse(DEFAULT_STEP_NAME)) - ForeignKeyRelation(dataSourceName, stepName, columns) - } - - private def mapForeignKeyLinksToRelations(connections: List[ConnectionTaskBuilder[_]], links: List[ForeignKeyRequestItem]): List[ForeignKeyRelation] = { - links.map(link => mapToForeignKeyRelation(connections, link)) - } - - private def getConnectionByTaskName(connections: List[ConnectionTaskBuilder[_]], taskName: String): ConnectionTaskBuilder[_] = { - val matchingConnection = connections.find(c => c.task.exists(taskBuilder => taskBuilder.task.name == taskName)) - matchingConnection match { - case Some(value) => value - case None => throw MissingConnectionForForeignKeyException(taskName) - } - } - - private def configurationMapping( - configurationRequest: ConfigurationRequest, - installDirectory: String, - connections: List[ConnectionTaskBuilder[_]] - ): DataCatererConfigurationBuilder = { - val isConnectionContainsMetadataSource = connections.exists(conn => conn.connectionConfigWithTaskBuilder.options.contains(METADATA_SOURCE_TYPE)) - val configUpdatedFromConnections = if (isConnectionContainsMetadataSource) { - configurationRequest.copy(flag = configurationRequest.flag ++ Map(CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS -> isConnectionContainsMetadataSource.toString)) - } else configurationRequest - - val baseConfig = DataCatererConfigurationBuilder() - val withFlagConf = mapFlagsConfiguration(configUpdatedFromConnections, baseConfig) - val withFolderConf = mapFolderConfiguration(configUpdatedFromConnections, installDirectory, withFlagConf) - val withMetadataConf = mapMetadataConfiguration(configUpdatedFromConnections, withFolderConf) - val withGenerationConf = mapGenerationConfiguration(configUpdatedFromConnections, withMetadataConf) - val withValidationConf = mapValidationConfiguration(configUpdatedFromConnections, withGenerationConf) - val withAlertConf = mapAlertConfiguration(configUpdatedFromConnections, withValidationConf) - - withAlertConf - } - - def mapFlagsConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - configurationRequest.flag.foldLeft(baseConfig)((conf, c) => { - val boolVal = c._2.toBoolean - c._1 match { - case CONFIG_FLAGS_COUNT => conf.enableCount(boolVal) - case CONFIG_FLAGS_GENERATE_DATA => conf.enableGenerateData(boolVal) - case CONFIG_FLAGS_RECORD_TRACKING => conf.enableRecordTracking(boolVal) - case CONFIG_FLAGS_DELETE_GENERATED_RECORDS => conf.enableDeleteGeneratedRecords(boolVal) - case CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS => conf.enableGeneratePlanAndTasks(boolVal) - case CONFIG_FLAGS_FAIL_ON_ERROR => conf.enableFailOnError(boolVal) - case CONFIG_FLAGS_UNIQUE_CHECK => conf.enableUniqueCheck(boolVal) - case CONFIG_FLAGS_SINK_METADATA => conf.enableSinkMetadata(boolVal) - case CONFIG_FLAGS_SAVE_REPORTS => conf.enableSaveReports(boolVal) - case CONFIG_FLAGS_VALIDATION => conf.enableValidation(boolVal) - case CONFIG_FLAGS_GENERATE_VALIDATIONS => conf.enableGenerateValidations(boolVal) - case CONFIG_FLAGS_ALERTS => conf.enableAlerts(boolVal) - case _ => - LOGGER.warn(s"Unexpected flags configuration key, key=${c._1}") - conf - } - }) - } - - def 
mapAlertConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - configurationRequest.alert.foldLeft(baseConfig)((conf, c) => { - c._1 match { - case CONFIG_ALERT_TRIGGER_ON => conf.alertTriggerOn(c._2) - case CONFIG_ALERT_SLACK_TOKEN => conf.slackAlertToken(c._2) - case CONFIG_ALERT_SLACK_CHANNELS => conf.slackAlertChannels(c._2.split(",").map(_.trim): _*) - case _ => - LOGGER.warn(s"Unexpected alert configuration key, key=${c._1}") - conf - } - }) - } - - def mapValidationConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - configurationRequest.validation.foldLeft(baseConfig)((conf, c) => { - c._1 match { - case CONFIG_VALIDATION_NUM_SAMPLE_ERROR_RECORDS => conf.numErrorSampleRecords(c._2.toInt) - case CONFIG_VALIDATION_ENABLE_DELETE_RECORD_TRACKING_FILES => conf.enableDeleteRecordTrackingFiles(c._2.toBoolean) - case _ => - LOGGER.warn(s"Unexpected validation configuration key, key=${c._1}") - conf - } - }) - } - - def mapGenerationConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - configurationRequest.generation.foldLeft(baseConfig)((conf, c) => { - c._1 match { - case CONFIG_GENERATION_NUM_RECORDS_PER_BATCH => conf.numRecordsPerBatch(c._2.toLong) - case CONFIG_GENERATION_NUM_RECORDS_PER_STEP => - val parsedNum = c._2.toLong - if (parsedNum != -1) conf.numRecordsPerStep(c._2.toLong) else conf - case _ => - LOGGER.warn(s"Unexpected generation configuration key, key=${c._1}") - conf - } - }) - } - - def mapMetadataConfiguration(configurationRequest: ConfigurationRequest, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - configurationRequest.metadata.foldLeft(baseConfig)((conf, c) => { - c._1 match { - case CONFIG_METADATA_NUM_RECORDS_FROM_DATA_SOURCE => conf.numRecordsFromDataSourceForDataProfiling(c._2.toInt) - case CONFIG_METADATA_NUM_RECORDS_FOR_ANALYSIS => conf.numRecordsForAnalysisForDataProfiling(c._2.toInt) - case CONFIG_METADATA_ONE_OF_DISTINCT_COUNT_VS_COUNT_THRESHOLD => conf.oneOfDistinctCountVsCountThreshold(c._2.toDouble) - case CONFIG_METADATA_ONE_OF_MIN_COUNT => conf.oneOfMinCount(c._2.toLong) - case CONFIG_METADATA_NUM_GENERATED_SAMPLES => conf.numGeneratedSamples(c._2.toInt) - case _ => - LOGGER.warn(s"Unexpected metadata configuration key, key=${c._1}") - conf - } - }) - } - - def mapFolderConfiguration(configurationRequest: ConfigurationRequest, installDirectory: String, baseConfig: DataCatererConfigurationBuilder): DataCatererConfigurationBuilder = { - val nonEmptyFolderConfig = configurationRequest.folder.filter(_._2.nonEmpty).foldLeft(baseConfig)((conf, c) => { - c._1 match { - case CONFIG_FOLDER_PLAN_FILE_PATH => conf.planFilePath(c._2) - case CONFIG_FOLDER_TASK_FOLDER_PATH => conf.taskFolderPath(c._2) - case CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH => conf.generatedPlanAndTaskFolderPath(c._2) - case CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH => conf.generatedReportsFolderPath(c._2) - case CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH => conf.recordTrackingFolderPath(c._2) - case CONFIG_FOLDER_VALIDATION_FOLDER_PATH => conf.validationFolderPath(c._2) - case CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH => conf.recordTrackingForValidationFolderPath(c._2) - case _ => - LOGGER.warn(s"Unexpected folder configuration key, key=${c._1}") - conf - } - }) - // 
should set the base directory to the install directory for most folders if not overridden - configurationRequest.folder.filter(_._2.isEmpty).foldLeft(nonEmptyFolderConfig)((conf, c) => { - c._1 match { - case CONFIG_FOLDER_PLAN_FILE_PATH => conf - case CONFIG_FOLDER_TASK_FOLDER_PATH => conf.taskFolderPath(s"$installDirectory/task") - case CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH => conf.generatedPlanAndTaskFolderPath(s"$installDirectory/generated-plan-task") - case CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH => conf.generatedReportsFolderPath(s"$installDirectory/report") - case CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH => conf.recordTrackingFolderPath(s"$installDirectory/record-tracking") - case CONFIG_FOLDER_VALIDATION_FOLDER_PATH => conf.validationFolderPath(s"$installDirectory/validation") - case CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH => conf.recordTrackingForValidationFolderPath(s"$installDirectory/record-tracking-validation") - case _ => - LOGGER.warn(s"Unexpected folder configuration key, key=${c._1}") - conf - } - }) - } - - def validationMapping(dataSourceRequest: DataSourceRequest): List[ValidationBuilder] = { - dataSourceRequest.validations - .map(validations => validations.optValidations.map(v => v.flatMap(validationItemRequestToValidationBuilders)).getOrElse(List())) - .getOrElse(List()) - } - - private def validationItemRequestToValidationBuilders(validateItem: ValidationItemRequest): List[ValidationBuilder] = { - validateItem.`type` match { - case VALIDATION_COLUMN => - //map type of column validation to builder method - //each option is a new validation - validateItem.options.map(opts => { - val colName = opts(VALIDATION_FIELD) - opts - .filter(o => !VALIDATION_SUPPORTING_OPTIONS.contains(o._1)) - .map(opt => { - val baseValid = validationWithDescriptionAndErrorThreshold(opts).col(colName) - columnValidationMapping(baseValid, opts, colName, opt) - }) - .toList - }).getOrElse(List()) - case VALIDATION_COLUMN_NAMES => - validateItem.options.map(opts => { - opts - .filter(o => !VALIDATION_SUPPORTING_OPTIONS.contains(o._1)) - .map(opt => { - val baseValid = validationWithDescriptionAndErrorThreshold(opts).columnNames - columnNamesValidationMapping(baseValid, opts, opt) - }) - .toList - }).getOrElse(List()) - case VALIDATION_UPSTREAM => - // require upstream ConnectionTaskBuilder - List() - case VALIDATION_GROUP_BY => - validateItem.options.map(opts => { - val groupByCols = opts(VALIDATION_GROUP_BY_COLUMNS).split(VALIDATION_OPTION_DELIMITER) - val baseValid = validationWithDescriptionAndErrorThreshold(opts).groupBy(groupByCols: _*) - groupByValidationMapping(baseValid, validateItem.nested) - }).getOrElse(List()) - case _ => List() - } - } - - private def validationWithDescriptionAndErrorThreshold(options: Map[String, String]): ValidationBuilder = { - val optDescription = options.get(VALIDATION_DESCRIPTION) - val optErrorThreshold = options.get(VALIDATION_ERROR_THRESHOLD) - val baseValidation = ValidationBuilder() - val validWithDesc = optDescription.map(desc => baseValidation.description(desc)).getOrElse(baseValidation) - optErrorThreshold.map(error => validWithDesc.errorThreshold(error.toDouble)).getOrElse(validWithDesc) - } - - def countMapping(dataSourceRequest: DataSourceRequest): CountBuilder = { - dataSourceRequest.count.map(recordCountRequest => { - val baseRecordCount = (recordCountRequest.records, recordCountRequest.recordsMin, recordCountRequest.recordsMax) match { - case (Some(records), None, None) => CountBuilder().records(records) - case 
(None, Some(min), Some(max)) => CountBuilder().generator(GeneratorBuilder().min(min).max(max)) - case _ => CountBuilder().records(DEFAULT_COUNT_RECORDS) - } - - val perColumnNames = recordCountRequest.perColumnNames.getOrElse(List()) - if (perColumnNames.nonEmpty) { - (recordCountRequest.perColumnRecords, recordCountRequest.perColumnRecordsMin, recordCountRequest.perColumnRecordsMax, - recordCountRequest.perColumnRecordsDistribution, recordCountRequest.perColumnRecordsDistributionRateParam) match { - case (Some(records), None, None, None, None) => baseRecordCount.recordsPerColumn(records, perColumnNames: _*) - case (None, Some(min), Some(max), None, None) => baseRecordCount.recordsPerColumnGenerator(GeneratorBuilder().min(min).max(max), perColumnNames: _*) - case (None, Some(min), Some(max), Some(DISTRIBUTION_EXPONENTIAL), Some(rate)) => baseRecordCount.recordsPerColumnExponentialDistribution(min, max, rate.toDouble, perColumnNames: _*) - case (None, None, None, Some(DISTRIBUTION_EXPONENTIAL), Some(rate)) => baseRecordCount.recordsPerColumnExponentialDistribution(rate.toDouble, perColumnNames: _*) - case (None, Some(min), Some(max), Some(DISTRIBUTION_NORMAL), None) => baseRecordCount.recordsPerColumnNormalDistribution(min, max, perColumnNames: _*) - case _ => baseRecordCount.recordsPerColumn(DEFAULT_PER_COLUMN_COUNT_RECORDS, perColumnNames: _*) - } - } else { - baseRecordCount - } - }).getOrElse(CountBuilder()) - } - - def fieldMapping(dataSourceRequest: DataSourceRequest): (Option[MetadataSourceBuilder], Option[List[FieldBuilder]]) = { - dataSourceRequest.fields.map(fields => { - fields.optMetadataSource.map(metadataSource => - (Some(dataGenerationMetadataSourceMapping(metadataSource)), None) - ).getOrElse( - (None, Some(fieldMapping(dataSourceRequest.name, fields.optFields, fields.optMetadataSource))) - ) - }).getOrElse((None, None)) - } - - private def dataGenerationMetadataSourceMapping(metadataSource: MetadataSourceRequest): MetadataSourceBuilder = { - // metadata source exists and should have options defined (at least type and groupType) - if (metadataSource.overrideOptions.isEmpty) { - throw InvalidMetadataDataSourceOptionsException(metadataSource.name) - } - val builder = MetadataSourceBuilder() - val baseOptions = metadataSource.overrideOptions.getOrElse(Map()) - val builderWithOptions = baseOptions(CONNECTION_TYPE) match { - case OPEN_METADATA => - checkOptions(metadataSource.name, List(URL, OPEN_METADATA_AUTH_TYPE), baseOptions) - builder.openMetadata(baseOptions(URL), baseOptions(OPEN_METADATA_AUTH_TYPE), baseOptions) - case OPEN_API => - checkOptions(metadataSource.name, List(SCHEMA_LOCATION), baseOptions) - builder.openApi(baseOptions(SCHEMA_LOCATION)) - case MARQUEZ => - checkOptions(metadataSource.name, List(URL, OPEN_LINEAGE_NAMESPACE), baseOptions) - builder.marquez(baseOptions(URL), baseOptions(OPEN_LINEAGE_NAMESPACE)) - case x => - LOGGER.warn(s"Unsupported metadata source for data generation, metadata-connection-type=$x") - builder - } - builderWithOptions - } - - /* - * Get field mapping based on manually defined fields or setting the metadata source to get schema details. - * Try get metadata source details first. If doesn't exist, fallback to manual fields. - * If no manual fields or metadata source exist, then metadata will be gathered from data source. 
- */ - private def fieldMapping( - dataSourceName: String, - optFields: Option[List[FieldRequest]] = None, - optMetadataSource: Option[MetadataSourceRequest] = None - ): List[FieldBuilder] = { - optFields.map(fields => { - fields.map(field => { - assert(field.name.nonEmpty, s"Field name cannot be empty, data-source-name=$dataSourceName") - assert(field.`type`.nonEmpty, s"Field type cannot be empty, data-source-name=$dataSourceName, field-name=${field.name}") - val options = field.options.getOrElse(Map()) - val baseBuild = FieldBuilder().name(field.name).`type`(DataType.fromString(field.`type`)).options(options) - val withRegex = options.get(REGEX_GENERATOR).map(regex => baseBuild.regex(regex)).getOrElse(baseBuild) - val withOneOf = options.get(ONE_OF_GENERATOR).map(oneOf => withRegex.oneOf(oneOf.split(ONE_OF_GENERATOR_DELIMITER).map(_.trim): _*)).getOrElse(withRegex) - val optNested = field.nested.map(nestedFields => fieldMapping(dataSourceName, nestedFields.optFields)) - optNested.map(nested => withOneOf.schema(nested: _*)).getOrElse(withOneOf) - }) - }).getOrElse(List()) - } - - def connectionMapping(dataSourceRequest: DataSourceRequest): ConnectionTaskBuilder[_] = { - dataSourceRequest.`type` match { - case Some(CASSANDRA_NAME) => createCassandraConnection(dataSourceRequest) - case Some(POSTGRES) => createJdbcConnection(dataSourceRequest, POSTGRES) - case Some(MYSQL) => createJdbcConnection(dataSourceRequest, MYSQL) - case Some(CSV) => createFileConnection(dataSourceRequest, CSV) - case Some(JSON) => createFileConnection(dataSourceRequest, JSON) - case Some(PARQUET) => createFileConnection(dataSourceRequest, PARQUET) - case Some(ORC) => createFileConnection(dataSourceRequest, ORC) - case Some(DELTA) => createFileConnection(dataSourceRequest, DELTA) - case Some(ICEBERG) => createIcebergConnection(dataSourceRequest) - case Some(SOLACE) => - val opt = dataSourceRequest.options.getOrElse(Map()) - checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD, JMS_DESTINATION_NAME, JMS_VPN_NAME, JMS_CONNECTION_FACTORY, JMS_INITIAL_CONTEXT_FACTORY), opt) - ConnectionConfigWithTaskBuilder().solace(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), - opt(JMS_VPN_NAME), opt(JMS_CONNECTION_FACTORY), opt(JMS_INITIAL_CONTEXT_FACTORY), opt) - case Some(KAFKA) => - val opt = dataSourceRequest.options.getOrElse(Map()) - checkOptions(dataSourceRequest.name, List(URL, KAFKA_TOPIC), opt) - ConnectionConfigWithTaskBuilder().kafka(dataSourceRequest.name, opt(URL), opt) - case Some(HTTP) => - val opt = dataSourceRequest.options.getOrElse(Map()) - ConnectionConfigWithTaskBuilder().http(dataSourceRequest.name, opt.getOrElse(USERNAME, ""), opt.getOrElse(PASSWORD, ""), opt) - case Some(x) => - throw new IllegalArgumentException(s"Unsupported data source from UI, data-source-type=$x") - case _ => - throw new IllegalArgumentException(s"No data source type defined, unable to create connections, " + - s"data-source-name=${dataSourceRequest.name}, task-name=${dataSourceRequest.taskName}") - } - } - - def connectionsWithUpstreamValidationMapping(connections: List[ConnectionTaskBuilder[_]], dataSources: List[DataSourceRequest]): List[ConnectionTaskBuilder[_]] = { - val dataSourcesWithUpstreamValidation = dataSources - .filter(ds => { - ds.validations - .map(_.optValidations.getOrElse(List())).getOrElse(List()) - .exists(_.`type` == VALIDATION_UPSTREAM) - }) - .map(ds => (ds.taskName, ds.validations.map(_.optValidations.get).getOrElse(List()))) - .toMap - - connections.map(connection => { - val 
connectionTaskName = connection.task.map(_.task.name).getOrElse("") - val optDataSourceWithUpstreamValidation = dataSourcesWithUpstreamValidation.get(connectionTaskName) - optDataSourceWithUpstreamValidation match { - case Some(value) => - val upstreamValidations = value.filter(_.`type` == VALIDATION_UPSTREAM) - val mappedValidations = upstreamValidationMapping(connections, upstreamValidations) - val allValidations = connection.getValidations ++ mappedValidations - connection.validations(allValidations: _*).enableDataValidation(allValidations.nonEmpty) - case None => - LOGGER.debug(s"Task does not have any upstream validations defined, task-name=$connectionTaskName") - connection - } - }) - } - - private def upstreamValidationMapping(connections: List[ConnectionTaskBuilder[_]], upstreamValidations: List[ValidationItemRequest]): List[ValidationBuilder] = { - upstreamValidations.flatMap(upstreamValidation => { - def getOption(k: String): Option[String] = upstreamValidation.options.flatMap(_.get(k)) - - val upstreamTaskName = getOption(VALIDATION_UPSTREAM_TASK_NAME).getOrElse("") - val upstreamConnection = connections.find(_.task.exists(_.task.name == upstreamTaskName)) - - if (upstreamConnection.isDefined) { - val baseValid = validationWithDescriptionAndErrorThreshold(upstreamValidation.options.getOrElse(Map())).upstreamData(upstreamConnection.get) - - // check for join options - val joinValidation = (getOption(VALIDATION_UPSTREAM_JOIN_TYPE), getOption(VALIDATION_UPSTREAM_JOIN_COLUMNS), getOption(VALIDATION_UPSTREAM_JOIN_EXPR)) match { - case (Some(joinType), Some(joinCols), _) => baseValid.joinType(joinType).joinColumns(joinCols.split(","): _*) - case (Some(joinType), None, Some(joinExpr)) => baseValid.joinType(joinType).joinExpr(joinExpr) - case (None, Some(joinCols), _) => baseValid.joinType(DEFAULT_VALIDATION_JOIN_TYPE).joinColumns(joinCols.split(","): _*) - case (None, None, Some(joinExpr)) => baseValid.joinType(DEFAULT_VALIDATION_JOIN_TYPE).joinExpr(joinExpr) - case _ => throw new IllegalArgumentException("Unexpected upstream validation join options, need to define join columns or expression") - } - val upstreamWithValidations = upstreamValidation.nested.map(nest => - nest.validations.flatMap(nestedValidation => { - validationItemRequestToValidationBuilders(nestedValidation) - .map(joinValidation.withValidation) - }) - ).getOrElse(List()) - upstreamWithValidations - } else { - LOGGER.error(s"Validation upstream task name is not defined in task list, unable to execute upstream validations, " + - s"validation-upstream-task-name=$upstreamTaskName") - List() - } - }) - } - - private def groupByValidationMapping(baseValid: GroupByValidationBuilder, optNestedValidations: Option[ValidationItemRequests]) = { - optNestedValidations.map(validationReqs => { - validationReqs.validations.flatMap(validationReq => { - // only column validations can be applied after group by - validationReq.options.map(opts => { - // check for aggType and aggCol - (opts.get(VALIDATION_AGGREGATION_TYPE), opts.get(VALIDATION_AGGREGATION_COLUMN)) match { - case (Some(aggType), Some(aggCol)) => - val aggregateValidation = aggType match { - case VALIDATION_MIN => baseValid.min(aggCol) - case VALIDATION_MAX => baseValid.max(aggCol) - case VALIDATION_COUNT => baseValid.count(aggCol) - case VALIDATION_SUM => baseValid.sum(aggCol) - case VALIDATION_AVERAGE => baseValid.avg(aggCol) - case VALIDATION_STANDARD_DEVIATION => baseValid.stddev(aggCol) - case _ => throw new IllegalArgumentException(s"Unexpected aggregation type found 
in group by validation, aggregation-type=$aggType") - } - opts.filter(o => o._1 != VALIDATION_AGGREGATION_TYPE && o._1 != VALIDATION_AGGREGATION_COLUMN) - .map(opt => columnValidationMapping(aggregateValidation, opts, opt._2, opt)) - .toList - case _ => throw new IllegalArgumentException("Keys 'aggType' and 'aggCol' are expected when defining a group by validation") - } - }).getOrElse(List()) - }) - }).getOrElse(List()) - } - - private def columnNamesValidationMapping(baseValid: ColumnNamesValidationBuilder, opts: Map[String, String], opt: (String, String)) = { - opt._1 match { - case VALIDATION_COLUMN_NAMES_COUNT_EQUAL => baseValid.countEqual(opt._2.toInt) - case VALIDATION_COLUMN_NAMES_COUNT_BETWEEN => - val min = opts(VALIDATION_MIN) - val max = opts(VALIDATION_MAX) - baseValid.countBetween(min.toInt, max.toInt) - case VALIDATION_COLUMN_NAMES_MATCH_ORDER => baseValid.matchOrder(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) - case VALIDATION_COLUMN_NAMES_MATCH_SET => baseValid.matchSet(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) - case _ => - LOGGER.warn("Unknown column name validation type, defaulting to column name count equal to 1") - baseValid.countEqual(1) - } - } - - private def columnValidationMapping(baseValid: ColumnValidationBuilder, opts: Map[String, String], colName: String, opt: (String, String)) = { - opt._1 match { - case VALIDATION_EQUAL => baseValid.isEqualCol(opt._2) - case VALIDATION_NOT_EQUAL => baseValid.isNotEqualCol(opt._2) - case VALIDATION_NULL => baseValid.isNull - case VALIDATION_NOT_NULL => baseValid.isNotNull - case VALIDATION_CONTAINS => baseValid.contains(opt._2) - case VALIDATION_NOT_CONTAINS => baseValid.notContains(opt._2) - case VALIDATION_UNIQUE => validationWithDescriptionAndErrorThreshold(opts).unique(colName) - case VALIDATION_LESS_THAN => baseValid.lessThan(opt._2) - case VALIDATION_LESS_THAN_OR_EQUAL => baseValid.lessThanOrEqual(opt._2) - case VALIDATION_GREATER_THAN => baseValid.greaterThan(opt._2) - case VALIDATION_GREATER_THAN_OR_EQUAL => baseValid.greaterThanOrEqual(opt._2) - case VALIDATION_BETWEEN => - val min = opts(VALIDATION_MIN) - val max = opts(VALIDATION_MAX) - baseValid.betweenCol(min, max) - case VALIDATION_NOT_BETWEEN => - val min = opts(VALIDATION_MIN) - val max = opts(VALIDATION_MAX) - baseValid.notBetweenCol(min, max) - case VALIDATION_IN => baseValid.in(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) - case VALIDATION_NOT_IN => baseValid.notIn(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) - case VALIDATION_MATCHES => baseValid.matches(opt._2) - case VALIDATION_NOT_MATCHES => baseValid.notMatches(opt._2) - case VALIDATION_STARTS_WITH => baseValid.startsWith(opt._2) - case VALIDATION_NOT_STARTS_WITH => baseValid.notStartsWith(opt._2) - case VALIDATION_ENDS_WITH => baseValid.endsWith(opt._2) - case VALIDATION_NOT_ENDS_WITH => baseValid.notEndsWith(opt._2) - case VALIDATION_SIZE => baseValid.size(opt._2.toInt) - case VALIDATION_NOT_SIZE => baseValid.notSize(opt._2.toInt) - case VALIDATION_LESS_THAN_SIZE => baseValid.lessThanSize(opt._2.toInt) - case VALIDATION_LESS_THAN_OR_EQUAL_SIZE => baseValid.lessThanOrEqualSize(opt._2.toInt) - case VALIDATION_GREATER_THAN_SIZE => baseValid.greaterThanSize(opt._2.toInt) - case VALIDATION_GREATER_THAN_OR_EQUAL_SIZE => baseValid.greaterThanOrEqualSize(opt._2.toInt) - case VALIDATION_LUHN_CHECK => baseValid.luhnCheck - case VALIDATION_HAS_TYPE => baseValid.hasType(opt._2) - case VALIDATION_SQL => baseValid.expr(opt._2) - case x => throw new IllegalArgumentException(s"Unsupported column 
validation, validation-key=$x") - } - } - - private def createFileConnection(dataSourceRequest: DataSourceRequest, format: String): FileBuilder = { - val opt = dataSourceRequest.options.getOrElse(Map()) - checkOptions(dataSourceRequest.name, List(PATH), opt) - ConnectionConfigWithTaskBuilder().file(dataSourceRequest.name, format, opt(PATH), opt) - } - - private def createIcebergConnection(dataSourceRequest: DataSourceRequest): FileBuilder = { - val opt = dataSourceRequest.options.getOrElse(Map()) - val name = dataSourceRequest.name - checkOptions(name, List(ICEBERG_CATALOG_TYPE, TABLE), opt) - val baseSparkOpts = Map( - SPARK_ICEBERG_CATALOG_TYPE -> opt(ICEBERG_CATALOG_TYPE), - TABLE -> opt(TABLE) - ) - val sparkOpts = opt(ICEBERG_CATALOG_TYPE) match { - case ICEBERG_CATALOG_HADOOP | ICEBERG_CATALOG_GLUE => - checkOptions(name, List(PATH), opt) - Map(SPARK_ICEBERG_CATALOG_WAREHOUSE -> opt(PATH)) - case ICEBERG_CATALOG_HIVE | ICEBERG_CATALOG_REST => - checkOptions(name, List(ICEBERG_CATALOG_URI), opt) - Map(SPARK_ICEBERG_CATALOG_URI -> opt(ICEBERG_CATALOG_URI)) - case _ => Map() - } - ConnectionConfigWithTaskBuilder().file(name, ICEBERG, opt.getOrElse(PATH, ""), baseSparkOpts ++ sparkOpts) - } - - private def createJdbcConnection(dataSourceRequest: DataSourceRequest, format: String): JdbcBuilder[_] = { - val opt = dataSourceRequest.options.getOrElse(Map()) - checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD), opt) - val connectionConfigWithTaskBuilder = ConnectionConfigWithTaskBuilder() - - val baseConnection = format match { - case POSTGRES => connectionConfigWithTaskBuilder.postgres(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) - case MYSQL => connectionConfigWithTaskBuilder.mysql(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) - case x => throw new IllegalArgumentException(s"Unsupported connection format, format=$x") - } - - (opt.get(SCHEMA), opt.get(TABLE)) match { - case (Some(schema), Some(table)) => baseConnection.table(schema, table) - case (Some(schema), None) => - assert(schema.nonEmpty, s"Empty schema name for $format connection, data-source-name=${dataSourceRequest.name}") - throw new IllegalArgumentException(s"Missing table name for $format connection, data-source-name=${dataSourceRequest.name}, schema=$schema") - case (None, Some(table)) => - assert(table.nonEmpty, s"Empty table name for $format connection, data-source-name=${dataSourceRequest.name}") - throw new IllegalArgumentException(s"Missing schema name for $format connection, data-source-name=${dataSourceRequest.name}, table=$table") - case (None, None) => baseConnection // TODO this is allowed only when there is metadata collection enabled - } - } - - private def createCassandraConnection(dataSourceRequest: DataSourceRequest) = { - val opt = dataSourceRequest.options.getOrElse(Map()) - checkOptions(dataSourceRequest.name, List(URL, USERNAME, PASSWORD), opt) - - val cassandraConnection = ConnectionConfigWithTaskBuilder().cassandra(dataSourceRequest.name, opt(URL), opt(USERNAME), opt(PASSWORD), opt) - (opt.get(CASSANDRA_KEYSPACE), opt.get(CASSANDRA_TABLE)) match { - case (Some(keyspace), Some(table)) => cassandraConnection.table(keyspace, table) - case (Some(keyspace), None) => - assert(keyspace.nonEmpty, s"Empty keyspace name for Cassandra connection, data-source-name=${dataSourceRequest.name}") - throw new IllegalArgumentException(s"Missing table name for Cassandra connection, data-source-name=${dataSourceRequest.name}, keyspace=$keyspace") - case (None, 
Some(table)) => - assert(table.nonEmpty, s"Empty table name for Cassandra connection, data-source-name=${dataSourceRequest.name}") - throw new IllegalArgumentException(s"Missing keyspace name for Cassandra connection, data-source-name=${dataSourceRequest.name}, table=$table") - case (None, None) => cassandraConnection // TODO this is allowed only when there is metadata collection enabled - } - } - - private def checkOptions(dataSourceName: String, requiredOptions: List[String], options: Map[String, String]): Unit = { + def checkOptions(dataSourceName: String, requiredOptions: List[String], options: Map[String, String]): Unit = { requiredOptions.foreach(opt => assert( options.contains(opt) && options(opt).nonEmpty, @@ -598,5 +56,4 @@ object UiMapper { ) ) } - } diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapper.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapper.scala new file mode 100644 index 0000000..396fd74 --- /dev/null +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapper.scala @@ -0,0 +1,207 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.connection.ConnectionTaskBuilder +import io.github.datacatering.datacaterer.api.model.Constants._ +import io.github.datacatering.datacaterer.api.{ColumnNamesValidationBuilder, ColumnValidationBuilder, GroupByValidationBuilder, ValidationBuilder} +import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, ValidationItemRequest, ValidationItemRequests} +import org.apache.log4j.Logger + +object ValidationMapper { + + private val LOGGER = Logger.getLogger(getClass.getName) + + def validationMapping(dataSourceRequest: DataSourceRequest): List[ValidationBuilder] = { + dataSourceRequest.validations + .map(validations => validations.optValidations.map(v => v.flatMap(validationItemRequestToValidationBuilders)).getOrElse(List())) + .getOrElse(List()) + } + + private def validationItemRequestToValidationBuilders(validateItem: ValidationItemRequest): List[ValidationBuilder] = { + validateItem.`type` match { + case VALIDATION_COLUMN => + //map type of column validation to builder method + //each option is a new validation + validateItem.options.map(opts => { + val colName = opts(VALIDATION_FIELD) + opts + .filter(o => !VALIDATION_SUPPORTING_OPTIONS.contains(o._1)) + .map(opt => { + val baseValid = validationWithDescriptionAndErrorThreshold(opts).col(colName) + columnValidationMapping(baseValid, opts, colName, opt) + }) + .toList + }).getOrElse(List()) + case VALIDATION_COLUMN_NAMES => + validateItem.options.map(opts => { + opts + .filter(o => !VALIDATION_SUPPORTING_OPTIONS.contains(o._1)) + .map(opt => { + val baseValid = validationWithDescriptionAndErrorThreshold(opts).columnNames + columnNamesValidationMapping(baseValid, opts, opt) + }) + .toList + }).getOrElse(List()) + case VALIDATION_UPSTREAM => + // require upstream ConnectionTaskBuilder + List() + case VALIDATION_GROUP_BY => + validateItem.options.map(opts => { + val groupByCols = opts(VALIDATION_GROUP_BY_COLUMNS).split(VALIDATION_OPTION_DELIMITER) + val baseValid = validationWithDescriptionAndErrorThreshold(opts).groupBy(groupByCols: _*) + groupByValidationMapping(baseValid, validateItem.nested) + }).getOrElse(List()) + case _ => List() + } + } + + private def validationWithDescriptionAndErrorThreshold(options: Map[String, String]): ValidationBuilder = { + val optDescription = 
options.get(VALIDATION_DESCRIPTION) + val optErrorThreshold = options.get(VALIDATION_ERROR_THRESHOLD) + val baseValidation = ValidationBuilder() + val validWithDesc = optDescription.map(desc => baseValidation.description(desc)).getOrElse(baseValidation) + optErrorThreshold.map(error => validWithDesc.errorThreshold(error.toDouble)).getOrElse(validWithDesc) + } + + def connectionsWithUpstreamValidationMapping(connections: List[ConnectionTaskBuilder[_]], dataSources: List[DataSourceRequest]): List[ConnectionTaskBuilder[_]] = { + val dataSourcesWithUpstreamValidation = dataSources + .filter(ds => { + ds.validations + .map(_.optValidations.getOrElse(List())).getOrElse(List()) + .exists(_.`type` == VALIDATION_UPSTREAM) + }) + .map(ds => (ds.taskName, ds.validations.map(_.optValidations.get).getOrElse(List()))) + .toMap + + connections.map(connection => { + val connectionTaskName = connection.task.map(_.task.name).getOrElse("") + val optDataSourceWithUpstreamValidation = dataSourcesWithUpstreamValidation.get(connectionTaskName) + optDataSourceWithUpstreamValidation match { + case Some(value) => + val upstreamValidations = value.filter(_.`type` == VALIDATION_UPSTREAM) + val mappedValidations = upstreamValidationMapping(connections, upstreamValidations) + val allValidations = connection.getValidations ++ mappedValidations + connection.validations(allValidations: _*).enableDataValidation(allValidations.nonEmpty) + case None => + LOGGER.debug(s"Task does not have any upstream validations defined, task-name=$connectionTaskName") + connection + } + }) + } + + private def upstreamValidationMapping(connections: List[ConnectionTaskBuilder[_]], upstreamValidations: List[ValidationItemRequest]): List[ValidationBuilder] = { + upstreamValidations.flatMap(upstreamValidation => { + def getOption(k: String): Option[String] = upstreamValidation.options.flatMap(_.get(k)) + + val upstreamTaskName = getOption(VALIDATION_UPSTREAM_TASK_NAME).getOrElse("") + val upstreamConnection = connections.find(_.task.exists(_.task.name == upstreamTaskName)) + + if (upstreamConnection.isDefined) { + val baseValid = validationWithDescriptionAndErrorThreshold(upstreamValidation.options.getOrElse(Map())).upstreamData(upstreamConnection.get) + + // check for join options + val joinValidation = (getOption(VALIDATION_UPSTREAM_JOIN_TYPE), getOption(VALIDATION_UPSTREAM_JOIN_COLUMNS), getOption(VALIDATION_UPSTREAM_JOIN_EXPR)) match { + case (Some(joinType), Some(joinCols), _) => baseValid.joinType(joinType).joinColumns(joinCols.split(","): _*) + case (Some(joinType), None, Some(joinExpr)) => baseValid.joinType(joinType).joinExpr(joinExpr) + case (None, Some(joinCols), _) => baseValid.joinType(DEFAULT_VALIDATION_JOIN_TYPE).joinColumns(joinCols.split(","): _*) + case (None, None, Some(joinExpr)) => baseValid.joinType(DEFAULT_VALIDATION_JOIN_TYPE).joinExpr(joinExpr) + case _ => throw new IllegalArgumentException("Unexpected upstream validation join options, need to define join columns or expression") + } + val upstreamWithValidations = upstreamValidation.nested.map(nest => + nest.validations.flatMap(nestedValidation => { + validationItemRequestToValidationBuilders(nestedValidation) + .map(joinValidation.withValidation) + }) + ).getOrElse(List()) + upstreamWithValidations + } else { + LOGGER.error(s"Validation upstream task name is not defined in task list, unable to execute upstream validations, " + + s"validation-upstream-task-name=$upstreamTaskName") + List() + } + }) + } + + private def groupByValidationMapping(baseValid: 
GroupByValidationBuilder, optNestedValidations: Option[ValidationItemRequests]) = { + optNestedValidations.map(validationReqs => { + validationReqs.validations.flatMap(validationReq => { + // only column validations can be applied after group by + validationReq.options.map(opts => { + // check for aggType and aggCol + (opts.get(VALIDATION_AGGREGATION_TYPE), opts.get(VALIDATION_AGGREGATION_COLUMN)) match { + case (Some(aggType), Some(aggCol)) => + val aggregateValidation = aggType match { + case VALIDATION_MIN => baseValid.min(aggCol) + case VALIDATION_MAX => baseValid.max(aggCol) + case VALIDATION_COUNT => baseValid.count(aggCol) + case VALIDATION_SUM => baseValid.sum(aggCol) + case VALIDATION_AVERAGE => baseValid.avg(aggCol) + case VALIDATION_STANDARD_DEVIATION => baseValid.stddev(aggCol) + case _ => throw new IllegalArgumentException(s"Unexpected aggregation type found in group by validation, aggregation-type=$aggType") + } + opts.filter(o => o._1 != VALIDATION_AGGREGATION_TYPE && o._1 != VALIDATION_AGGREGATION_COLUMN) + .map(opt => columnValidationMapping(aggregateValidation, opts, opt._2, opt)) + .toList + case _ => throw new IllegalArgumentException("Keys 'aggType' and 'aggCol' are expected when defining a group by validation") + } + }).getOrElse(List()) + }) + }).getOrElse(List()) + } + + private def columnNamesValidationMapping(baseValid: ColumnNamesValidationBuilder, opts: Map[String, String], opt: (String, String)) = { + opt._1 match { + case VALIDATION_COLUMN_NAMES_COUNT_EQUAL => baseValid.countEqual(opt._2.toInt) + case VALIDATION_COLUMN_NAMES_COUNT_BETWEEN => + val min = opts(VALIDATION_MIN) + val max = opts(VALIDATION_MAX) + baseValid.countBetween(min.toInt, max.toInt) + case VALIDATION_COLUMN_NAMES_MATCH_ORDER => baseValid.matchOrder(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) + case VALIDATION_COLUMN_NAMES_MATCH_SET => baseValid.matchSet(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) + case _ => + LOGGER.warn("Unknown column name validation type, defaulting to column name count equal to 1") + baseValid.countEqual(1) + } + } + + private def columnValidationMapping(baseValid: ColumnValidationBuilder, opts: Map[String, String], colName: String, opt: (String, String)) = { + opt._1 match { + case VALIDATION_EQUAL => baseValid.isEqualCol(opt._2) + case VALIDATION_NOT_EQUAL => baseValid.isNotEqualCol(opt._2) + case VALIDATION_NULL => baseValid.isNull + case VALIDATION_NOT_NULL => baseValid.isNotNull + case VALIDATION_CONTAINS => baseValid.contains(opt._2) + case VALIDATION_NOT_CONTAINS => baseValid.notContains(opt._2) + case VALIDATION_UNIQUE => validationWithDescriptionAndErrorThreshold(opts).unique(colName) + case VALIDATION_LESS_THAN => baseValid.lessThan(opt._2) + case VALIDATION_LESS_THAN_OR_EQUAL => baseValid.lessThanOrEqual(opt._2) + case VALIDATION_GREATER_THAN => baseValid.greaterThan(opt._2) + case VALIDATION_GREATER_THAN_OR_EQUAL => baseValid.greaterThanOrEqual(opt._2) + case VALIDATION_BETWEEN => + val min = opts(VALIDATION_MIN) + val max = opts(VALIDATION_MAX) + baseValid.betweenCol(min, max) + case VALIDATION_NOT_BETWEEN => + val min = opts(VALIDATION_MIN) + val max = opts(VALIDATION_MAX) + baseValid.notBetweenCol(min, max) + case VALIDATION_IN => baseValid.in(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) + case VALIDATION_NOT_IN => baseValid.notIn(opt._2.split(VALIDATION_OPTION_DELIMITER): _*) + case VALIDATION_MATCHES => baseValid.matches(opt._2) + case VALIDATION_NOT_MATCHES => baseValid.notMatches(opt._2) + case VALIDATION_STARTS_WITH => 
baseValid.startsWith(opt._2) + case VALIDATION_NOT_STARTS_WITH => baseValid.notStartsWith(opt._2) + case VALIDATION_ENDS_WITH => baseValid.endsWith(opt._2) + case VALIDATION_NOT_ENDS_WITH => baseValid.notEndsWith(opt._2) + case VALIDATION_SIZE => baseValid.size(opt._2.toInt) + case VALIDATION_NOT_SIZE => baseValid.notSize(opt._2.toInt) + case VALIDATION_LESS_THAN_SIZE => baseValid.lessThanSize(opt._2.toInt) + case VALIDATION_LESS_THAN_OR_EQUAL_SIZE => baseValid.lessThanOrEqualSize(opt._2.toInt) + case VALIDATION_GREATER_THAN_SIZE => baseValid.greaterThanSize(opt._2.toInt) + case VALIDATION_GREATER_THAN_OR_EQUAL_SIZE => baseValid.greaterThanOrEqualSize(opt._2.toInt) + case VALIDATION_LUHN_CHECK => baseValid.luhnCheck + case VALIDATION_HAS_TYPE => baseValid.hasType(opt._2) + case VALIDATION_SQL => baseValid.expr(opt._2) + case x => throw new IllegalArgumentException(s"Unsupported column validation, validation-key=$x") + } + } +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapperTest.scala new file mode 100644 index 0000000..870f192 --- /dev/null +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConfigurationMapperTest.scala @@ -0,0 +1,141 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.DataCatererConfigurationBuilder +import io.github.datacatering.datacaterer.api.model.Constants._ +import io.github.datacatering.datacaterer.core.ui.model.ConfigurationRequest +import org.junit.runner.RunWith +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class ConfigurationMapperTest extends AnyFunSuite { + + test("Can convert UI flag config") { + val configRequest = ConfigurationRequest(flag = Map( + CONFIG_FLAGS_COUNT -> "false", + CONFIG_FLAGS_GENERATE_DATA -> "false", + CONFIG_FLAGS_RECORD_TRACKING -> "false", + CONFIG_FLAGS_DELETE_GENERATED_RECORDS -> "false", + CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS -> "false", + CONFIG_FLAGS_FAIL_ON_ERROR -> "false", + CONFIG_FLAGS_UNIQUE_CHECK -> "false", + CONFIG_FLAGS_SINK_METADATA -> "false", + CONFIG_FLAGS_SAVE_REPORTS -> "false", + CONFIG_FLAGS_VALIDATION -> "false", + CONFIG_FLAGS_GENERATE_VALIDATIONS -> "false", + CONFIG_FLAGS_ALERTS -> "false", + "blah" -> "false" + )) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapFlagsConfiguration(configRequest, baseConf).build + + assert(!res.flagsConfig.enableCount) + assert(!res.flagsConfig.enableGenerateData) + assert(!res.flagsConfig.enableRecordTracking) + assert(!res.flagsConfig.enableDeleteGeneratedRecords) + assert(!res.flagsConfig.enableGeneratePlanAndTasks) + assert(!res.flagsConfig.enableFailOnError) + assert(!res.flagsConfig.enableUniqueCheck) + assert(!res.flagsConfig.enableSinkMetadata) + assert(!res.flagsConfig.enableSaveReports) + assert(!res.flagsConfig.enableValidation) + assert(!res.flagsConfig.enableGenerateValidations) + assert(!res.flagsConfig.enableAlerts) + } + + test("Can convert UI alert config") { + val configRequest = ConfigurationRequest(alert = Map(CONFIG_ALERT_TRIGGER_ON -> "failure", CONFIG_ALERT_SLACK_TOKEN -> "abc123", + CONFIG_ALERT_SLACK_CHANNELS -> "job-fail", "blah" -> "hello")) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapAlertConfiguration(configRequest, baseConf).build + + 
assert(res.alertConfig.triggerOn == "failure") + assert(res.alertConfig.slackAlertConfig.token == "abc123") + assert(res.alertConfig.slackAlertConfig.channels == List("job-fail")) + } + + test("Can convert UI validation config") { + val configRequest = ConfigurationRequest(validation = Map(CONFIG_VALIDATION_NUM_SAMPLE_ERROR_RECORDS -> "2", + CONFIG_VALIDATION_ENABLE_DELETE_RECORD_TRACKING_FILES -> "false", "blah" -> "hello")) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapValidationConfiguration(configRequest, baseConf).build + + assert(res.validationConfig.numSampleErrorRecords == 2) + assert(!res.validationConfig.enableDeleteRecordTrackingFiles) + } + + test("Can convert UI generation config") { + val configRequest = ConfigurationRequest(generation = Map(CONFIG_GENERATION_NUM_RECORDS_PER_BATCH -> "100", + CONFIG_GENERATION_NUM_RECORDS_PER_STEP -> "10", "blah" -> "hello")) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapGenerationConfiguration(configRequest, baseConf).build + + assert(res.generationConfig.numRecordsPerBatch == 100) + assert(res.generationConfig.numRecordsPerStep.contains(10)) + } + + test("Can convert UI metadata config") { + val configRequest = ConfigurationRequest(metadata = Map( + CONFIG_METADATA_NUM_RECORDS_FROM_DATA_SOURCE -> "100", + CONFIG_METADATA_NUM_RECORDS_FOR_ANALYSIS -> "10", + CONFIG_METADATA_ONE_OF_DISTINCT_COUNT_VS_COUNT_THRESHOLD -> "1", + CONFIG_METADATA_ONE_OF_MIN_COUNT -> "5", + CONFIG_METADATA_NUM_GENERATED_SAMPLES -> "7", + "blah" -> "hello" + )) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapMetadataConfiguration(configRequest, baseConf).build + + assert(res.metadataConfig.numRecordsFromDataSource == 100) + assert(res.metadataConfig.numRecordsForAnalysis == 10) + assert(res.metadataConfig.oneOfDistinctCountVsCountThreshold == 1) + assert(res.metadataConfig.oneOfMinCount == 5) + assert(res.metadataConfig.numGeneratedSamples == 7) + } + + test("Can convert UI folder config") { + val configRequest = ConfigurationRequest(folder = Map( + CONFIG_FOLDER_PLAN_FILE_PATH -> "/tmp/plan-file", + CONFIG_FOLDER_TASK_FOLDER_PATH -> "/tmp/task-folder", + CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH -> "/tmp/gen", + CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH -> "/tmp/report", + CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH -> "/tmp/record", + CONFIG_FOLDER_VALIDATION_FOLDER_PATH -> "/tmp/valid", + CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH -> "/tmp/record-valid", + "blah" -> "hello" + )) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapFolderConfiguration(configRequest, "/my-install", baseConf).build + + assert(res.foldersConfig.planFilePath == "/tmp/plan-file") + assert(res.foldersConfig.taskFolderPath == "/tmp/task-folder") + assert(res.foldersConfig.generatedPlanAndTaskFolderPath == "/tmp/gen") + assert(res.foldersConfig.generatedReportsFolderPath == "/tmp/report") + assert(res.foldersConfig.recordTrackingFolderPath == "/tmp/record") + assert(res.foldersConfig.validationFolderPath == "/tmp/valid") + assert(res.foldersConfig.recordTrackingForValidationFolderPath == "/tmp/record-valid") + } + + test("Can convert UI folder config with install directory") { + val configRequest = ConfigurationRequest(folder = Map( + CONFIG_FOLDER_PLAN_FILE_PATH -> "", + CONFIG_FOLDER_TASK_FOLDER_PATH -> "", + CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH -> "", + CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH -> "", + 
CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH -> "", + CONFIG_FOLDER_VALIDATION_FOLDER_PATH -> "", + CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH -> "", + "blah" -> "hello" + )) + val baseConf = DataCatererConfigurationBuilder() + val res = ConfigurationMapper.mapFolderConfiguration(configRequest, "/my-install", baseConf).build + + assert(res.foldersConfig.planFilePath == DEFAULT_PLAN_FILE_PATH) + assert(res.foldersConfig.taskFolderPath == "/my-install/task") + assert(res.foldersConfig.generatedPlanAndTaskFolderPath == "/my-install/generated-plan-task") + assert(res.foldersConfig.generatedReportsFolderPath == "/my-install/report") + assert(res.foldersConfig.recordTrackingFolderPath == "/my-install/record-tracking") + assert(res.foldersConfig.validationFolderPath == "/my-install/validation") + assert(res.foldersConfig.recordTrackingForValidationFolderPath == "/my-install/record-tracking-validation") + } +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapperTest.scala new file mode 100644 index 0000000..bfae910 --- /dev/null +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ConnectionMapperTest.scala @@ -0,0 +1,179 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.model.Constants.{CASSANDRA, CASSANDRA_KEYSPACE, CASSANDRA_NAME, CASSANDRA_TABLE, CSV, DRIVER, FORMAT, HTTP, JDBC, JMS, JMS_CONNECTION_FACTORY, JMS_DESTINATION_NAME, JMS_INITIAL_CONTEXT_FACTORY, JMS_VPN_NAME, JSON, KAFKA, KAFKA_TOPIC, MYSQL, MYSQL_DRIVER, ORC, PARQUET, PARTITIONS, PARTITION_BY, PASSWORD, PATH, POSTGRES, POSTGRES_DRIVER, SCHEMA, SOLACE, TABLE, URL, USERNAME} +import io.github.datacatering.datacaterer.core.ui.model.DataSourceRequest +import org.junit.runner.RunWith +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class ConnectionMapperTest extends AnyFunSuite { + + test("Can convert UI connection mapping for Cassandra") { + val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult("cassandra-name")(res.dataSourceName) + assertResult(8)(res.options.size) + assertResult(Some(CASSANDRA))(res.options.get(FORMAT)) + assertResult(Some("localhost:9092"))(res.options.get(URL)) + assertResult(Some("localhost"))(res.options.get("spark.cassandra.connection.host")) + assertResult(Some("9092"))(res.options.get("spark.cassandra.connection.port")) + assertResult(Some("cassandra"))(res.options.get(USERNAME)) + assertResult(Some("cassandra"))(res.options.get(PASSWORD)) + assertResult(Some("cassandra"))(res.options.get("spark.cassandra.auth.username")) + assertResult(Some("cassandra"))(res.options.get("spark.cassandra.auth.password")) + } + + test("Can convert UI connection mapping for Cassandra with keyspace and table") { + val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_KEYSPACE -> "account", CASSANDRA_TABLE -> "accounts"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + 
assertResult("cassandra-name")(res.dataSourceName) + assertResult(10)(res.options.size) + assertResult(Some(CASSANDRA))(res.options.get(FORMAT)) + assertResult(Some("localhost:9092"))(res.options.get(URL)) + assertResult(Some("account"))(res.options.get(CASSANDRA_KEYSPACE)) + assertResult(Some("accounts"))(res.options.get(CASSANDRA_TABLE)) + } + + test("Throw error if only keyspace or table is defined for Cassandra") { + val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_KEYSPACE -> "account"))) + val dataSourceRequest1 = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_TABLE -> "accounts"))) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest)) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest1)) + } + + test("Can convert UI connection mapping for Postgres") { + val dataSourceRequest = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult("postgres-name")(res.dataSourceName) + assertResult(5)(res.options.size) + assertResult(Some(JDBC))(res.options.get(FORMAT)) + assertResult(Some(POSTGRES_DRIVER))(res.options.get(DRIVER)) + assertResult(Some("localhost:5432"))(res.options.get(URL)) + assertResult(Some("postgres"))(res.options.get(USERNAME)) + assertResult(Some("postgres"))(res.options.get(PASSWORD)) + } + + test("Can convert UI connection mapping for MySQL") { + val dataSourceRequest = DataSourceRequest("mysql-name", "task-1", Some(MYSQL), Some(Map(URL -> "localhost:5432", USERNAME -> "root", PASSWORD -> "root", DRIVER -> MYSQL_DRIVER))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult("mysql-name")(res.dataSourceName) + assertResult(5)(res.options.size) + assertResult(Some(JDBC))(res.options.get(FORMAT)) + assertResult(Some(MYSQL_DRIVER))(res.options.get(DRIVER)) + assertResult(Some("localhost:5432"))(res.options.get(URL)) + assertResult(Some("root"))(res.options.get(USERNAME)) + assertResult(Some("root"))(res.options.get(PASSWORD)) + } + + test("Can convert UI connection mapping for Postgres with schema and table") { + val dataSourceRequest = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, SCHEMA -> "account", TABLE -> "accounts"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult("postgres-name")(res.dataSourceName) + assertResult(7)(res.options.size) + assertResult(Some(JDBC))(res.options.get(FORMAT)) + assertResult(Some(POSTGRES_DRIVER))(res.options.get(DRIVER)) + assertResult(Some("localhost:5432"))(res.options.get(URL)) + assertResult(Some("postgres"))(res.options.get(USERNAME)) + assertResult(Some("postgres"))(res.options.get(PASSWORD)) + assertResult(Some("account"))(res.options.get(SCHEMA)) + assertResult(Some("accounts"))(res.options.get(TABLE)) + } + + test("Throw error if only schema or table is defined for Postgres") { + val dataSourceRequest = 
DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, SCHEMA -> "account"))) + val dataSourceRequest1 = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, TABLE -> "accounts"))) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest)) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest1)) + } + + test("Can convert UI connection mapping for CSV") { + val dataSourceRequest = DataSourceRequest("csv-name", "task-1", Some(CSV), Some(Map(PATH -> "/tmp/my-csv"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(2)(res.options.size) + assertResult(Some(CSV))(res.options.get(FORMAT)) + assertResult(Some("/tmp/my-csv"))(res.options.get(PATH)) + } + + test("Can convert UI connection mapping for CSV with partitions and partitionBy") { + val dataSourceRequest = DataSourceRequest("csv-name", "task-1", Some(CSV), Some(Map(PATH -> "/tmp/my-csv", PARTITIONS -> "2", PARTITION_BY -> "account_id,year"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(4)(res.options.size) + assertResult(Some(CSV))(res.options.get(FORMAT)) + assertResult(Some("/tmp/my-csv"))(res.options.get(PATH)) + assertResult(Some("2"))(res.options.get(PARTITIONS)) + assertResult(Some("account_id,year"))(res.options.get(PARTITION_BY)) + } + + test("Can convert UI connection mapping for JSON") { + val dataSourceRequest = DataSourceRequest("json-name", "task-1", Some(JSON), Some(Map(PATH -> "/tmp/my-json"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(2)(res.options.size) + assertResult(Some(JSON))(res.options.get(FORMAT)) + assertResult(Some("/tmp/my-json"))(res.options.get(PATH)) + } + + test("Can convert UI connection mapping for Parquet") { + val dataSourceRequest = DataSourceRequest("parquet-name", "task-1", Some(PARQUET), Some(Map(PATH -> "/tmp/my-parquet"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(2)(res.options.size) + assertResult(Some(PARQUET))(res.options.get(FORMAT)) + assertResult(Some("/tmp/my-parquet"))(res.options.get(PATH)) + } + + test("Can convert UI connection mapping for ORC") { + val dataSourceRequest = DataSourceRequest("orc-name", "task-1", Some(ORC), Some(Map(PATH -> "/tmp/my-orc"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(2)(res.options.size) + assertResult(Some(ORC))(res.options.get(FORMAT)) + assertResult(Some("/tmp/my-orc"))(res.options.get(PATH)) + } + + test("Can convert UI connection mapping for Solace") { + val dataSourceRequest = DataSourceRequest("solace-name", "task-1", Some(SOLACE), Some(Map(URL -> "localhost:55554", USERNAME -> "solace", PASSWORD -> "solace", JMS_DESTINATION_NAME -> "/JNDI/my-queue", JMS_VPN_NAME -> "default", JMS_CONNECTION_FACTORY -> "jms-connection", JMS_INITIAL_CONTEXT_FACTORY -> "jms-init"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(8)(res.options.size) + assertResult(Some(JMS))(res.options.get(FORMAT)) + assertResult(Some("localhost:55554"))(res.options.get(URL)) + 
assertResult(Some("solace"))(res.options.get(USERNAME)) + assertResult(Some("solace"))(res.options.get(PASSWORD)) + assertResult(Some("/JNDI/my-queue"))(res.options.get(JMS_DESTINATION_NAME)) + assertResult(Some("default"))(res.options.get(JMS_VPN_NAME)) + assertResult(Some("jms-connection"))(res.options.get(JMS_CONNECTION_FACTORY)) + assertResult(Some("jms-init"))(res.options.get(JMS_INITIAL_CONTEXT_FACTORY)) + } + + test("Can convert UI connection mapping for Kafka") { + val dataSourceRequest = DataSourceRequest("kafka-name", "task-1", Some(KAFKA), Some(Map(URL -> "localhost:1234", KAFKA_TOPIC -> "my-topic"))) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(4)(res.options.size) + assertResult(Some(KAFKA))(res.options.get(FORMAT)) + assertResult(Some("localhost:1234"))(res.options.get(URL)) + assertResult(Some("localhost:1234"))(res.options.get("kafka.bootstrap.servers")) + assertResult(Some("my-topic"))(res.options.get(KAFKA_TOPIC)) + } + + test("Can convert UI connection mapping for HTTP") { + val dataSourceRequest = DataSourceRequest( + "http-name", + "task-1", + Some(HTTP), + Some(Map(USERNAME -> "root", PASSWORD -> "root")) + ) + val res = ConnectionMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder + assertResult(3)(res.options.size) + assertResult(Some(HTTP))(res.options.get(FORMAT)) + assertResult(Some("root"))(res.options.get(USERNAME)) + assertResult(Some("root"))(res.options.get(PASSWORD)) + } + + test("Throw exception if provided unknown data source") { + val dataSourceRequest = DataSourceRequest("unknown-name", "task-1", Some("unknown")) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest)) + } + + test("Throw exception if no data source type provided") { + val dataSourceRequest = DataSourceRequest("unknown-name", "task-1", None) + assertThrows[IllegalArgumentException](ConnectionMapper.connectionMapping(dataSourceRequest)) + } +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapperTest.scala new file mode 100644 index 0000000..c09684f --- /dev/null +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/CountMapperTest.scala @@ -0,0 +1,82 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.model.Constants.{DISTRIBUTION, DISTRIBUTION_EXPONENTIAL, DISTRIBUTION_NORMAL, DISTRIBUTION_RATE_PARAMETER, MAXIMUM, MINIMUM} +import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, RecordCountRequest} +import org.junit.runner.RunWith +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class CountMapperTest extends AnyFunSuite { + + test("Can convert UI count configuration with base record count") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(Some(10)))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.records.contains(10)) + } + + test("Can convert UI count configuration with base record min and max") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(None, Some(1), Some(10)))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.generator.isDefined) + assert(res.generator.get.options.get(MINIMUM).contains("1")) 
+ assert(res.generator.get.options.get(MAXIMUM).contains("10")) + } + + test("Can convert UI count configuration with per column count") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecords = Some(10)))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.perColumn.isDefined) + assert(res.perColumn.get.columnNames.size == 1) + assert(res.perColumn.get.columnNames.contains("account_id")) + assert(res.perColumn.get.count.contains(10)) + } + + test("Can convert UI count configuration with per column min and max") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsMin = Some(10), perColumnRecordsMax = Some(20)))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.perColumn.isDefined) + assert(res.perColumn.get.columnNames.size == 1) + assert(res.perColumn.get.columnNames.contains("account_id")) + assert(res.perColumn.get.generator.isDefined) + assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("10")) + assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("20")) + } + + test("Can convert UI count configuration with per column distribution") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_EXPONENTIAL), perColumnRecordsDistributionRateParam = Some("0.5")))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.perColumn.isDefined) + assert(res.perColumn.get.columnNames.size == 1) + assert(res.perColumn.get.columnNames.contains("account_id")) + assert(res.perColumn.get.generator.isDefined) + assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_EXPONENTIAL)) + assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION_RATE_PARAMETER).contains("0.5")) + } + + test("Can convert UI count configuration with per column distribution with min and max") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_EXPONENTIAL), perColumnRecordsDistributionRateParam = Some("0.5"), perColumnRecordsMin = Some(1), perColumnRecordsMax = Some(3)))) + val res = CountMapper.countMapping(dataSourceRequest).count + assert(res.perColumn.isDefined) + assert(res.perColumn.get.columnNames.size == 1) + assert(res.perColumn.get.columnNames.contains("account_id")) + assert(res.perColumn.get.generator.isDefined) + assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_EXPONENTIAL)) + assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION_RATE_PARAMETER).contains("0.5")) + assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("1")) + assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("3")) + } + + test("Can convert UI count configuration with per column normal distribution with min and max") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_NORMAL), perColumnRecordsMin = Some(1), perColumnRecordsMax = Some(3)))) + val res = CountMapper.countMapping(dataSourceRequest).count + 
assert(res.perColumn.isDefined) + assert(res.perColumn.get.columnNames.size == 1) + assert(res.perColumn.get.columnNames.contains("account_id")) + assert(res.perColumn.get.generator.isDefined) + assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_NORMAL)) + assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("1")) + assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("3")) + } + +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapperTest.scala new file mode 100644 index 0000000..aa5e303 --- /dev/null +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/FieldMapperTest.scala @@ -0,0 +1,57 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.model.Constants.{HTTP, ONE_OF_GENERATOR, OPEN_API, PASSWORD, REGEX_GENERATOR, SCHEMA_LOCATION, USERNAME} +import io.github.datacatering.datacaterer.core.model.Constants.CONNECTION_TYPE +import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, FieldRequest, FieldRequests, MetadataSourceRequest} +import org.junit.runner.RunWith +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class FieldMapperTest extends AnyFunSuite { + + test("Can convert UI field mapping") { + val dataSourceRequest = DataSourceRequest("plan-name", "task-1", fields = Some(FieldRequests(Some(List( + FieldRequest("name", "string"), + FieldRequest("account_id", "string", Some(Map(REGEX_GENERATOR -> "acc[0-9]{1}"))), + FieldRequest("status", "string", Some(Map(ONE_OF_GENERATOR -> "open,closed"))), + FieldRequest("details", "struct", nested = Some(FieldRequests(Some(List(FieldRequest("age", "integer")))))), + ))))) + val optFieldMapping = FieldMapper.fieldMapping(dataSourceRequest) + assert(optFieldMapping._2.isDefined) + val res = optFieldMapping._2.get + assert(res.size == 4) + val nameField = res.find(_.field.name == "name") + assert(nameField.exists(_.field.`type`.contains("string"))) + val accountField = res.find(_.field.name == "account_id") + assert(accountField.exists(_.field.`type`.contains("string"))) + assert(accountField.exists(_.field.generator.exists(_.`type` == REGEX_GENERATOR))) + assert(accountField.exists(_.field.generator.exists(_.options.get(REGEX_GENERATOR).contains("acc[0-9]{1}")))) + val statusField = res.find(_.field.name == "status") + assert(statusField.exists(_.field.generator.exists(_.`type` == ONE_OF_GENERATOR))) + assert(statusField.exists(_.field.generator.exists(_.options.get(ONE_OF_GENERATOR).contains(List("open", "closed"))))) + val detailsField = res.find(_.field.name == "details") + assertResult(Some("struct<>"))(detailsField.get.field.`type`) + assert(detailsField.exists(_.field.schema.isDefined)) + assert(detailsField.exists(_.field.schema.get.fields.isDefined)) + assert(detailsField.exists(_.field.schema.get.fields.get.size == 1)) + assert(detailsField.exists(_.field.schema.get.fields.get.head.name == "age")) + assert(detailsField.exists(_.field.schema.get.fields.get.head.`type`.contains("integer"))) + } + + test("Can convert UI connection mapping for HTTP with OpenAPI spec as a metadata source") { + val dataSourceRequest = DataSourceRequest( + "http-name", + "task-1", + Some(HTTP), + Some(Map(USERNAME -> "root", PASSWORD -> "root")), + Some(FieldRequests(optMetadataSource = 
Some(MetadataSourceRequest("my-openapi", Some(Map(CONNECTION_TYPE -> OPEN_API, SCHEMA_LOCATION -> "/opt/open-api-spec.json")))))) + ) + val res = FieldMapper.fieldMapping(dataSourceRequest) + assert(res._1.isDefined) + val metadataSource = res._1.get.metadataSource + assertResult(OPEN_API)(metadataSource.`type`) + assertResult(1)(metadataSource.connectionOptions.size) + assertResult("/opt/open-api-spec.json")(metadataSource.connectionOptions(SCHEMA_LOCATION)) + } +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapperTest.scala new file mode 100644 index 0000000..9701d77 --- /dev/null +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ForeignKeyMapperTest.scala @@ -0,0 +1,29 @@ +package io.github.datacatering.datacaterer.core.ui.mapper + +import io.github.datacatering.datacaterer.api.{FieldBuilder, PlanBuilder} +import io.github.datacatering.datacaterer.api.connection.FileBuilder +import io.github.datacatering.datacaterer.core.ui.model.{ForeignKeyRequest, ForeignKeyRequestItem} +import org.junit.runner.RunWith +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class ForeignKeyMapperTest extends AnyFunSuite { + + test("Can convert UI foreign key config") { + val foreignKeyRequests = List(ForeignKeyRequest(Some(ForeignKeyRequestItem("task-1", "account_id,year")), List(ForeignKeyRequestItem("task-2", "account_number,year")))) + val connections = List( + FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")), + FileBuilder().name("task-2").schema(FieldBuilder().name("account_number"), FieldBuilder().name("year")) + ) + val planBuilder = PlanBuilder() + val res = ForeignKeyMapper.foreignKeyMapping(foreignKeyRequests, connections, planBuilder) + assertResult(1)(res.plan.sinkOptions.get.foreignKeys.size) + assert(res.plan.sinkOptions.get.foreignKeys.head._1.startsWith("json")) + assert(res.plan.sinkOptions.get.foreignKeys.head._1.endsWith("account_id,year")) + assert(res.plan.sinkOptions.get.foreignKeys.head._2.size == 1) + assert(res.plan.sinkOptions.get.foreignKeys.head._2.head.startsWith("json")) + assert(res.plan.sinkOptions.get.foreignKeys.head._2.head.endsWith("account_number,year")) + } + +} diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapperTest.scala index 2637782..3c124e6 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapperTest.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/UiMapperTest.scala @@ -1,10 +1,7 @@ package io.github.datacatering.datacaterer.core.ui.mapper -import io.github.datacatering.datacaterer.api.connection.FileBuilder import io.github.datacatering.datacaterer.api.model.Constants._ -import io.github.datacatering.datacaterer.api.model.{ColumnNamesValidation, ExpressionValidation, GroupByValidation, UpstreamDataSourceValidation} -import io.github.datacatering.datacaterer.api.{DataCatererConfigurationBuilder, FieldBuilder, PlanBuilder} -import io.github.datacatering.datacaterer.core.ui.model.{ConfigurationRequest, DataSourceRequest, FieldRequest, FieldRequests, ForeignKeyRequest, ForeignKeyRequestItem, PlanRunRequest, RecordCountRequest, ValidationItemRequest, ValidationItemRequests, 
ValidationRequest} +import io.github.datacatering.datacaterer.core.ui.model.{ConfigurationRequest, DataSourceRequest, PlanRunRequest} import org.junit.runner.RunWith import org.scalatest.funsuite.AnyFunSuite import org.scalatestplus.junit.JUnitRunner @@ -13,7 +10,13 @@ import org.scalatestplus.junit.JUnitRunner class UiMapperTest extends AnyFunSuite { test("Can convert UI plan run request to plan run") { - val planRunRequest = PlanRunRequest("plan-name", "some-id", List(DataSourceRequest("csv", "my-csv", Some(CSV), Some(Map(PATH -> "/tmp/csv")))), List(), Some(ConfigurationRequest())) + val planRunRequest = PlanRunRequest( + "plan-name", + "some-id", + List(DataSourceRequest("csv", "my-csv", Some(CSV), Some(Map(PATH -> "/tmp/csv")))), + List(), + Some(ConfigurationRequest()) + ) val res = UiMapper.mapToPlanRun(planRunRequest, "/tmp/my-install")._plan assertResult("plan-name")(res.name) assertResult(Some("some-id"))(res.runId) @@ -21,721 +24,4 @@ class UiMapperTest extends AnyFunSuite { assertResult(true)(res.sinkOptions.isEmpty) assertResult(0)(res.validations.size) } - - test("Can convert UI foreign key config") { - val foreignKeyRequests = List(ForeignKeyRequest(Some(ForeignKeyRequestItem("task-1", "account_id,year")), List(ForeignKeyRequestItem("task-2", "account_number,year")))) - val connections = List( - FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")), - FileBuilder().name("task-2").schema(FieldBuilder().name("account_number"), FieldBuilder().name("year")) - ) - val planBuilder = PlanBuilder() - val res = UiMapper.foreignKeyMapping(foreignKeyRequests, connections, planBuilder) - assertResult(1)(res.plan.sinkOptions.get.foreignKeys.size) - assert(res.plan.sinkOptions.get.foreignKeys.head._1.startsWith("json")) - assert(res.plan.sinkOptions.get.foreignKeys.head._1.endsWith("account_id,year")) - assert(res.plan.sinkOptions.get.foreignKeys.head._2.size == 1) - assert(res.plan.sinkOptions.get.foreignKeys.head._2.head.startsWith("json")) - assert(res.plan.sinkOptions.get.foreignKeys.head._2.head.endsWith("account_number,year")) - } - - test("Can convert UI flag config") { - val configRequest = ConfigurationRequest(flag = Map( - CONFIG_FLAGS_COUNT -> "false", - CONFIG_FLAGS_GENERATE_DATA -> "false", - CONFIG_FLAGS_RECORD_TRACKING -> "false", - CONFIG_FLAGS_DELETE_GENERATED_RECORDS -> "false", - CONFIG_FLAGS_GENERATE_PLAN_AND_TASKS -> "false", - CONFIG_FLAGS_FAIL_ON_ERROR -> "false", - CONFIG_FLAGS_UNIQUE_CHECK -> "false", - CONFIG_FLAGS_SINK_METADATA -> "false", - CONFIG_FLAGS_SAVE_REPORTS -> "false", - CONFIG_FLAGS_VALIDATION -> "false", - CONFIG_FLAGS_GENERATE_VALIDATIONS -> "false", - CONFIG_FLAGS_ALERTS -> "false", - "blah" -> "false" - )) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapFlagsConfiguration(configRequest, baseConf).build - - assert(!res.flagsConfig.enableCount) - assert(!res.flagsConfig.enableGenerateData) - assert(!res.flagsConfig.enableRecordTracking) - assert(!res.flagsConfig.enableDeleteGeneratedRecords) - assert(!res.flagsConfig.enableGeneratePlanAndTasks) - assert(!res.flagsConfig.enableFailOnError) - assert(!res.flagsConfig.enableUniqueCheck) - assert(!res.flagsConfig.enableSinkMetadata) - assert(!res.flagsConfig.enableSaveReports) - assert(!res.flagsConfig.enableValidation) - assert(!res.flagsConfig.enableGenerateValidations) - assert(!res.flagsConfig.enableAlerts) - } - - test("Can convert UI alert config") { - val configRequest = ConfigurationRequest(alert = 
Map(CONFIG_ALERT_TRIGGER_ON -> "failure", CONFIG_ALERT_SLACK_TOKEN -> "abc123", - CONFIG_ALERT_SLACK_CHANNELS -> "job-fail", "blah" -> "hello")) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapAlertConfiguration(configRequest, baseConf).build - - assert(res.alertConfig.triggerOn == "failure") - assert(res.alertConfig.slackAlertConfig.token == "abc123") - assert(res.alertConfig.slackAlertConfig.channels == List("job-fail")) - } - - test("Can convert UI validation config") { - val configRequest = ConfigurationRequest(validation = Map(CONFIG_VALIDATION_NUM_SAMPLE_ERROR_RECORDS -> "2", - CONFIG_VALIDATION_ENABLE_DELETE_RECORD_TRACKING_FILES -> "false", "blah" -> "hello")) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapValidationConfiguration(configRequest, baseConf).build - - assert(res.validationConfig.numSampleErrorRecords == 2) - assert(!res.validationConfig.enableDeleteRecordTrackingFiles) - } - - test("Can convert UI generation config") { - val configRequest = ConfigurationRequest(generation = Map(CONFIG_GENERATION_NUM_RECORDS_PER_BATCH -> "100", - CONFIG_GENERATION_NUM_RECORDS_PER_STEP -> "10", "blah" -> "hello")) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapGenerationConfiguration(configRequest, baseConf).build - - assert(res.generationConfig.numRecordsPerBatch == 100) - assert(res.generationConfig.numRecordsPerStep.contains(10)) - } - - test("Can convert UI metadata config") { - val configRequest = ConfigurationRequest(metadata = Map( - CONFIG_METADATA_NUM_RECORDS_FROM_DATA_SOURCE -> "100", - CONFIG_METADATA_NUM_RECORDS_FOR_ANALYSIS -> "10", - CONFIG_METADATA_ONE_OF_DISTINCT_COUNT_VS_COUNT_THRESHOLD -> "1", - CONFIG_METADATA_ONE_OF_MIN_COUNT -> "5", - CONFIG_METADATA_NUM_GENERATED_SAMPLES -> "7", - "blah" -> "hello" - )) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapMetadataConfiguration(configRequest, baseConf).build - - assert(res.metadataConfig.numRecordsFromDataSource == 100) - assert(res.metadataConfig.numRecordsForAnalysis == 10) - assert(res.metadataConfig.oneOfDistinctCountVsCountThreshold == 1) - assert(res.metadataConfig.oneOfMinCount == 5) - assert(res.metadataConfig.numGeneratedSamples == 7) - } - - test("Can convert UI folder config") { - val configRequest = ConfigurationRequest(folder = Map( - CONFIG_FOLDER_PLAN_FILE_PATH -> "/tmp/plan-file", - CONFIG_FOLDER_TASK_FOLDER_PATH -> "/tmp/task-folder", - CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH -> "/tmp/gen", - CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH -> "/tmp/report", - CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH -> "/tmp/record", - CONFIG_FOLDER_VALIDATION_FOLDER_PATH -> "/tmp/valid", - CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH -> "/tmp/record-valid", - "blah" -> "hello" - )) - val baseConf = DataCatererConfigurationBuilder() - val res = UiMapper.mapFolderConfiguration(configRequest, "/my-install", baseConf).build - - assert(res.foldersConfig.planFilePath == "/tmp/plan-file") - assert(res.foldersConfig.taskFolderPath == "/tmp/task-folder") - assert(res.foldersConfig.generatedPlanAndTaskFolderPath == "/tmp/gen") - assert(res.foldersConfig.generatedReportsFolderPath == "/tmp/report") - assert(res.foldersConfig.recordTrackingFolderPath == "/tmp/record") - assert(res.foldersConfig.validationFolderPath == "/tmp/valid") - assert(res.foldersConfig.recordTrackingForValidationFolderPath == "/tmp/record-valid") - } - - test("Can convert UI folder config with install directory") { - val 
configRequest = ConfigurationRequest(folder = Map(
-      CONFIG_FOLDER_PLAN_FILE_PATH -> "",
-      CONFIG_FOLDER_TASK_FOLDER_PATH -> "",
-      CONFIG_FOLDER_GENERATED_PLAN_AND_TASK_FOLDER_PATH -> "",
-      CONFIG_FOLDER_GENERATED_REPORTS_FOLDER_PATH -> "",
-      CONFIG_FOLDER_RECORD_TRACKING_FOLDER_PATH -> "",
-      CONFIG_FOLDER_VALIDATION_FOLDER_PATH -> "",
-      CONFIG_FOLDER_RECORD_TRACKING_FOR_VALIDATION_FOLDER_PATH -> "",
-      "blah" -> "hello"
-    ))
-    val baseConf = DataCatererConfigurationBuilder()
-    val res = UiMapper.mapFolderConfiguration(configRequest, "/my-install", baseConf).build
-
-    assert(res.foldersConfig.planFilePath == DEFAULT_PLAN_FILE_PATH)
-    assert(res.foldersConfig.taskFolderPath == "/my-install/task")
-    assert(res.foldersConfig.generatedPlanAndTaskFolderPath == "/my-install/generated-plan-task")
-    assert(res.foldersConfig.generatedReportsFolderPath == "/my-install/report")
-    assert(res.foldersConfig.recordTrackingFolderPath == "/my-install/record-tracking")
-    assert(res.foldersConfig.validationFolderPath == "/my-install/validation")
-    assert(res.foldersConfig.recordTrackingForValidationFolderPath == "/my-install/record-tracking-validation")
-  }
-
-  test("Can convert UI count configuration with base record count") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(Some(10))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.records.contains(10))
-  }
-
-  test("Can convert UI count configuration with base record min and max") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(None, Some(1), Some(10))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.generator.isDefined)
-    assert(res.generator.get.options.get(MINIMUM).contains("1"))
-    assert(res.generator.get.options.get(MAXIMUM).contains("10"))
-  }
-
-  test("Can convert UI count configuration with per column count") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecords = Some(10))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.perColumn.isDefined)
-    assert(res.perColumn.get.columnNames.size == 1)
-    assert(res.perColumn.get.columnNames.contains("account_id"))
-    assert(res.perColumn.get.count.contains(10))
-  }
-
-  test("Can convert UI count configuration with per column min and max") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsMin = Some(10), perColumnRecordsMax = Some(20))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.perColumn.isDefined)
-    assert(res.perColumn.get.columnNames.size == 1)
-    assert(res.perColumn.get.columnNames.contains("account_id"))
-    assert(res.perColumn.get.generator.isDefined)
-    assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("10"))
-    assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("20"))
-  }
-
-  test("Can convert UI count configuration with per column distribution") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_EXPONENTIAL), perColumnRecordsDistributionRateParam = Some("0.5"))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.perColumn.isDefined)
-    assert(res.perColumn.get.columnNames.size == 1)
-    assert(res.perColumn.get.columnNames.contains("account_id"))
-    assert(res.perColumn.get.generator.isDefined)
-    assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_EXPONENTIAL))
-    assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION_RATE_PARAMETER).contains("0.5"))
-  }
-
-  test("Can convert UI count configuration with per column distribution with min and max") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_EXPONENTIAL), perColumnRecordsDistributionRateParam = Some("0.5"), perColumnRecordsMin = Some(1), perColumnRecordsMax = Some(3))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.perColumn.isDefined)
-    assert(res.perColumn.get.columnNames.size == 1)
-    assert(res.perColumn.get.columnNames.contains("account_id"))
-    assert(res.perColumn.get.generator.isDefined)
-    assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_EXPONENTIAL))
-    assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION_RATE_PARAMETER).contains("0.5"))
-    assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("1"))
-    assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("3"))
-  }
-
-  test("Can convert UI count configuration with per column normal distribution with min and max") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", count = Some(RecordCountRequest(perColumnNames = Some(List("account_id")), perColumnRecordsDistribution = Some(DISTRIBUTION_NORMAL), perColumnRecordsMin = Some(1), perColumnRecordsMax = Some(3))))
-    val res = UiMapper.countMapping(dataSourceRequest).count
-    assert(res.perColumn.isDefined)
-    assert(res.perColumn.get.columnNames.size == 1)
-    assert(res.perColumn.get.columnNames.contains("account_id"))
-    assert(res.perColumn.get.generator.isDefined)
-    assert(res.perColumn.get.generator.get.options.get(DISTRIBUTION).contains(DISTRIBUTION_NORMAL))
-    assert(res.perColumn.get.generator.get.options.get(MINIMUM).contains("1"))
-    assert(res.perColumn.get.generator.get.options.get(MAXIMUM).contains("3"))
-  }
-
-  test("Can convert UI field mapping") {
-    val dataSourceRequest = DataSourceRequest("plan-name", "task-1", fields = Some(FieldRequests(Some(List(
-      FieldRequest("name", "string"),
-      FieldRequest("account_id", "string", Some(Map(REGEX_GENERATOR -> "acc[0-9]{1}"))),
-      FieldRequest("status", "string", Some(Map(ONE_OF_GENERATOR -> "open,closed"))),
-      FieldRequest("details", "struct", nested = Some(FieldRequests(Some(List(FieldRequest("age", "integer")))))),
-    )))))
-    val optFieldMapping = UiMapper.fieldMapping(dataSourceRequest)
-    assert(optFieldMapping._2.isDefined)
-    val res = optFieldMapping._2.get
-    assert(res.size == 4)
-    val nameField = res.find(_.field.name == "name")
-    assert(nameField.exists(_.field.`type`.contains("string")))
-    val accountField = res.find(_.field.name == "account_id")
-    assert(accountField.exists(_.field.`type`.contains("string")))
-    assert(accountField.exists(_.field.generator.exists(_.`type` == REGEX_GENERATOR)))
-    assert(accountField.exists(_.field.generator.exists(_.options.get(REGEX_GENERATOR).contains("acc[0-9]{1}"))))
-    val statusField = res.find(_.field.name == "status")
-    assert(statusField.exists(_.field.generator.exists(_.`type` == ONE_OF_GENERATOR)))
-    assert(statusField.exists(_.field.generator.exists(_.options.get(ONE_OF_GENERATOR).contains(List("open", "closed")))))
-    val detailsField = res.find(_.field.name == "details")
-    assertResult(Some("struct<>"))(detailsField.get.field.`type`)
-    assert(detailsField.exists(_.field.schema.isDefined))
-    assert(detailsField.exists(_.field.schema.get.fields.isDefined))
-    assert(detailsField.exists(_.field.schema.get.fields.get.size == 1))
-    assert(detailsField.exists(_.field.schema.get.fields.get.head.name == "age"))
-    assert(detailsField.exists(_.field.schema.get.fields.get.head.`type`.contains("integer")))
-  }
-
-  test("Can convert UI connection mapping for Cassandra") {
-    val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult("cassandra-name")(res.dataSourceName)
-    assertResult(8)(res.options.size)
-    assertResult(Some(CASSANDRA))(res.options.get(FORMAT))
-    assertResult(Some("localhost:9092"))(res.options.get(URL))
-    assertResult(Some("localhost"))(res.options.get("spark.cassandra.connection.host"))
-    assertResult(Some("9092"))(res.options.get("spark.cassandra.connection.port"))
-    assertResult(Some("cassandra"))(res.options.get(USERNAME))
-    assertResult(Some("cassandra"))(res.options.get(PASSWORD))
-    assertResult(Some("cassandra"))(res.options.get("spark.cassandra.auth.username"))
-    assertResult(Some("cassandra"))(res.options.get("spark.cassandra.auth.password"))
-  }
-
-  test("Can convert UI connection mapping for Cassandra with keyspace and table") {
-    val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_KEYSPACE -> "account", CASSANDRA_TABLE -> "accounts")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult("cassandra-name")(res.dataSourceName)
-    assertResult(10)(res.options.size)
-    assertResult(Some(CASSANDRA))(res.options.get(FORMAT))
-    assertResult(Some("localhost:9092"))(res.options.get(URL))
-    assertResult(Some("account"))(res.options.get(CASSANDRA_KEYSPACE))
-    assertResult(Some("accounts"))(res.options.get(CASSANDRA_TABLE))
-  }
-
-  test("Throw error if only keyspace or table is defined for Cassandra") {
-    val dataSourceRequest = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_KEYSPACE -> "account")))
-    val dataSourceRequest1 = DataSourceRequest("cassandra-name", "task-1", Some(CASSANDRA_NAME), Some(Map(URL -> "localhost:9092", USERNAME -> "cassandra", PASSWORD -> "cassandra", CASSANDRA_TABLE -> "accounts")))
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest))
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest1))
-  }
-
-  test("Can convert UI connection mapping for Postgres") {
-    val dataSourceRequest = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER)))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult("postgres-name")(res.dataSourceName)
-    assertResult(5)(res.options.size)
-    assertResult(Some(JDBC))(res.options.get(FORMAT))
-    assertResult(Some(POSTGRES_DRIVER))(res.options.get(DRIVER))
-    assertResult(Some("localhost:5432"))(res.options.get(URL))
-    assertResult(Some("postgres"))(res.options.get(USERNAME))
-    assertResult(Some("postgres"))(res.options.get(PASSWORD))
-  }
-
-  test("Can convert UI connection mapping for MySQL") {
-    val dataSourceRequest = DataSourceRequest("mysql-name", "task-1", Some(MYSQL), Some(Map(URL -> "localhost:5432", USERNAME -> "root", PASSWORD -> "root", DRIVER -> MYSQL_DRIVER)))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult("mysql-name")(res.dataSourceName)
-    assertResult(5)(res.options.size)
-    assertResult(Some(JDBC))(res.options.get(FORMAT))
-    assertResult(Some(MYSQL_DRIVER))(res.options.get(DRIVER))
-    assertResult(Some("localhost:5432"))(res.options.get(URL))
-    assertResult(Some("root"))(res.options.get(USERNAME))
-    assertResult(Some("root"))(res.options.get(PASSWORD))
-  }
-
-  test("Can convert UI connection mapping for Postgres with schema and table") {
-    val dataSourceRequest = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, SCHEMA -> "account", TABLE -> "accounts")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult("postgres-name")(res.dataSourceName)
-    assertResult(7)(res.options.size)
-    assertResult(Some(JDBC))(res.options.get(FORMAT))
-    assertResult(Some(POSTGRES_DRIVER))(res.options.get(DRIVER))
-    assertResult(Some("localhost:5432"))(res.options.get(URL))
-    assertResult(Some("postgres"))(res.options.get(USERNAME))
-    assertResult(Some("postgres"))(res.options.get(PASSWORD))
-    assertResult(Some("account"))(res.options.get(SCHEMA))
-    assertResult(Some("accounts"))(res.options.get(TABLE))
-  }
-
-  test("Throw error if only schema or table is defined for Postgres") {
-    val dataSourceRequest = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, SCHEMA -> "account")))
-    val dataSourceRequest1 = DataSourceRequest("postgres-name", "task-1", Some(POSTGRES), Some(Map(URL -> "localhost:5432", USERNAME -> "postgres", PASSWORD -> "postgres", DRIVER -> POSTGRES_DRIVER, TABLE -> "accounts")))
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest))
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest1))
-  }
-
-  test("Can convert UI connection mapping for CSV") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", Some(CSV), Some(Map(PATH -> "/tmp/my-csv")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(2)(res.options.size)
-    assertResult(Some(CSV))(res.options.get(FORMAT))
-    assertResult(Some("/tmp/my-csv"))(res.options.get(PATH))
-  }
-
-  test("Can convert UI connection mapping for CSV with partitions and partitionBy") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", Some(CSV), Some(Map(PATH -> "/tmp/my-csv", PARTITIONS -> "2", PARTITION_BY -> "account_id,year")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(4)(res.options.size)
-    assertResult(Some(CSV))(res.options.get(FORMAT))
-    assertResult(Some("/tmp/my-csv"))(res.options.get(PATH))
-    assertResult(Some("2"))(res.options.get(PARTITIONS))
-    assertResult(Some("account_id,year"))(res.options.get(PARTITION_BY))
-  }
-
-  test("Can convert UI connection mapping for JSON") {
-    val dataSourceRequest = DataSourceRequest("json-name", "task-1", Some(JSON), Some(Map(PATH -> "/tmp/my-json")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(2)(res.options.size)
-    assertResult(Some(JSON))(res.options.get(FORMAT))
-    assertResult(Some("/tmp/my-json"))(res.options.get(PATH))
-  }
-
-  test("Can convert UI connection mapping for Parquet") {
-    val dataSourceRequest = DataSourceRequest("parquet-name", "task-1", Some(PARQUET), Some(Map(PATH -> "/tmp/my-parquet")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(2)(res.options.size)
-    assertResult(Some(PARQUET))(res.options.get(FORMAT))
-    assertResult(Some("/tmp/my-parquet"))(res.options.get(PATH))
-  }
-
-  test("Can convert UI connection mapping for ORC") {
-    val dataSourceRequest = DataSourceRequest("orc-name", "task-1", Some(ORC), Some(Map(PATH -> "/tmp/my-orc")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(2)(res.options.size)
-    assertResult(Some(ORC))(res.options.get(FORMAT))
-    assertResult(Some("/tmp/my-orc"))(res.options.get(PATH))
-  }
-
-  test("Can convert UI connection mapping for Solace") {
-    val dataSourceRequest = DataSourceRequest("solace-name", "task-1", Some(SOLACE), Some(Map(URL -> "localhost:55554", USERNAME -> "solace", PASSWORD -> "solace", JMS_DESTINATION_NAME -> "/JNDI/my-queue", JMS_VPN_NAME -> "default", JMS_CONNECTION_FACTORY -> "jms-connection", JMS_INITIAL_CONTEXT_FACTORY -> "jms-init")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(8)(res.options.size)
-    assertResult(Some(JMS))(res.options.get(FORMAT))
-    assertResult(Some("localhost:55554"))(res.options.get(URL))
-    assertResult(Some("solace"))(res.options.get(USERNAME))
-    assertResult(Some("solace"))(res.options.get(PASSWORD))
-    assertResult(Some("/JNDI/my-queue"))(res.options.get(JMS_DESTINATION_NAME))
-    assertResult(Some("default"))(res.options.get(JMS_VPN_NAME))
-    assertResult(Some("jms-connection"))(res.options.get(JMS_CONNECTION_FACTORY))
-    assertResult(Some("jms-init"))(res.options.get(JMS_INITIAL_CONTEXT_FACTORY))
-  }
-
-  test("Can convert UI connection mapping for Kafka") {
-    val dataSourceRequest = DataSourceRequest("kafka-name", "task-1", Some(KAFKA), Some(Map(URL -> "localhost:1234", KAFKA_TOPIC -> "my-topic")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(4)(res.options.size)
-    assertResult(Some(KAFKA))(res.options.get(FORMAT))
-    assertResult(Some("localhost:1234"))(res.options.get(URL))
-    assertResult(Some("localhost:1234"))(res.options.get("kafka.bootstrap.servers"))
-    assertResult(Some("my-topic"))(res.options.get(KAFKA_TOPIC))
-  }
-
-  test("Can convert UI connection mapping for HTTP") {
-    val dataSourceRequest = DataSourceRequest("http-name", "task-1", Some(HTTP), Some(Map(USERNAME -> "root", PASSWORD -> "root")))
-    val res = UiMapper.connectionMapping(dataSourceRequest).connectionConfigWithTaskBuilder
-    assertResult(3)(res.options.size)
-    assertResult(Some(HTTP))(res.options.get(FORMAT))
-    assertResult(Some("root"))(res.options.get(USERNAME))
-    assertResult(Some("root"))(res.options.get(PASSWORD))
-  }
-
-  test("Throw exception if provided unknown data source") {
-    val dataSourceRequest = DataSourceRequest("unknown-name", "task-1", Some("unknown"))
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest))
-  }
-
-  test("Throw exception if no data source type provided") {
-    val dataSourceRequest = DataSourceRequest("unknown-name", "task-1", None)
-    assertThrows[IllegalArgumentException](UiMapper.connectionMapping(dataSourceRequest))
-  }
-
-  test("Can convert UI validation mapping for basic column validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "account_id", VALIDATION_EQUAL -> "abc123", "description" -> "valid desc", "errorThreshold" -> "2"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val exprValid = res.head.validation.asInstanceOf[ExpressionValidation]
-    assertResult("account_id == abc123")(exprValid.expr)
-    assertResult(Some("valid desc"))(exprValid.description)
-    assertResult(Some(2.0))(exprValid.errorThreshold)
-    assertResult(1)(exprValid.selectExpr.size)
-    assertResult("*")(exprValid.selectExpr.head)
-  }
-
-  test("Can convert UI validation mapping for column name validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_COUNT_EQUAL -> "5"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
-    assertResult(VALIDATION_COLUMN_NAME_COUNT_EQUAL)(valid.columnNameType)
-    assertResult(5)(valid.count)
-  }
-
-  test("Can convert UI validation mapping for column name validation count between") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_COUNT_BETWEEN -> "blah", VALIDATION_MIN -> "1", VALIDATION_MAX -> "2"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
-    assertResult(VALIDATION_COLUMN_NAME_COUNT_BETWEEN)(valid.columnNameType)
-    assertResult(1)(valid.minCount)
-    assertResult(2)(valid.maxCount)
-  }
-
-  test("Can convert UI validation mapping for column name validation match order") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_MATCH_ORDER -> "account_id,year"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
-    assertResult(VALIDATION_COLUMN_NAME_MATCH_ORDER)(valid.columnNameType)
-    assertResult(Array("account_id", "year"))(valid.names)
-  }
-
-  test("Can convert UI validation mapping for column name validation match set") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_MATCH_SET -> "account_id,year"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
-    assertResult(VALIDATION_COLUMN_NAME_MATCH_SET)(valid.columnNameType)
-    assertResult(Array("account_id", "year"))(valid.names)
-  }
-
-  test("Can convert UI validation mapping, when unknown option, default to column name count equals 1") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map("unknown" -> "hello"))))
-    ))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
-    assertResult(VALIDATION_COLUMN_NAME_COUNT_EQUAL)(valid.columnNameType)
-    assertResult(1)(valid.count)
-  }
-
-  test("Can convert UI validation mapping with min group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_MIN, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult(VALIDATION_MIN)(valid.aggType)
-    assertResult("min(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Can convert UI validation mapping with max group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_MAX, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult(VALIDATION_MAX)(valid.aggType)
-    assertResult("max(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Can convert UI validation mapping with count group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
-      List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_COUNT, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult(VALIDATION_COUNT)(valid.aggType)
-    assertResult("count(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Can convert UI validation mapping with sum group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_SUM, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult(VALIDATION_SUM)(valid.aggType)
-    assertResult("sum(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Can convert UI validation mapping with average group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_AVERAGE, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult("avg")(valid.aggType)
-    assertResult("avg(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Can convert UI validation mapping with standard deviation group by validation") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_STANDARD_DEVIATION, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val res = UiMapper.validationMapping(dataSourceRequest)
-    assertResult(1)(res.size)
-    val valid = res.head.validation.asInstanceOf[GroupByValidation]
-    assertResult(Seq("account_id"))(valid.groupByCols)
-    assertResult("amount")(valid.aggCol)
-    assertResult("stddev")(valid.aggType)
-    assertResult("stddev(amount) == 10")(valid.aggExpr)
-  }
-
-  test("Throw error when given unknown aggregation type") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> "unknown", "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    assertThrows[IllegalArgumentException](UiMapper.validationMapping(dataSourceRequest))
-  }
-
-  test("Throw error when no aggType or aggCol is given") {
-    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val dataSourceRequest1 = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map("aggType" -> "max", VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    val dataSourceRequest2 = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
-      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
-        Some(ValidationItemRequests(List(ValidationItemRequest(
-          VALIDATION_COLUMN, Some(Map(VALIDATION_EQUAL -> "10"))
-        ))))
-      ))))))
-    assertThrows[RuntimeException](UiMapper.validationMapping(dataSourceRequest))
-    assertThrows[RuntimeException](UiMapper.validationMapping(dataSourceRequest1))
-    assertThrows[RuntimeException](UiMapper.validationMapping(dataSourceRequest2))
-  }
-
-  test("Can convert UI upstream validation mapping") {
-    val connections = List(
-      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
-      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
-    )
-    val dataSourceRequest = List(
-      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
-        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
-        VALIDATION_UPSTREAM_JOIN_TYPE -> "outer",
-        VALIDATION_UPSTREAM_JOIN_COLUMNS -> "account_id",
-      )),
-        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
-      ))))))
-    )
-    val res = UiMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
-    assertResult(2)(res.size)
-    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
-    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
-    assertResult(1)(taskValidations.size)
-    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
-    assertResult(List("account_id"))(upstreamValidation.joinColumns)
-    assertResult("outer")(upstreamValidation.joinType)
-    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
-    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
-    assertResult(List("*"))(exprValid.selectExpr)
-    assertResult("year == 2020")(exprValid.expr)
-  }
-
-  test("Can convert UI upstream validation mapping with join expression") {
-    val connections = List(
-      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
-      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
-    )
-    val dataSourceRequest = List(
-      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
-        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
-        VALIDATION_UPSTREAM_JOIN_TYPE -> "outer",
-        VALIDATION_UPSTREAM_JOIN_EXPR -> "account_id == task-2_account_id",
-      )),
-        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
-      ))))))
-    )
-    val res = UiMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
-    assertResult(2)(res.size)
-    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
-    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
-    assertResult(1)(taskValidations.size)
-    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
-    assertResult(List("expr:account_id == task-2_account_id"))(upstreamValidation.joinColumns)
-    assertResult("outer")(upstreamValidation.joinType)
-    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
-    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
-    assertResult(List("*"))(exprValid.selectExpr)
-    assertResult("year == 2020")(exprValid.expr)
-  }
-
-  test("Can convert UI upstream validation mapping with join columns only") {
-    val connections = List(
-      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
-      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
-    )
-    val dataSourceRequest = List(
-      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
-        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
-        VALIDATION_UPSTREAM_JOIN_COLUMNS -> "account_id",
-      )),
-        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
-      ))))))
-    )
-    val res = UiMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
-    assertResult(2)(res.size)
-    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
-    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
-    assertResult(1)(taskValidations.size)
-    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
-    assertResult(List("account_id"))(upstreamValidation.joinColumns)
-    assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(upstreamValidation.joinType)
-    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
-    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
-    assertResult(List("*"))(exprValid.selectExpr)
-    assertResult("year == 2020")(exprValid.expr)
-  }
-
-  test("Can convert UI upstream validation mapping with join expression only") {
-    val connections = List(
-      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
-      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
-    )
-    val dataSourceRequest = List(
-      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
-        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
-        VALIDATION_UPSTREAM_JOIN_EXPR -> "account_id == task-2_account_id",
-      )),
-        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
-      ))))))
-    )
-    val res = UiMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
-    assertResult(2)(res.size)
-    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
-    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
-    assertResult(1)(taskValidations.size)
-    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
-    assertResult(List("expr:account_id == task-2_account_id"))(upstreamValidation.joinColumns)
-    assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(upstreamValidation.joinType)
-    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
-    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
-    assertResult(List("*"))(exprValid.selectExpr)
-    assertResult("year == 2020")(exprValid.expr)
-  }
 }
diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapperTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapperTest.scala
new file mode 100644
index 0000000..2283fe9
--- /dev/null
+++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/ui/mapper/ValidationMapperTest.scala
@@ -0,0 +1,324 @@
+package io.github.datacatering.datacaterer.core.ui.mapper
+
+import io.github.datacatering.datacaterer.api.FieldBuilder
+import io.github.datacatering.datacaterer.api.connection.FileBuilder
+import io.github.datacatering.datacaterer.api.model.Constants.{DEFAULT_VALIDATION_JOIN_TYPE, VALIDATION_AVERAGE, VALIDATION_COLUMN, VALIDATION_COLUMN_NAMES, VALIDATION_COLUMN_NAMES_COUNT_BETWEEN, VALIDATION_COLUMN_NAMES_COUNT_EQUAL, VALIDATION_COLUMN_NAMES_MATCH_ORDER, VALIDATION_COLUMN_NAMES_MATCH_SET, VALIDATION_COLUMN_NAME_COUNT_BETWEEN, VALIDATION_COLUMN_NAME_COUNT_EQUAL, VALIDATION_COLUMN_NAME_MATCH_ORDER, VALIDATION_COLUMN_NAME_MATCH_SET, VALIDATION_COUNT, VALIDATION_EQUAL, VALIDATION_FIELD, VALIDATION_GROUP_BY, VALIDATION_GROUP_BY_COLUMNS, VALIDATION_MAX, VALIDATION_MIN, VALIDATION_STANDARD_DEVIATION, VALIDATION_SUM, VALIDATION_UPSTREAM, VALIDATION_UPSTREAM_JOIN_COLUMNS, VALIDATION_UPSTREAM_JOIN_EXPR, VALIDATION_UPSTREAM_JOIN_TYPE, VALIDATION_UPSTREAM_TASK_NAME}
+import io.github.datacatering.datacaterer.api.model.{ColumnNamesValidation, ExpressionValidation, GroupByValidation, UpstreamDataSourceValidation}
+import io.github.datacatering.datacaterer.core.ui.model.{DataSourceRequest, ValidationItemRequest, ValidationItemRequests, ValidationRequest}
+import org.junit.runner.RunWith
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatestplus.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+class ValidationMapperTest extends AnyFunSuite {
+
+  test("Can convert UI validation mapping for basic column validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "account_id", VALIDATION_EQUAL -> "abc123", "description" -> "valid desc", "errorThreshold" -> "2"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val exprValid = res.head.validation.asInstanceOf[ExpressionValidation]
+    assertResult("account_id == abc123")(exprValid.expr)
+    assertResult(Some("valid desc"))(exprValid.description)
+    assertResult(Some(2.0))(exprValid.errorThreshold)
+    assertResult(1)(exprValid.selectExpr.size)
+    assertResult("*")(exprValid.selectExpr.head)
+  }
+
+  test("Can convert UI validation mapping for column name validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_COUNT_EQUAL -> "5"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
+    assertResult(VALIDATION_COLUMN_NAME_COUNT_EQUAL)(valid.columnNameType)
+    assertResult(5)(valid.count)
+  }
+
+  test("Can convert UI validation mapping for column name validation count between") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_COUNT_BETWEEN -> "blah", VALIDATION_MIN -> "1", VALIDATION_MAX -> "2"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
+    assertResult(VALIDATION_COLUMN_NAME_COUNT_BETWEEN)(valid.columnNameType)
+    assertResult(1)(valid.minCount)
+    assertResult(2)(valid.maxCount)
+  }
+
+  test("Can convert UI validation mapping for column name validation match order") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_MATCH_ORDER -> "account_id,year"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
+    assertResult(VALIDATION_COLUMN_NAME_MATCH_ORDER)(valid.columnNameType)
+    assertResult(Array("account_id", "year"))(valid.names)
+  }
+
+  test("Can convert UI validation mapping for column name validation match set") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map(VALIDATION_COLUMN_NAMES_MATCH_SET -> "account_id,year"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
+    assertResult(VALIDATION_COLUMN_NAME_MATCH_SET)(valid.columnNameType)
+    assertResult(Array("account_id", "year"))(valid.names)
+  }
+
+  test("Can convert UI validation mapping, when unknown option, default to column name count equals 1") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_COLUMN_NAMES, Some(Map("unknown" -> "hello"))))
+    ))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[ColumnNamesValidation]
+    assertResult(VALIDATION_COLUMN_NAME_COUNT_EQUAL)(valid.columnNameType)
+    assertResult(1)(valid.count)
+  }
+
+  test("Can convert UI validation mapping with min group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_MIN, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult(VALIDATION_MIN)(valid.aggType)
+    assertResult("min(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Can convert UI validation mapping with max group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_MAX, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult(VALIDATION_MAX)(valid.aggType)
+    assertResult("max(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Can convert UI validation mapping with count group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(
+      List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_COUNT, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult(VALIDATION_COUNT)(valid.aggType)
+    assertResult("count(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Can convert UI validation mapping with sum group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_SUM, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult(VALIDATION_SUM)(valid.aggType)
+    assertResult("sum(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Can convert UI validation mapping with average group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_AVERAGE, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult("avg")(valid.aggType)
+    assertResult("avg(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Can convert UI validation mapping with standard deviation group by validation") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> VALIDATION_STANDARD_DEVIATION, "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val res = ValidationMapper.validationMapping(dataSourceRequest)
+    assertResult(1)(res.size)
+    val valid = res.head.validation.asInstanceOf[GroupByValidation]
+    assertResult(Seq("account_id"))(valid.groupByCols)
+    assertResult("amount")(valid.aggCol)
+    assertResult("stddev")(valid.aggType)
+    assertResult("stddev(amount) == 10")(valid.aggExpr)
+  }
+
+  test("Throw error when given unknown aggregation type") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> "unknown", "aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    assertThrows[IllegalArgumentException](ValidationMapper.validationMapping(dataSourceRequest))
+  }
+
+  test("Throw error when no aggType or aggCol is given") {
+    val dataSourceRequest = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggCol" -> "amount", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val dataSourceRequest1 = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map("aggType" -> "max", VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    val dataSourceRequest2 = DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(
+      Some(List(ValidationItemRequest(VALIDATION_GROUP_BY, Some(Map(VALIDATION_GROUP_BY_COLUMNS -> "account_id")),
+        Some(ValidationItemRequests(List(ValidationItemRequest(
+          VALIDATION_COLUMN, Some(Map(VALIDATION_EQUAL -> "10"))
+        ))))
+      ))))))
+    assertThrows[RuntimeException](ValidationMapper.validationMapping(dataSourceRequest))
+    assertThrows[RuntimeException](ValidationMapper.validationMapping(dataSourceRequest1))
+    assertThrows[RuntimeException](ValidationMapper.validationMapping(dataSourceRequest2))
+  }
+
+  test("Can convert UI upstream validation mapping") {
+    val connections = List(
+      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
+      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
+    )
+    val dataSourceRequest = List(
+      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
+        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
+        VALIDATION_UPSTREAM_JOIN_TYPE -> "outer",
+        VALIDATION_UPSTREAM_JOIN_COLUMNS -> "account_id",
+      )),
+        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
+      ))))))
+    )
+    val res = ValidationMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
+    assertResult(2)(res.size)
+    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
+    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
+    assertResult(1)(taskValidations.size)
+    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
+    assertResult(List("account_id"))(upstreamValidation.joinColumns)
+    assertResult("outer")(upstreamValidation.joinType)
+    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
+    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
+    assertResult(List("*"))(exprValid.selectExpr)
+    assertResult("year == 2020")(exprValid.expr)
+  }
+
+  test("Can convert UI upstream validation mapping with join expression") {
+    val connections = List(
+      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
+      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
+    )
+    val dataSourceRequest = List(
+      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
+        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
+        VALIDATION_UPSTREAM_JOIN_TYPE -> "outer",
+        VALIDATION_UPSTREAM_JOIN_EXPR -> "account_id == task-2_account_id",
+      )),
+        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
+      ))))))
+    )
+    val res = ValidationMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
+    assertResult(2)(res.size)
+    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
+    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
+    assertResult(1)(taskValidations.size)
+    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
+    assertResult(List("expr:account_id == task-2_account_id"))(upstreamValidation.joinColumns)
+    assertResult("outer")(upstreamValidation.joinType)
+    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
+    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
+    assertResult(List("*"))(exprValid.selectExpr)
+    assertResult("year == 2020")(exprValid.expr)
+  }
+
+  test("Can convert UI upstream validation mapping with join columns only") {
+    val connections = List(
+      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
+      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
+    )
+    val dataSourceRequest = List(
+      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
+        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
+        VALIDATION_UPSTREAM_JOIN_COLUMNS -> "account_id",
+      )),
+        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
+      ))))))
+    )
+    val res = ValidationMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
+    assertResult(2)(res.size)
+    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
+    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
+    assertResult(1)(taskValidations.size)
+    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
+    assertResult(List("account_id"))(upstreamValidation.joinColumns)
+    assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(upstreamValidation.joinType)
+    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
+    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
+    assertResult(List("*"))(exprValid.selectExpr)
+    assertResult("year == 2020")(exprValid.expr)
+  }
+
+  test("Can convert UI upstream validation mapping with join expression only") {
+    val connections = List(
+      FileBuilder().name("task-1").schema(FieldBuilder().name("account_id"), FieldBuilder().name("year")),
+      FileBuilder().name("task-2").schema(FieldBuilder().name("account_id"), FieldBuilder().name("date"))
+    )
+    val dataSourceRequest = List(
+      DataSourceRequest("csv-name", "task-1", validations = Some(ValidationRequest(Some(List(ValidationItemRequest(VALIDATION_UPSTREAM, Some(Map(
+        VALIDATION_UPSTREAM_TASK_NAME -> "task-2",
+        VALIDATION_UPSTREAM_JOIN_EXPR -> "account_id == task-2_account_id",
+      )),
+        Some(ValidationItemRequests(List(ValidationItemRequest(VALIDATION_COLUMN, Some(Map(VALIDATION_FIELD -> "year", VALIDATION_EQUAL -> "2020"))))))
+      ))))))
+    )
+    val res = ValidationMapper.connectionsWithUpstreamValidationMapping(connections, dataSourceRequest)
+    assertResult(2)(res.size)
+    val taskWithValidation = res.find(_.task.get.task.name == "task-1").get
+    val taskValidations = taskWithValidation.step.get.optValidation.get.dataSourceValidation.validations
+    assertResult(1)(taskValidations.size)
+    val upstreamValidation = taskValidations.head.validation.asInstanceOf[UpstreamDataSourceValidation]
+    assertResult(List("expr:account_id == task-2_account_id"))(upstreamValidation.joinColumns)
+    assertResult(DEFAULT_VALIDATION_JOIN_TYPE)(upstreamValidation.joinType)
+    assertResult("task-2")(upstreamValidation.upstreamDataSource.task.get.task.name)
+    val exprValid = upstreamValidation.validation.validation.asInstanceOf[ExpressionValidation]
+    assertResult(List("*"))(exprValid.selectExpr)
+    assertResult("year == 2020")(exprValid.expr)
+  }
+}
\ No newline at end of file
diff --git a/gradle.properties b/gradle.properties
index ad11ebb..de7e51c 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 groupId=io.github.data-catering
-version=0.12.0
+version=0.12.1
 scalaVersion=2.12
 scalaSpecificVersion=2.12.19

From 78b1d4c54f32eace285bb37c542031bda886fece Mon Sep 17 00:00:00 2001
From: Flook Peter
Date: Thu, 17 Oct 2024 18:55:04 +0800
Subject: [PATCH 2/2] Include JDBC to DataSourceRegister

---
 ....apache.spark.sql.sources.DataSourceRegister |  1 +
 .../core/plan/PlanProcessorTest.scala           | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/app/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/app/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
index 0913584..671386a 100644
--- a/app/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
+++ b/app/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -3,5 +3,6 @@ org.apache.spark.sql.execution.datasources.v2.json.JsonDataSourceV2
 org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2
 org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2
 org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2
+org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider
 org.apache.spark.sql.execution.streaming.ConsoleSinkProvider
 org.apache.spark.sql.kafka010.KafkaSourceProvider
diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
index b7d5bfe..add1998 100644
--- a/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
+++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessorTest.scala
@@ -9,6 +9,7 @@ import org.junit.runner.RunWith
 import org.scalatestplus.junit.JUnitRunner
 
 import java.sql.{Date, Timestamp}
+import java.time.LocalDate
 
 @RunWith(classOf[JUnitRunner])
 class PlanProcessorTest extends SparkSuite {
@@ -112,10 +113,22 @@ class PlanProcessorTest extends SparkSuite {
   }
 
   ignore("Can run Postgres plan run") {
-    PlanProcessor.determineAndExecutePlan(Some(new AdvancedMySqlPlanRun))
+    PlanProcessor.determineAndExecutePlan(Some(new TestPostgres))
   }
 
   class TestPostgres extends PlanRun {
+    val accountTask = postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer")
+      .table("account", "accounts")
+      .schema(
+        field.name("account_number").regex("[0-9]{10}").unique(true),
+        field.name("customer_id_int").`type`(IntegerType).min(1).max(1000),
+        field.name("created_by").expression("#{Name.name}"),
+        field.name("created_by_fixed_length").sql("CASE WHEN account_status IN ('open', 'closed') THEN 'eod' ELSE 'event' END"),
+        field.name("open_timestamp").`type`(TimestampType).min(Date.valueOf(LocalDate.now())),
+        field.name("account_status").oneOf("open", "closed", "suspended", "pending")
+      )
+      .count(count.records(100))
+
     val jsonTask = json("my_json", "/tmp/data/json", Map("saveMode" -> "overwrite"))
       .schema(
         field.name("account_id").regex("ACC[0-9]{8}"),
@@ -142,7 +155,7 @@ class PlanProcessorTest extends SparkSuite {
       .generatedReportsFolderPath("/tmp/report")
       .enableSinkMetadata(true)
 
-    execute(conf, jsonTask, csvTask)
+    execute(conf, accountTask)
   }
 
   class TestCsvPostgres extends PlanRun {