diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 84ba463ba..3f192609e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @lokm01 @benedeki @DzMakatun @Zejnilovic @dk1844 @AdrianOlosutean @zakera786 +* @lokm01 @benedeki @DzMakatun @Zejnilovic @dk1844 @lsulak @zakera786 diff --git a/README.md b/README.md index c3223eed2..bd0e74aab 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ $> bundle exec jekyll serve # => Now browse to http://localhost:4000 ``` -### Run convinience scripts +### Run convenience scripts #### Generate new docs ```ruby diff --git a/_data/configuration_3_0_0.yml b/_data/configuration_3_0_0.yml new file mode 100644 index 000000000..14d20502c --- /dev/null +++ b/_data/configuration_3_0_0.yml @@ -0,0 +1,81 @@ +--- + - name: conformance.allowOriginalColumnsMutability + options: + - name: boolean + description: "Allows to modify/drop columns from the original input (default is false)" + - name: conformance.autoclean.standardized.hdfs.folder + options: + - name: boolean + description: 'Automatically delete standardized data folder after successful run of a Conformance job *' + - name: control.info.validation + options: + - name: strict + description: Job will fail on failed _INFO file validation. + - name: warning + description: "(default) A warning message will be displayed on failed validation, + but the job will go on." + - name: none + description: No validation is done. + - name: enceladus.recordId.generation.strategy + options: + - name: uuid + description: "(default) enceladus_record_id column will be added and will contain + a UUID String for each row." + - name: stableHashId + description: "enceladus_record_id column will be added and populated with an + always-the-same Int hash (Murmur3-based, for testing)." + - name: none + description: no column will be added to the output. + - name: max.processing.partition.size + options: + - name: non-negative long integer + description: 'Maximal size (in bytes) for the processing partition, which would influence the written parquet file size + NB! Experimental - sizes might still not fulfill the requested limits' + - name: menas.rest.uri + options: + - name: string with URLs + description: 'Comma-separated list of URLs where Menas will be looked for. E.g.: + http://example.com/menas1,http://domain.com:8080/menas2' + - name: menas.rest.retryCount + options: + - name: non-negative integer + description: Each of the menas.rest.uri URLs can be tried multiple times for fault-tolerance + - name: menas.rest.availability.setup + options: + - name: roundrobin + description: "(default) Starts from a random URL from the menas.rest.uri list, if it fails the next + one is tried, if last is reached start from 0 until all are tried" + - name: fallback + description: "Always starts from the first URL, and only if it fails the second follows etc." + - name: min.processing.partition.size + options: + - name: non-negative long integer + description: 'Minimal size (in bytes) for the processing partition, which would influence the written parquet file size + NB! Experimental - sizes might still not fulfill the requested limits' + - name: standardization.defaultTimestampTimeZone.default + options: + - name: string with any valid time zone name + description: The time zone for normalization of timestamps that don't have their own time zone either in data + itself or in metadata. If left empty the system time zone will be used. 
+ - name: standardization.defaultTimestampTimeZone.[rawFormat] + options: + - name: string with any valid time zone name + description: Same as above standardization.defaultTimestampTimeZone.default, but applies only for + the specific input raw format - then it takes precedence over + standardization.defaultTimestampTimeZone.default. + - name: standardization.defaultDateTimeZone.default + options: + - name: string with any valid time zone name + description: The time zone for normalization of dates that don't have their own time zone either in data itself + or in metadata in case they need it. Most probably this should be left undefined. + - name: standardization.defaultDateTimeZone.[rawFormat] + options: + - name: string with any valid time zone name + description: Same as above standardization.defaultDateTimeZone.default, but applies only for + the specific input raw format - then it takes precedence over + standardization.defaultDateTimeZone.default. + - name: timezone + options: + - name: string with any valid time zone name + description: The time zone the Spark application will operate in. Strongly recommended + to keep it to default UTC diff --git a/_data/menas-configuration_3_0_0.yml b/_data/menas-configuration_3_0_0.yml new file mode 100644 index 000000000..d651f4b81 --- /dev/null +++ b/_data/menas-configuration_3_0_0.yml @@ -0,0 +1,46 @@ +--- + - name: javax.net.ssl.keyStore + options: + - name: string path to JKS file + description: 'KeyStore file containing records of private keys to connect to a secure schema registry. + E.g.: /path/to/keystore.jks' + - name: javax.net.ssl.keyStorePassword + options: + - name: string + description: 'Password for the file referenced in javax.net.ssl.keyStore. E.g.: + password1234' + - name: javax.net.ssl.trustStore + options: + - name: string path to JKS file + description: 'TrustStore file containing records of trusted certificates to connect to a secure schema registry. + E.g.: /path/to/truststore.jks *' + - name: javax.net.ssl.trustStorePassword + options: + - name: string + description: 'Password for the file referenced in javax.net.ssl.trustStore. E.g.: + password123' + - name: menas.auth.admin.role + options: + - name: string + description: 'Specifies the admin role to operate property definition create and update operations.' + - name: menas.auth.roles.regex + options: + - name: string - regular expression + description: 'Regular expression specifying which user roles to include in JWT. E.g.: + ^menas_. If the expression filters out the admin role (menas.auth.admin.role), account won''t be recognized as admin.' + - name: menas.auth.ad.server + options: + - name: string - space-separated AD server domains + description: 'ActiveDirectory server domain(s) - multiple values are supported as fallback options. + DN (e.g. dc=example,dc=com) should not be included as this is supplied in menas.auth.ldap.search.base. + Example: menas.auth.ad.server=ldaps://first.ldap.here ldaps://second.ldap.here ldaps://third.ldap.here (notice no quotes)' + - name: menas.schemaRegistry.baseUrl + options: + - name: string with URL + description: 'Base Url to (secure) schema registry. E.g.: + https://localhost:8081 *' + - name: menas.schemaRegistry.warnUnsecured + options: + - name: boolean + description: 'If set, in case the javax.net.ssl.* settings are missing or incorrect, the application + will issue a warning. 
Default: True'
diff --git a/_data/selected-plugins-configuration_3_0_0.yml b/_data/selected-plugins-configuration_3_0_0.yml
new file mode 100644
index 000000000..87d8ea03d
--- /dev/null
+++ b/_data/selected-plugins-configuration_3_0_0.yml
@@ -0,0 +1,9 @@
+---
+- name: atum.hdfs.info.file.permissions
+  options:
+    - name: string with FS permissions
+      description: 'Desired FS permissions for Atum _INFO file. Default: 644.'
+- name: spline.hdfs.file.permissions
+  options:
+    - name: string with FS permissions
+      description: "Desired FS permissions for Spline's _LINEAGE file. Default: 644."
diff --git a/_data/versions.yaml b/_data/versions.yaml
index eccec6c2e..5f1ebd8b5 100755
--- a/_data/versions.yaml
+++ b/_data/versions.yaml
@@ -1,2 +1,3 @@
 - '1.0.0'
 - '2.0.0'
+- '3.0.0'
diff --git a/_docs/3.0.0/build-process.md b/_docs/3.0.0/build-process.md
new file mode 100644
index 000000000..ca96b10ac
--- /dev/null
+++ b/_docs/3.0.0/build-process.md
@@ -0,0 +1,7 @@
+---
+layout: docs
+title: Build Process
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
diff --git a/_docs/3.0.0/components.md b/_docs/3.0.0/components.md
new file mode 100644
index 000000000..dde3645ef
--- /dev/null
+++ b/_docs/3.0.0/components.md
@@ -0,0 +1,33 @@
+---
+layout: docs
+title: Components
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
+
+### Menas
+
+Menas is a UI component of the Enceladus project. It is used to define datasets and schemas representing your data. Using a dataset definition you define where the data is, where it should land and which conformance rules should be applied. A schema defines how the data will look (column names, types) after standardization.
+
+[More...]({{ site.baseurl }}/docs/{{ page.version }}/components/menas)
+
+### SparkJobs
+
+Enceladus consists of two Spark jobs. One is Standardization, for the alignment of data types and formats, and the second one is Conformance, which then applies conformance rules onto the data.
+
+#### Standardization
+
+Standardization is used to transform almost any data format into a standardized, strongly typed parquet format, so the data can be used/viewed using unified tools.
+
+#### Conformance
+
+Conformance is used to apply conformance rules (mapping, negation, casting, etc.) onto the data. Conformance rules are additional transformations of the data.
+
+### Plugins
+
+[More...]({{ site.baseurl }}/docs/{{ page.version }}/plugins)
+
+### Built-in Plugins
+
+[More...]({{ site.baseurl }}/docs/{{ page.version }}/plugins-built-in)
diff --git a/_docs/3.0.0/components/menas.md b/_docs/3.0.0/components/menas.md
new file mode 100644
index 000000000..f5a5e498e
--- /dev/null
+++ b/_docs/3.0.0/components/menas.md
@@ -0,0 +1,40 @@
+---
+layout: docs
+title: Components - Menas
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - components
+---
+## API
+
+### Monitoring endpoints
+
+All `/admin` endpoints except `/admin/health` require authentication (and will require strict permissions once [Authorization]({{ site.github.issues_url }}/30) is implemented)
+* `GET /admin` - list of all monitoring endpoints
+* `GET /admin/heapdump` - downloads a heapdump of the application
+* `GET /admin/threaddump` - list of the threaddump of the application
+* `GET /admin/loggers` - list of all the application loggers and their log levels
+* `POST /admin/loggers/{logger}` - change the log level of a logger at runtime
+* `GET /admin/health` - get a detailed status report of the application's health:
+```json
+{
+  "status": "UP",
+  "details": {
+    "HDFSConnection": {
+      "status": "UP"
+    },
+    "MongoDBConnection": {
+      "status": "UP"
+    },
+    "diskSpace": {
+      "status": "UP",
+      "details": {
+        "total": 1000240963584,
+        "free": 766613557248,
+        "threshold": 10485760
+      }
+    }
+  }
+}
+```
diff --git a/_docs/3.0.0/deployment.md b/_docs/3.0.0/deployment.md
new file mode 100644
index 000000000..4454ce32c
--- /dev/null
+++ b/_docs/3.0.0/deployment.md
@@ -0,0 +1,20 @@
+---
+layout: docs
+title: Deployment
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
+
+## Menas
+
+### Prerequisites to deploying Menas are
+
+- Tomcat 8.5+ to deploy the war to
+- `HADOOP_CONF_DIR` environment variable. This variable should point to a folder containing Hadoop configuration files (`core-site.xml`, `hdfs-site.xml` and `yarn-site.xml`). These are used to query the HDFS for folder locations.
+- MongoDB 4.0+ used as storage
+- _OPTIONAL_ [Spline 0.3.X](https://absaoss.github.io/spline/0.3.html) for viewing the lineage from Menas. Even without Spline in Menas, Standardization and Conformance will log lineage to Mongo.
+
+### Deploying Menas
+
+The easiest way to deploy Menas is to copy the `menas-VERSION.war` to `$TOMCAT_HOME/webapps`. This will create the `/menas-VERSION` path on your local server.
diff --git a/_docs/3.0.0/plugins-built-in.md b/_docs/3.0.0/plugins-built-in.md
new file mode 100644
index 000000000..fe979ec2b
--- /dev/null
+++ b/_docs/3.0.0/plugins-built-in.md
@@ -0,0 +1,66 @@
+---
+layout: docs
+title: Built-in Plugins
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
+
+- [What are built-in plugins](#what-are-built-in-plugins)
+- [Existing built-in plugins](#existing-built-in-plugins)
+  - [KafkaInfoPlugin](#kafkainfoplugin)
+  - [KafkaErrorSenderPlugin](#kafkaerrorsenderplugin)
+
+
+## What are built-in plugins
+
+Built-in plugins provide some additional but relatively elementary functionality, and they also serve as an example of how plugins are written. Unlike externally created plugins they are automatically included in the `SparkJobs.jar` file and therefore don't need to be included using the `--jars` option.
+
+## Existing built-in plugins
+
+The plugin class name is specified for Standardization and Conformance separately since some plugins need to run only during execution of one of these jobs. Plugin class name keys have numeric suffixes (`.1` in this example). The numeric suffix specifies the order in which plugins are invoked. It should always start with `1` and be incremented by 1 without gaps.
+
+### KafkaInfoPlugin
+
+The purpose of this plugin is to send control measurements to a Kafka topic each time a checkpoint is reached or the job status changes. This can help to monitor production issues and react to errors as quickly as possible. Control measurements are sent in `Avro` format and the schema is automatically registered in a schema registry.
+
+This plugin is a built-in one. In order to enable it, you need to provide the following configuration settings in `application.conf`:
+
+```
+standardization.plugin.control.metrics.1=za.co.absa.enceladus.plugins.builtin.controlinfo.mq.kafka.KafkaInfoPlugin
+conformance.plugin.control.metrics.1=za.co.absa.enceladus.plugins.builtin.controlinfo.mq.kafka.KafkaInfoPlugin
+kafka.schema.registry.url="http://127.0.0.1:8081"
+kafka.bootstrap.servers="127.0.0.1:9092"
+kafka.info.metrics.client.id="controlInfo"
+kafka.info.metrics.topic.name="control.info"
+# Optional security settings
+#kafka.security.protocol="SASL_SSL"
+#kafka.sasl.mechanism="GSSAPI"
+# Optional Schema Registry Security Parameters
+#kafka.schema.registry.basic.auth.credentials.source=USER_INFO
+#kafka.schema.registry.basic.auth.user.info=user:password
+```
+
+### KafkaErrorSenderPlugin
+
+The purpose of this plugin is to send errors to a Kafka topic.
+
+This plugin is a built-in one. In order to enable it, you need to provide the following configuration settings in `application.conf`:
+
+```
+standardization.plugin.postprocessor.1=za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin
+conformance.plugin.postprocessor.1=za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin
+`kafka.schema.registry.url`=
+`kafka.bootstrap.servers`=
+`kafka.error.client.id`=
+`kafka.error.topic.name`=
+```
diff --git a/_docs/3.0.0/plugins.md b/_docs/3.0.0/plugins.md
new file mode 100644
index 000000000..67cb6d058
--- /dev/null
+++ b/_docs/3.0.0/plugins.md
@@ -0,0 +1,37 @@
+---
+layout: docs
+title: Plugins
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
+
+**Standardization** and **Conformance** support plugins that allow executing additional actions at certain times of the computation.
+
+A plugin can be externally developed. In this case, in order to use the plugin a plugin jar needs to be supplied to `spark-submit` using the `--jars` option. You can also use built-in plugins by enabling them in `application.conf` or passing configuration information directly to `spark-submit`.
+
+The way it works is like this. A plugin factory (a class that implements `PluginFactory`) overrides the apply method. Standardization and Conformance will invoke this method when a job starts and provide a configuration that includes all settings from `application.conf` plus settings passed to the JVM via `spark-submit`. The factory then instantiates a plugin and returns it to the caller. If the factory throws an exception the Spark application (Standardization or Conformance) will be stopped. If the factory returns `null` an error will be logged by the application, but it will continue to run.
+
+There is one type of plugin supported for now:
+
+## Control Metrics Plugins
+
+_Control metrics plugins_ allow execution of additional actions any time a checkpoint is created or the job status changes. In order to write such a plugin for Enceladus you need to implement the `ControlMetricsPlugin` and `ControlMetricsPluginFactory` interfaces.
+
+Control metrics plugins are invoked each time a job status changes (e.g. from `running` to `succeeded`) or when a checkpoint is reached. A `Checkpoint` is an [Atum][atum] concept to ensure accuracy and completeness of data. A checkpoint is created at the end of Standardization and Conformance, and after each conformance rule configured to create control measurements. At this point the `onCheckpoint()` callback is called with an instance of control measurements. It is up to the plugin to decide what to do at this point. All exceptions thrown from a plugin will be logged, but the Spark application will continue to run.
+
+[atum]: https://github.com/AbsaOSS/atum
diff --git a/_docs/3.0.0/usage.md b/_docs/3.0.0/usage.md
new file mode 100644
index 000000000..79320bdc7
--- /dev/null
+++ b/_docs/3.0.0/usage.md
@@ -0,0 +1,54 @@
+---
+layout: docs
+title: Usage
+version: '3.0.0'
+categories:
+  - '3.0.0'
+---
+{% capture docs_path %}{{ site.baseurl }}/docs/{{ page.version }}{% endcapture %}
+
+## Table Of Contents
+
+
+- [Table Of Contents](#table-of-contents)
+- [Intro](#intro)
+- [Quick Start](#quick-start)
+- [Spark Jobs](#spark-jobs)
+- [Menas](#menas)
+- [General](#general)
+- [Details](#details)
+
+
+## Intro
+
+This part of the documentation will show you how to use Enceladus, Menas as its UI, and how to run its spark-jobs. This part expects you to have Menas already deployed and running and `spark-jobs.jar` ready at hand. If not, please look at the previous parts on the [build process][build] and [deployment][deploy].
+
+## Quick Start
+
+- [Menas Quick Start]({{ docs_path }}/usage/menas-quick-start)
+- [Data & Data Quality Quick Start]({{ docs_path }}/usage/data-quick-start)
+- [Spark Jobs Quick Start]({{ docs_path }}/usage/spark-jobs-quick-start)
+
+## Spark Jobs
+
+- [Configuration]({{ docs_path }}/usage/config)
+- [Standardization Input Formats]({{ docs_path }}/usage/standardization-formats)
+
+## Menas
+
+- [Configuration]({{ docs_path }}/usage/menas-config)
+- [Conformance Rules]({{ docs_path }}/usage/menas-conformance-rules)
+- [API]({{ docs_path }}/usage/menas-api)
+
+## General
+
+- [How to run]({{ docs_path }}/usage/run)
+- [\_INFO file]({{ docs_path }}/usage/info-file)
+- [Schema]({{ docs_path }}/usage/schema)
+
+## Details
+
+- [Error columns]({{ docs_path }}/usage/errcol)
+
+[build]: {{ docs_path }}/build-process
+[deploy]: {{ docs_path }}/deployment
diff --git a/_docs/3.0.0/usage/config.md b/_docs/3.0.0/usage/config.md
new file mode 100644
index 000000000..83d4a6f1c
--- /dev/null
+++ b/_docs/3.0.0/usage/config.md
@@ -0,0 +1,45 @@
+---
+layout: docs
+title: Usage - Configuration
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+## Table Of Contents
+
+- [Table Of Contents](#table-of-contents)
+- [Intro](#intro)
+- [General options](#general-options)
+- [Selected plugin options](#selected-plugin-options)
+
+
+## Intro
+
+This page describes the configuration of _Standardization_ and _Conformance_. There are a number of default options documented in the [project's readme][readme]. This page describes the configuration values stored in `spark-jobs`'s `reference.conf` ([link][spark-app-conf]) or `application.conf` provided by the user. These values can be overridden using the `-D` property values as in:
+
+```shell
+spark-submit --conf "spark.driver.extraJavaOptions= -Dkey1=value1 -Dkey2=value2" ...
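+# For illustration only (hypothetical values; both keys are documented in the general options table below),
+# an override of two options might look like:
+#   spark-submit --conf "spark.driver.extraJavaOptions=-Dmenas.rest.retryCount=3 -Dtimezone=UTC" ...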
+```
+
+## General options
+
+{% include config_options.html file="configuration_3_0_0" %}
+
+
+Note that
+
+* when `conformance.autoclean.standardized.hdfs.folder` is set to true and the job is writing to S3, there could be a leftover empty file like `conformance-output_$folder$` after the autoclean. This, however, will not negatively impact the functionality of other jobs even when using the same path and is due to the EMR committer.
+
+
+## Selected plugin options
+{% include config_options.html file="selected-plugins-configuration_3_0_0" %}
+
+
+[readme]: https://github.com/AbsaOSS/enceladus/blob/master/README.md
+[spark-app-conf]: https://github.com/AbsaOSS/enceladus/blob/master/spark-jobs/src/main/resources/reference.conf
diff --git a/_docs/3.0.0/usage/data-quick-start.md b/_docs/3.0.0/usage/data-quick-start.md
new file mode 100644
index 000000000..695db4e54
--- /dev/null
+++ b/_docs/3.0.0/usage/data-quick-start.md
@@ -0,0 +1,40 @@
+---
+layout: docs
+title: Usage - Data & Data Quality Quick Start
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - 'usage'
+---
+{% capture docs_path %}{{ site.baseurl }}/docs/{{ page.version }}{% endcapture %}
+
+## Prerequisites
+
+This quick start guide presumes that you have gone through:
+
+- the [Menas Quick Start]({{ docs_path }}/usage/menas-quick-start) guide
+
+## Data Quality
+
+Data quality is all about 2 things:
+
+- Spline
+- _INFO files
+
+## Spline
+
+Spline is under heavy development, so we will postpone extensive documentation about it. Enceladus currently runs with version 0.3.X and it works fine out of the box. Spline 0.3.X just needs to be deployed next to Menas.
+
+Data for Spline are recorded even if the Spline UI is not up and running. This means they can be viewed later without the need to care about it now.
+
+More about Spline at the [Spline](https://github.com/AbsaOSS/spline) GitHub.
+
+## _INFO files
+
+The `_INFO` file is our way of tracking where the data came from and how much data is there. It checks mainly that no data were lost on the way to and through standardization and conformance. All this is made possible by [Atum](https://github.com/AbsaOSS/atum). The `_INFO` file needs to be placed within the source directory together with the raw data.
+
+More info about `_INFO` files [here](info-file).
+
+Examples of `_INFO` files [here](https://github.com/AbsaOSS/enceladus/tree/develop/examples).
+
+[**Next Spark Jobs Quick Start**]({{ site.baseurl }}/docs/{{ page.version }}/usage/spark-jobs-quick-start)
diff --git a/_docs/3.0.0/usage/errcol.md b/_docs/3.0.0/usage/errcol.md
new file mode 100644
index 000000000..564334053
--- /dev/null
+++ b/_docs/3.0.0/usage/errcol.md
@@ -0,0 +1,45 @@
+---
+layout: docs
+title: Usage - Error Column
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+## Table Of Contents
+
+- [Table Of Contents](#table-of-contents)
+- [Description](#description)
+- [Error Types](#error-types)
+- [Notes](#notes)
+
+
+## Description
+
+`errCol` is a special, automatically created, composite column that contains descriptions of all the issues encountered during **Standardization** and **Conformance** of the particular row.
+
+It's an array, where each member represents one error that happened on the particular row during its processing. The array element is structured as follows:
+
+- `errType` - string representation of the type of the error
+- `errCode` - code representation of the type of the error in the form of _E#####_, where # is a digit (e.g.
E00001)
+- `errMsg` - description of the error
+- `errCol` - name of the column, in which the error occurred[^1]
+- `rawValues` - the input values for the error
+- `mappings` - ???
+
+## Error Types
+
+| Error type | Description |
+|-----------------|-------------|
+| `stdCastError` | Column value failed to standardize to the expected type |
+| `stdNullErr` | Column value was `null` in a non-nullable column during standardization |
+| `confMappingErr`| Mapping of the value failed during conformance |
+| `confCastErr` | Casting failed during conformance |
+| `confNegErr` | Negation of a numeric type with minimum value overflowed during conformance |
+| `confLitErr` | During Conformance a special column value has changed |
+
+## Notes
+
+[^1]: When **Standardization** of a value fails and the column has a `sourcecolumn` *metadata* property defined, the `sourcecolumn` value, the actual source of the data, will be mentioned in the error, not the output column name.
diff --git a/_docs/3.0.0/usage/info-file.md b/_docs/3.0.0/usage/info-file.md
new file mode 100644
index 000000000..f8e219221
--- /dev/null
+++ b/_docs/3.0.0/usage/info-file.md
@@ -0,0 +1,108 @@
+---
+layout: docs
+title: Usage - _INFO file
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+## Table Of Contents
+
+
+- [Table Of Contents](#table-of-contents)
+- [Description](#description)
+- [Validation](#validation)
+- [Additional Information](#additional-information)
+
+
+## Description
+
+A file named `_INFO` placed within the source directory together with the raw data is a JSON file tracking control measures via [Atum][atum]. An example of what the file should contain can be found [in the code][info-file].
+
+## Validation
+
+The _\_INFO file_ verification consists of checking that it has an array field named `checkpoints`. The array has to have at least two objects, one named (field `name`) _"Raw"_ and one _"Source"_. Each of them has to have an array field `controls`. This array has to contain a control of type count (`"controlType": "controlType.Count"`) with a control value (field `controlValue`) containing a positive integer.
+
+E.g.
+
+```json
+{
+  ...
+  "checkpoints": [
+    {
+      "name": "Source",
+      "processStartTime": "??? (timestamp)",
+      "processEndTime": "??? (timestamp)",
+      "workflowName": "Source",
+      "order": "??? (positive integer)",
+      "controls": [
+        {
+          "controlName": "recordCount",
+          "controlType": "controlType.Count",
+          "controlCol": "???",
+          "controlValue": "??? (positive integer)"
+        }
+      ]
+    },
+    {
+      "name": "Raw",
+      "processStartTime": "??? (timestamp)",
+      "processEndTime": "??? (timestamp)",
+      "workflowName": "Raw",
+      "order": "???",
+      "controls": [
+        {
+          "controlName": "recordCount",
+          "controlType": "controlType.Count",
+          "controlCol": "???",
+          "controlValue": "??? (positive integer)"
+        }
+      ]
+    }
+  ]
+}
+```
+
+For a fully expanded example go [here][info-file].
+
+## Additional Information
+
+Additional information regarding the processing is added into the \_INFO file during **Standardization** and **Conformance**.
+
+| Metadata-Key | Description |
+|-----------------------------|-----------------|
+| `conform_driver_memory` | The amount of memory used to run Conformance |
+| `conform_enceladus_version` | Which version of Enceladus was used to run Conformance |
+| `conform_errors_count` | Number of errors after conformance |
+| `conform_executor_memory` | Memory requested per executor for Conformance |
+| `conform_executors_num` | How many executors were used for conformance |
+| `conform_input_data_size` | The size of the input data (without metadata) to Conformance. Usually it is the same as the size of standardized data since Conformance is run after Standardization |
+| `conform_output_data_size` | The size of conformed/published data (without metadata such as lineage or the _INFO file) |
+| `conform_output_dir_size` | The size of the published directory including metadata |
+| `conform_records_failed` | Number of records that have at least one error after Conformance |
+| `conform_size_ratio` | Size of the conformed/published folder in relation to the standardized folder |
+| `conform_spark_master` | Spark master of the Conformance job (usually yarn) |
+| `conform_username` | User account under which Conformance was performed |
+| `csv_delimiter` | Delimiter used; dependent on the input file, e.g. csv |
+| `raw_format` | Format of raw data, e.g. `csv`, `json`, `xml`, `cobol` |
+| `source_record_count` | The number of records in the dataset when it was exported from the source system |
+| `std_application_id` | Spark Application unique id of the Standardization Job |
+| `std_errors_count` | Number of errors after standardization |
+| `std_executor_memory` | Memory requested per executor for Standardization |
+| `std_executors_num` | How many executors were used for Standardization |
+| `std_input_dir_size` | The size of the raw folder |
+| `std_output_data_size` | Size of the output data after standardization |
+| `std_output_dir_size` | The size of the standardized folder |
+| `std_records_failed` | Number of records that have at least one error after standardization |
+| `std_records_succeeded` | Number of records that have no errors after standardization |
+| `std_spark_master` | Spark master of the Standardization job (usually yarn) |
+| `std_username` | User account under which Standardization was performed |
+| `std_yarn_deploy_mode` | Yarn deployment mode used (client or cluster) |
+
+[atum]: https://github.com/AbsaOSS/atum
+[info-file]: https://github.com/AbsaOSS/enceladus/blob/master/examples/data/input/_INFO
diff --git a/_docs/3.0.0/usage/menas-api.md b/_docs/3.0.0/usage/menas-api.md
new file mode 100644
index 000000000..44d606900
--- /dev/null
+++ b/_docs/3.0.0/usage/menas-api.md
@@ -0,0 +1,154 @@
+---
+layout: docs
+title: Usage - Menas API
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+
+- [Export Entity](#export-entity)
+  - [Example Import Response for Schema](#example-import-response-for-schema)
+- [Entity Import](#entity-import)
+  - [JSON payload](#json-payload)
+  - [Example Export Response for Schema](#example-export-response-for-schema)
+
+
+### Export Entity
+
+This endpoint exports a single entity from Menas. All exports are versionless, except for connected entities. They are kept as is, but need to be validated before import.
+
+`GET {menas_url}/api/{entity}/exportItem/{name}/{version}`
+
+- entity is `schema`, `dataset` or `mappingTable`
+- name is the name of the entity as seen in Menas
+- version is the version of the entity as seen in Menas. Version is optional and in that case the latest version will be used
+
+**Responses:**
+
+| Code | Description |
+|------|-------------|
+| 200 | Successful export with the exported entity in the response payload |
+| 404 | Requested entity was not found |
+
+#### Example Import Response for Schema
+
+**Top level keys**
+
+| Key | Value | Description |
+|---|---|---|
+| metadata | Struct | Metadata for the export functionality |
+| item | Struct | Exported entity |
+
+**Metadata keys**
+
+| Key | Value | Description |
+|---|---|---|
+| exportVersion | Integer | Version of the export and model used |
+
+**Item keys**
+
+| Key | Value | Description |
+|---|---|---|
+| name | String | the name of the entity |
+| description | String | the description of the entity |
+| fields | Struct | fields specified in the schema. A complex type representing columns. |
+
+**Fields keys**
+
+| Key | Value | Description |
+|---|---|---|
+| name | String | the name of the column |
+| type | String | the type of the column |
+| path | String | parent column name |
+| absolutePath | String | absolute path to the column |
+| elementType | String | if the field is a struct or array |
+| nullable | Boolean | boolean value specifying nullability |
+| metadata | Struct | metadata of the field, like patterns, default value, etc. |
+| children | Array of Fields | array of fields. Sub-columns/fields to this one. |
+
+
+Example:
+```json
+{
+  "metadata": {
+    "exportVersion": 1
+  },
+  "item": {
+    "name": "Name",
+    "description": "",
+    "fields": [
+      {
+        "name": "Boolean",
+        "type": "boolean",
+        "path": "",
+        "elementType": null,
+        "containsNull": null,
+        "nullable": true,
+        "metadata": {
+          "default": null
+        },
+        "children": [],
+        "absolutePath": "Boolean"
+      },
+      {
+        "name": "Byte",
+        "type": "byte",
+        "path": "",
+        "elementType": null,
+        "containsNull": null,
+        "nullable": true,
+        "metadata": {},
+        "children": [],
+        "absolutePath": "Byte"
+      }
+    ]
+  }
+}
+```
+
+### Entity Import
+
+This endpoint imports a single entity. All imports are versionless. If the import does not find an entity with the same name it will create a new one and start the version from 1. If the import finds an existing version, it will update the previous version.
+
+Versions of connected entities need to be specified properly. Export of a Dataset carries a Schema and maybe some Mapping Tables as connected entities. These have versions and these versions need to exist on Import.
+
+`POST {menas_url}/api/{entity}/importItem`
+
+- entity is `schema`, `dataset` or `mappingTable`
+- expects a JSON payload
+
+**Responses:**
+
+| Code | Description |
+|------|-------------|
+| 201 | Successfully imported, with the created/updated entity in the response payload |
+| 400 | Bad request. Import payload had validation errors. Errors returned in response json payload |
+
+#### JSON payload
+
+The JSON payload is the same as the JSON response from the [export](#example-import-response-for-schema).
+
+#### Example Export Response for Schema
+
+On success, it is the same as the JSON payload of the update or create API, depending on whether the entity name already existed or not.
+
+On failure, you will get a list of errors produced by the validation like below
+
+```json
+{
+  "errors": {
+    "item.name": [
+      "name 'null' contains unsupported characters"
+    ],
+    "metadata.exportApiVersion": [
+      "Export/Import API version mismatch. Acceptable version is 1. Version passed is 2"
+    ]
+  }
+}
+```
+
+You have a key `errors`, which is a struct that will hold other keys from the JSON Payload sent and messages of the issues found with this key. There can be multiple messages connected to one key.
+
+In this example we see that we have forgotten to send the `name` of the entity and there was a mismatch between versions of export/import used.
diff --git a/_docs/3.0.0/usage/menas-config.md b/_docs/3.0.0/usage/menas-config.md
new file mode 100644
index 000000000..ca8a0fcf0
--- /dev/null
+++ b/_docs/3.0.0/usage/menas-config.md
@@ -0,0 +1,33 @@
+---
+layout: docs
+title: Usage - Menas Configuration
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+## Table Of Contents
+
+- [Table Of Contents](#table-of-contents)
+- [Intro](#intro)
+- [General options](#general-options)
+
+
+## Intro
+
+This page describes the configuration of _Menas_: the configuration values stored in `menas`'s `application.properties` (or its [template][app-props-template]).
+
+## General options
+
+{% include config_options.html file="menas-configuration_3_0_0" %}
+
+## Note that
+* `menas.schemaRegistry.baseUrl` may not be present (in that case, the option to load a schema from a schema registry by subject name will disappear in the Menas UI)
+
+* specifying `javax.net.ssl.{trustStore|keyStore}` (and the passwords) is usually necessary (both of them) to successfully load a schema file from a secure schema registry, but these settings will be used by the by-URL loading as well (if supported by the webserver reached)
+
+
+[readme]: https://github.com/AbsaOSS/enceladus/blob/master/README.md
+[app-props-template]: https://github.com/AbsaOSS/enceladus/blob/master/menas/src/main/resources/application.properties.template
diff --git a/_docs/3.0.0/usage/menas-conformance-rules.md b/_docs/3.0.0/usage/menas-conformance-rules.md
new file mode 100644
index 000000000..a6861821a
--- /dev/null
+++ b/_docs/3.0.0/usage/menas-conformance-rules.md
@@ -0,0 +1,95 @@
+---
+layout: docs
+title: Usage - Menas Conformance Rules
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+
+## Table Of Contents
+
+- [Table Of Contents](#table-of-contents)
+- [Intro](#intro)
+- [Casting Conformance Rule](#casting-conformance-rule)
+- [Coalesce Conformance Rule](#coalesce-conformance-rule)
+- [Concatenation Conformance Rule](#concatenation-conformance-rule)
+- [Drop Conformance Rule](#drop-conformance-rule)
+- [FillNulls Conformance Rule](#fillnulls-conformance-rule)
+- [Literal Conformance Rule](#literal-conformance-rule)
+- [Mapping Conformance Rule](#mapping-conformance-rule)
+- [Negation Conformance Rule](#negation-conformance-rule)
+- [SingleColumn Conformance Rule](#singlecolumn-conformance-rule)
+- [SparkSessionConf Conformance Rule](#sparksessionconf-conformance-rule)
+- [Uppercase Conformance Rule](#uppercase-conformance-rule)
+
+
+## Intro
+
+Conformance Rules are a way for the user to enhance the data. Every conformance rule has an output column and an option for running a control measure. The output column text field defines the name of the column into which the rule will output its result. The control measure check will run an Atum control measure check, as in previous stages, as defined in the controls of the `_INFO` file.
+
+Every rule also has one or more input columns.
+
+**Important** - We never override a column. Each rule produces a new column.
+
+## Casting Conformance Rule
+
+The Casting conformance rule allows users to cast a specific column to another type. This conformance rule provides a selection of other types.
+
+Allowed Conversions are:
+
+| From | To |
+|---|---|
+| Anything | String |
+| Boolean | Any Numeric |
+| Any Numeric Integer | Any Numeric given that it fits |
+| Any Floating point Numeric | Any Floating point Numeric given that it fits |
+| String | Any Numeric or Time, or Boolean given it is `"true"` or `"false"` |
+| Date | Timestamp |
+| Timestamp | Date |
+
+## Coalesce Conformance Rule
+
+The Coalesce conformance rule fills the new column with the first non-null value from the list of columns.
+
+## Concatenation Conformance Rule
+
+The Concatenation conformance rule concatenates two or more input columns together into a single column. Columns are first transformed into their string representation before the actual concatenation.
+
+## Drop Conformance Rule
+
+The Drop conformance rule removes a column from the output data.
+
+## FillNulls Conformance Rule
+
+The FillNulls conformance rule takes a column and replaces all the nulls with a literal provided by the user.
+
+## Literal Conformance Rule
+
+The Literal conformance rule adds a column with a string literal provided by the user.
+
+## Mapping Conformance Rule
+
+To use a mapping conformance rule, the user first needs to define a Mapping Table in Menas. Mapping Tables have the same properties and rules around them as Datasets, and they are expected to be in parquet format.
+
+When defining a mapping conformance rule, users first need to pick the correct Mapping Table and the correct version. Then there is the question of whether the data can have null values in the join conditions, i.e. whether the join that will be executed should be null safe or not.
+
+Then the join conditions convey the relationship between the Dataset and the Mapping Table. The join condition specifies how the rows from one table will be combined with the rows of the other table. This is based on the equality of the values in the selected columns.
+
+Last are the output columns that specify which columns from the mapping table will be written into their respective output column. The mapping table column is called the _target value_.
+
+## Negation Conformance Rule
+
+The Negation conformance rule negates any Numerical or Boolean value.
+
+## SingleColumn Conformance Rule
+
+The Single column conformance rule transforms a column into a column of structs of the previous column's values. The input column alias here will be the name/key of the struct.
+
+## SparkSessionConf Conformance Rule
+
+The SparkSessionConf conformance rule is able to pull out a configuration value from the SparkSession based on the key provided.
+
+## Uppercase Conformance Rule
+
+The Uppercase conformance rule transforms all character letters in the column into capital letters.
diff --git a/_docs/3.0.0/usage/menas-quick-start.md b/_docs/3.0.0/usage/menas-quick-start.md
new file mode 100644
index 000000000..2de7192ac
--- /dev/null
+++ b/_docs/3.0.0/usage/menas-quick-start.md
@@ -0,0 +1,80 @@
+---
+layout: docs
+title: Usage - Menas Quick Start
+version: '3.0.0'
+categories:
+  - '3.0.0'
+  - usage
+---
+
+First of all, to run anything we will need a dataset and its corresponding schema defined in the Menas UI. The _Standardization_ and _Conformance_ spark jobs will then take this information from Menas when run. First, let us go to the URL where it is deployed. There you will be greeted by a login window. If you made no changes to the `application.conf`, your default login is `user` for the username and `changeme` for the password. Otherwise log in with the credentials you have chosen.
+
+{% include image.html name="usage/menas_login.png" alt="Menas Login Page" %}
+
+After logging in, you should be greeted by the dashboard. In our case we have no entities created yet so everything shows 0, but after we create something the numbers will change, and after we run something we will even be able to see changes in Today's runs.
+
+{% include image.html name="usage/menas_dashboard.png" alt="Menas Dashboard" %}
+
+In the top left corner of our current view we can see a button usually called a `hamburger menu`; this will open a left pane with navigation. This navigation can be opened by clicking any of the squares from the dashboard as well, but these will lead you directly to the selected item's active entities. On wider screens this menu is always visible.
+
+{% include image.html name="usage/menas_navigation.png" alt="Menas Navigation" %}
+
+The first thing we need to do to be able to run Standardization and Conformance is to create a data schema for the job. The schema represents the data, its types, defaults, etc. If we click on the schema in the navigation pane it will open up the schemas view. This view will currently be empty and at the bottom there is a button `+ New Schema`. Pressing the button will bring up a modal window.
+
+{% include image.html name="usage/menas_schemas_empty.png" alt="Menas Empty Schemas Pane" %}
+
+Creation of a new schema requires only a `Name` and `Description`. Names in all Menas models must always be unique and a combination of Name and Version will always provide a unique identifier of an object.
+
+{% include image.html name="usage/menas_new_schema.png" alt="Menas New Schema" %}
+
+Clicking on the _Save_ button, Menas will validate the _Name_ inputted and save the new object.
+
+{% include image.html name="usage/menas_newly_created_schema.png" alt="Menas Newly Created Schema" %}
+
+Now we need to input a schema. Schemas can be uploaded in several formats: Spark Struct, Cobol Copybook or Avro Schema. Choose what you like the most and upload your schema. For test purposes here, we will use a simple Spark Struct schema:
+
+```json
+{
+  "type" : "struct",
+  "fields" : [ {
+    "name" : "A1",
+    "type" : "string",
+    "nullable" : true,
+    "metadata" : { }
+  }, {
+    "name" : "A2",
+    "type" : "string",
+    "nullable" : true,
+    "metadata" : { }
+  }, {
+    "name" : "A3",
+    "type" : "string",
+    "nullable" : true,
+    "metadata" : { }
+  } ]
+}
+```
+
+For more rules about schemas check [the schema section][schema]. Now click the tab `Upload New`.
+
+{% include image.html name="usage/menas_upload_schema.png" alt="Menas Upload Schema" %}
+
+Click the `Browse...` button to select a schema file from your local disk. When selected, click upload. This will validate the inputted schema and, if everything is all right, upload it. You will be moved back to the `Basic Info` tab and you can see that your Version rose by 1. If you click the tab `Fields` you will see your new schema.
+
+{% include image.html name="usage/menas_schema_view.png" alt="Menas Schema View" %}
+
+Moving on to creating a dataset. Datasets in Menas represent a general location where your datasets of the same type will be periodically landed. You will have to specify which schema the data comply with, where they are and where they should be when Standardization and Conformance are finished.
+
+Creating a new Dataset goes the same way as the schema. We open the left menu pane and select Dataset and then click `+ New dataset`.
+ +We will fill in Name and Description same as in Schema, then we will have to assign a schema. When you click on _Select schema_, find the schema you created and select it. It will automatically select the highest version, but in a combobox bellow you can choose which schema version you would like to use. + +Then there are _raw_ and _publish_ HDFS folder paths. These point to where your data will land in the future. The _raw_ folder path denotes the input location of the data before they go through Standardization and Conformance while the _publish_ folder path marks the target location the data is saved after the process. These paths are not full paths to the data. They are only the first part of the data paths. The second part is automatically generated by the `report-date` and `report-version` of the job you will be running. For now, it suffices to say that the path to your data is `////v/*`. This allows you to have easy daily ingestions of data and even if you need to have a second ingestion you do not need to overwrite your data. + +_Don't mind the orange color. It only means the paths don't exist at the moment of creating the Dataset_ + +{% include image.html name="usage/menas_new_dataset.png" alt="Menas New Dataset" %} + +After clicking on _Save_, we have both the Dataset and accompanying Schema ready and we are able to run Standardization and Conformance. + +[**Next Data Quality Quick Sart**]({{ site.baseurl }}/docs/{{ page.version }}/usage/data-quick-start) diff --git a/_docs/3.0.0/usage/run.md b/_docs/3.0.0/usage/run.md new file mode 100644 index 000000000..3f3657966 --- /dev/null +++ b/_docs/3.0.0/usage/run.md @@ -0,0 +1,238 @@ +--- +layout: docs +title: Usage - How to run +version: '3.0.0' +categories: + - '3.0.0' + - usage +--- + +{% capture docs_path %}{{ site.baseurl }}/docs/{{ page.version }}{% endcapture %} + +## Table Of Content + + +- [Table Of Content](#table-of-content) +- [Requirements and deploy](#requirements-and-deploy) +- [Running Standardization](#running-standardization) +- [Running Conformance](#running-conformance) +- [Running Standardization and Conformance together](#running-standardization-and-conformance-together) +- [Helper scripts](#helper-scripts) +- [Command line options](#command-line-options) + + +## Requirements and deploy + +For description of requirements and deployment see the [README.md][project-readme] in the project root. + +## Running Standardization + +``` +/spark-submit \ +--num-executors \ +--executor-memory G \ +--master yarn \ +--deploy-mode \ +--driver-cores \ +--driver-memory G \ +--conf "spark.driver.extraJavaOptions=-Dmenas.rest.uri= -Dstandardized.hdfs.path=-{0}-{1}-{2}-{3} -Dspline.mongodb.url= -Dspline.mongodb.name= -Dhdp.version=" \ +--class za.co.absa.enceladus.standardization.StandardizationJob \ +.jar> \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +* Here `row-tag` is a specific option for `raw-format` of type `XML`. For more options for different types please see bellow. +* In case Menas is configured for in-memory authentication (e.g. 
in dev environments), replace `--menas-auth-keytab` with `--menas-credentials-file` + +## Running Conformance + +``` +/spark-submit \ +--num-executors \ +--executor-memory G \ +--master yarn \ +--deploy-mode \ +--driver-cores \ +--driver-memory G \ +--conf 'spark.ui.port=29000' \ +--conf "spark.driver.extraJavaOptions=-Dmenas.rest.uri= -Dstandardized.hdfs.path=-{0}-{1}-{2}-{3} -Dconformance.mappingtable.pattern=reportDate={0}-{1}-{2} -Dspline.mongodb.url= -Dspline.mongodb.name=" -Dhdp.version= \ +--packages za.co.absa:enceladus-parent:,za.co.absa:enceladus-conformance: \ +--class za.co.absa.enceladus.conformance.DynamicConformanceJob \ +.jar> \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version +``` + +## Running Standardization And Conformance together as one Spark job + +``` +/spark-submit \ +--num-executors \ +--executor-memory G \ +--master yarn \ +--deploy-mode \ +--driver-cores \ +--driver-memory G \ +--conf "spark.driver.extraJavaOptions=-Dmenas.rest.uri= -Dstandardized.hdfs.path=-{0}-{1}-{2}-{3} -Dspline.mongodb.url= -Dspline.mongodb.name= -Dhdp.version=" \ +--class za.co.absa.enceladus.standardization_conformance.StandardizationAndConformanceJob \ +.jar> \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +* In case Menas is configured for in-memory authentication (e.g. in dev environments), replace `--menas-auth-keytab` with `--menas-credentials-file` + +## Helper scripts + +The Scripts in `scripts` folder can be used to simplify command lines for running Standardization and Conformance jobs. + +Steps to configure the scripts are as follows: +* Copy all the scripts in `scripts` directory to a location in your environment. +* Copy `enceladus_env.template.sh` to `enceladus_env.sh`. +* Change `enceladus_env.sh` according to your environment settings. +* Use `run_standardization.sh` and `run_conformance.sh` or `run_standardization_conformance.sh` scripts instead of directly invoking `spark-submit` to run your jobs. + +The syntax for running Standardization and Conformance is similar to running them using `spark-submit`. The only difference +is that you don't have to provide environment-specific settings. The scripts are set to use Spark's _Dynamic Resource Allocation_ +(DRA) strategy. The scripts also ensure adaptive execution is enabled together with DRA. Upon shuffle operations, it +adjusts the data partitioning to target sizes i.g. HDFS blocks sizes. This improves the efficiency of resource usage and +prevents the issue with small files (when output is split into too many tiny files). + +DRA gets auto-disabled, and spark submit falls back to Spark defaults (preserving other custom parameters) when: + +- `--num-executors` is set. We assume, that this parameter is set when user knows how many resources are needed exactly. Remove this parameter, to enable DRA back. +- `--conf-spark-dynamicAllocation-maxExecutors` is set to empty value. The parameter prevents a job from taking over entire cluster. This parameter is set by script defaults to a safe value, but can be overwritten. 
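+
+For illustration, a hedged sketch of how these DRA settings interact (hypothetical values; the option names are taken from the command line options table below, and the remaining required job options are elided as `...`):
+
+```
+# DRA stays enabled, but this run is capped at 8 executors:
+run_standardization.sh --deploy-mode client --conf-spark-dynamicAllocation-maxExecutors 8 ...
+
+# Specifying --num-executors explicitly auto-disables DRA and falls back to Spark defaults:
+run_standardization.sh --deploy-mode client --num-executors 4 --executor-memory 4g ...
+```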
+ + +The basic command to run **Standardization** becomes: + +``` +/run_standardization.sh \ +--deploy-mode \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +The basic command to run **Conformance** becomes: + +``` +/run_conformance.sh \ +--deploy-mode \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version +``` + +The basic command to run **Standardization And Conformance** becomes: + +``` +/run_standardization_conformance.sh \ +--deploy-mode \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +## Command line options + +The list of options for configuring Spark deployment mode in Yarn and resource specification: + +| Option | Default | Description | +|----------------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| --deploy-mode **cluster/client** | | Specifies a Spark Application deployment mode when Spark runs on Yarn. Can be either `client` or `cluster`. | +| --set-dra **true/false** | `true` | Explicitly eneable/disable Dynamic Resource Allocation. | +| --conf-spark-dynamicAllocation-minExecutors **n** | `0` | Is strongly recommended to be left to default value of 0. | +| --conf-spark-dynamicAllocation-maxExecutors **n** | `4` | Sets max limit on the number of executors. Should never be empty string or infinity on a shared cluster. Can be adjusted based on expected range on input data. | +| --conf-spark-dynamicAllocation-executorAllocationRatio **float** | `0.5` | How many executors per task are allocated. (1/value = tasks per executor). | +| --conf-spark-sql-adaptive-shuffle-targetPostShuffleInputSize **mem** | `134217728` | Target post-shuffle partition size. | +| --num-executors **n** | | Specifies the number of executors to use. Effective only if DRA is off, | +| --executor-memory **mem** | | Specifies an amount of memory to request for each executor. See memory specification syntax in Spark. Examples: `4g`, `8g`. Effective only if DRA is off. | +| --executor-cores **mem** | `1` | Specifies a number of cores to request for each executor. Effective only if DRA is off. | +| --dra-num-executors **n** | | Same as `--num-executors` but used when DRA is enabled. Use with care! DRA won't scale below this number. | +| --dra-executor-memory **mem** | | Same as `--executor-memory` but used when DRA is enabled. | +| --dra-executor-cores **mem** | `1` | Same as `--executor-cores` but used when DRA is enabled. | +| --driver-cores **n** | | Specifies a number of CPU cores to allocate for the driver process. | +| --driver-memory **mem** | | Specifies an amount of memory to request for the driver process. See memory specification syntax in Spark. Examples: `4g`, `8g`. | +| --persist-storage-level **level** | | **Advanced** Specifies the storage level to use for persisting intermediate results. Can be one of `NONE`, `DISK_ONLY`, `MEMORY_ONLY`, `MEMORY_ONLY_SER`, `MEMORY_AND_DISK` (default), `MEMORY_AND_DISK_SER`, etc. 
See more [here](https://spark.apache.org/docs/3.2.1/api/java/index.html?org/apache/spark/storage/StorageLevel.html). | +| --conf-spark-executor-memoryOverhead **mem** | | **Advanced**. The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. Sets `spark.executor.memoryOverhead` Spark configuration parameter. See the detailed description [here](http://spark.apache.org/docs/latest/configuration.html#available-properties). See memory specification syntax in Spark. Examples: `4g`, `8g`. | +| --conf-spark-memory-fraction **value** | | **Advanced**. Fraction of (heap space - 300MB) used for execution and storage (default=`0.6`). Sets `spark.memory.fraction` Spark configuration parameter. See the detailed description [here](http://spark.apache.org/docs/latest/configuration.html#memory-management). | + +For more information on these options see the official documentation on [running Spark on Yarn][spark-running-yarn] + +The list of all options for running both **Standardization** and **Conformance**: + +| Option | Description | +|---------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| --menas-auth-keytab **filename** | A keytab file used for Kerberized authentication to Menas. Cannot be used together with `--menas-credentials-file`. | +| --menas-credentials-file **filename** | A credentials file containing a login and a password used to authenticate to Menas. Cannot be used together with `--menas-auth-keytab`. | +| --dataset-name **name** | A dataset name to be standardized or conformed. | +| --dataset-version **version** | A version of a dataset to be standardized or conformed. | +| --report-date **YYYY-mm-dd** | A date specifying a day for which a raw data is landed. | +| --report-version **version** | A version of the data for a particular day. | +| --std-hdfs-path **path** | A path pattern where to put standardized data. The following tokens are expending in the pattern: `{0}` - dataset name, `{1}` - dataset version, `{2}`- report date, `{3}`- report version. | + +The list of additional options available for running **Standardization**: + +| Option | Default | Description | +|--------------------------------------------------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| --raw-format **format** | | A format for input data. Can be one of `parquet`, `json`, `csv`, `xml`, `cobol`, `fixed-width`. | +| --charset **charset** | `UTF-8` | Specifies a charset to use for `csv`, `json`, `xml`, `cobol` or `fixed-width.` | +| --cobol-encoding **encoding** | | Specifies the encoding of a mainframe file (`ascii` or `ebcdic`). Code page can be specified using `--charset` option. | +| --cobol-is-text **true/false** | | Specifies if the mainframe file is ASCII text file. | +| --cobol-trimming-policy **policy** | | Specifies the way leading and trailing spaces should be handled. Can be `none` (do not trim spaces), `left`, `right`, `both`(default). | +| --copybook **string** | | Path to a copybook for COBOL data format. | +| --csv-escape **character** | `\` | Specifies a character to be used for escaping other characters. | +| --csv-ignore-leading-white-space **true/false** | `false` | Defines whether or not leading whitespaces from values being read should be skipped. 
| --csv-ignore-trailing-white-space **true/false** | `false` | Defines whether or not trailing whitespaces from values being read should be skipped. |
| --csv-quote **character** | `"` | Specifies a character to be used as a quote for creating fields that might contain the delimiter character. |
| --debug-set-raw-path **path** | | Overrides the path of the raw data (used for testing purposes). |
| --delimiter **character** | `,` | Specifies a delimiter character to use for the CSV format. |
| --empty-values-as-nulls **true/false** | `false` | If `true`, treats empty values as `null`s. |
| --folder-prefix **prefix** | | Adds a folder prefix before the date tokens. |
| --header **true/false** | | Indicates if the input CSV data has headers as the first row of each file. |
| --is-xcom **true/false** | | If `true`, a mainframe input file is expected to have XCOM RDW headers. |
| --null-value **value** | `""` _(empty string)_ | Defines how null values are represented in a `fixed-width` file format. |
| --row-tag **tag** | | A row tag if the input format is `xml`. |
| --strict-schema-check **true/false** | `false` | If `true`, processing ends the moment a row not adhering to the schema is encountered; if `false`, processing continues with an entry added to _errCol. |
| --trimValues **true/false** | | Indicates if string fields of fixed-width text data should be trimmed. |

Most of these options are format specific. For details see [the documentation]({{ docs_path }}/usage/standardization-formats).

The list of additional options available for running **Conformance**:

| Option | Default | Description |
|---|---|---|
| --mapping-table-pattern **pattern** | `reportDate={0}-{1}-{2}` | A pattern to look for the mapping table for the specified date. The list of possible substitutions: `{0}` - year, `{1}` - month, `{2}` - day of month. Special symbols in the pattern need to be escaped. For example, an empty pattern can be specified as `\'\'` (single quotes are escaped using a backslash character). |
| --experimental-mapping-rule **true/false** | build-specific and is set in 'application.properties' | If `true`, the experimental optimized mapping rule implementation is used. |
| --catalyst-workaround **true/false** | `true` | Turns on (`true`) or off (`false`) a workaround for a Catalyst optimizer issue. Turn this off only if you encounter timing freeze issues when running Conformance. |
| --autoclean-std-folder **true/false** | | If `true`, the standardized folder will be cleaned automatically after successful execution of a Conformance job. If present, overrides `conformance.autoclean.standardized.hdfs.folder` from the [application configuration]({{ site.baseurl }}/docs/usage/config#general-options). |

All the additional options valid for both _Standardization_ and _Conformance_ can also be specified when running the combined _Standardization And Conformance_ job.

[project-readme]: https://github.com/AbsaOSS/enceladus/blob/master/README.md#how-to-run
[spark-running-yarn]: https://spark.apache.org/docs/latest/running-on-yarn.html
diff --git a/_docs/3.0.0/usage/schema.md b/_docs/3.0.0/usage/schema.md
new file mode 100644
index 000000000..d1f4c3689
--- /dev/null
+++ b/_docs/3.0.0/usage/schema.md
@@ -0,0 +1,622 @@
---
layout: docs
title: Usage - Schema
version: '3.0.0'
categories:
  - '3.0.0'
  - usage
---

{% capture docs_path %}{{ site.baseurl }}/docs/{{ page.version }}{% endcapture %}

## Table Of Contents

- [Table Of Contents](#table-of-contents)
- [Intro](#intro)
- [Automatically added columns](#automatically-added-columns)
- [Data types](#data-types)
  - [String](#string)
  - [Boolean](#boolean)
  - [Decimal](#decimal)
  - [Long](#long)
  - [Integer](#integer)
  - [Short](#short)
  - [Byte](#byte)
  - [Double](#double)
  - [Float](#float)
  - [Timestamp](#timestamp)
  - [Date](#date)
  - [Binary](#binary)
  - [Struct](#struct)
  - [Array](#array)
- [Metadata](#metadata)
  - [sourcecolumn](#sourcecolumn)
  - [default](#default)
  - [pattern](#pattern)
  - [timezone](#timezone)
  - [decimal_separator](#decimal_separator)
  - [grouping_separator](#grouping_separator)
  - [minus_sign](#minus_sign)
  - [allow_infinity](#allow_infinity)
  - [radix](#radix)
  - [encoding](#encoding)
  - [width](#width)
- [Parsing](#parsing)
  - [Parsing timestamps and dates](#parsing-timestamps-and-dates)
    - [Time Zone support](#time-zone-support)
  - [Parsing numbers](#parsing-numbers)
    - [Radix usage](#radix-usage)
    - [Pattern parsing](#pattern-parsing)
    - [Number parsing peculiarities](#number-parsing-peculiarities)
- [Defaults](#defaults)
  - [Explicit default](#explicit-default)
  - [Global default values](#global-default-values)
  - [Explicit default values restrictions](#explicit-default-values-restrictions)
- [Notes](#notes)


## Intro

Schema is the description of fields in a dataset. All and only the fields defined in the schema will be in the output
table. That means fields present in the input but not mentioned in the schema won't be in the output. The exception is
three fields added automatically - [see below](#automatically-added-columns).
Fields are defined in the order they are to be in the output table and have three basic common properties: + +- `name` - the field (column) name +- `type` - data type of the field +- `nullable` (optional) - flag indicating if the data can contain the value *null*, if not specified considered set to *false* + +Furthermore, some type can have additional properties. The details of each supported type, their meaning and additional +properties will be described in the following chapters. + +Thanks to *Data Types* `StructType` and `ArrayType` the fields can be nested – +fields within fields. + +You provide *Schema* to **Standardization** in a JSON file: + +```json +{ + "type": "struct", + "fields": [{ + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "surname", + "type": "string", + "nullable": false, + "metadata": { + "default": "Unknown Surname" + } + }, + { + "name": "hoursWorked", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": false + }, + "nullable": false, + "metadata": {} + }, + { + "name": "employeeNumbers", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [{ + "name": "numberType", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "numbers", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": true + }, + "nullable": true, + "metadata": {} + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "startDate", + "type": "date", + "nullable": false, + "metadata": { + "pattern": "yyyy-MM-dd" + } + }, + { + "name": "updated", + "type": "timestamp", + "nullable": true, + "metadata": { + "pattern": "yyyyMMdd.HHmmss" + } + } + ] +} +``` + +Example of data adhering to the above schema can be found [here][test-samples]. + +## Automatically added columns + +There is a column automatically added to each **Standardization** output. Its name is `errCol` and it contains information +on all errors that happened on the particular row *standardization*. If defined in schema its structure there has to +adhere exactly to the automatically added one. More on this field [see in dedicated documentation][errcol]. + +## Data types + +### String + +The data type representing *String* values. + +**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [width](#width) + +### Boolean + +The data type representing *Boolean* values. + +**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [width](#width) + +### Decimal + +The data type representing *BigDecimal* values, a fixed-point numeric type. The type is specified by two additional +parameters, *precision* and *scale*. *Precision* limits the number of digits and cannot be greater than 38. *Scale* +specifies the number of digits after the decimal point and has to be equal or less than the *precision*. + +The type is specified as `decimal(`*precision*, *scale*`)`, for example: `decimal(15, 3)` + +**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [timezone](#timezone), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [width](#width) + +### Long + +The data type representing *Long* values. That is a whole number between -9223372036854775808 and 9223372036854775807. 
**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [radix](#radix), [width](#width)

### Integer

The data type representing *Integer* values. That is a whole number between -2147483648 and 2147483647.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [radix](#radix), [width](#width)

### Short

The data type representing *Short* values. That is a whole number between -32768 and 32767.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [radix](#radix), [width](#width)

### Byte

The data type representing *Byte* values. That is a whole number between -128 and 127.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [radix](#radix), [width](#width)

### Double

The data type representing *Double* values, 64-bit (IEEE 754) double-precision float.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [allow_infinity](#allow_infinity), [width](#width)

### Float

The data type representing *Float* values, 32-bit (IEEE 754) single-precision float.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [decimal_separator](#decimal_separator), [grouping_separator](#grouping_separator), [minus_sign](#minus_sign), [allow_infinity](#allow_infinity), [width](#width)

### Timestamp

The data type representing *java.sql.Timestamp* values. Upon entry they are normalized to the UTC time zone.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [timezone](#timezone), [width](#width)

### Date

The data type representing *java.sql.Date* values. If a time zone is specified, the date is adjusted to UTC.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [pattern](#pattern), [timezone](#timezone), [width](#width)

### Binary

The data type representing *Binary* values.

**Metadata keys:** [sourcecolumn](#sourcecolumn), [default](#default), [encoding](#encoding), [width](#width)

### Struct

The data type representing a structure of one or more sub-fields.

The type is specified as a struct of the following properties:

- `type` - string value *"struct"*
- `fields` - array of fields

**Metadata keys:** [sourcecolumn](#sourcecolumn)

### Array

The data type representing an array of values of another type.

The type is specified as a struct of the following properties:

- `type` - string value *"array"*
- `elementType` - the type of the elements of the array, can be any of the types including [`struct`](#struct) and
[`array`](#array)
- `containsNull` - boolean value

**Metadata keys:** [sourcecolumn](#sourcecolumn)

## Metadata

*Standardization* can be influenced by `metadata` in the schema of the data. The `metadata` are optional properties.
+Here are the recognized ones with the description of their purpose (with detailed description below): + +| Property | Target data type | Description | Example | Default[^1] | +|-------------------------------------------|-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------------------------------------------| +| [sourcecolumn](#sourcecolumn) | any | The source column to provide data of the described column | *id* | `-`[^2] | +| [default](#default) | any atomic type | Default value to use in case data are missing | *0* | `-`[^2] | +| [pattern](#pattern) | timestamp & date | Pattern for the timestamp or date representation | *dd.MM.yy* | *yyyy-MM-dd HH:mm:ss* **/** *yyyy-MM-dd* | +| [timezone](#timezone) | timestamp (also date) | The time zone of the timestamp when that is not part of the pattern (NB! for date it can return unexpected results) | *US/Pacific* | *UTC*[^3] | +| [pattern](#pattern) | any numeric type | Pattern for the number representation | \#,\#\#0.\# | `-`[^2] | +| [decimal_separator](#decimal_separator) | any numeric type | The character separating the integer and the fractional parts of the number | *,* | *.* | +| [grouping_separator](#grouping_separator) | any numeric type | Character to mark boundaries between orders of magnitude, usually to mark thousands, millions etc. | *\_* | *,* | +| [minus_sign](#minus_sign) | any numeric type | Character to mark the number is negative. | *N* | *-* | +| [allow_infinity](#allow_infinity) | float & double | Flag indicating if the column accepts infinity as a value (and positive/negative numbers which are too large are converted to *infinity*/*-infinity*) | *true* | *false* | +| [strict_parsing](#strict_parsing) | decimal | Flag indicating if strict parsing should be done on the input value. Strict parsing rejects a value with more decimal places than the defined scale | *true* | *false* | +| [radix](#radix) | long, integer, short, byte | The base of the numbers provided | *hex* | *10* | +| [encoding](#encoding) | binary | Encoding is used for string to binary conversion | *base64*,*none* | `-` (explained in [encoding](#encoding)) | +| [width](#width) | any atomic type | Specifies the width of a column for a fixed-width formats | "10" | - | + +**NB!** All values in _metadata_ have to be entered as *string*. Even if they would conform to other types, like number +or boolean. + +### sourcecolumn + +**Supported by types:** [String](#string), [Boolean](#boolean), [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float), [Timestamp](#struct), [Date](#timestamp), [Struct](#struct), [Array](#array) + +The name of the column to get the data from (so it only makes sense if it's different +from field name). The most common use case is when the original column +name is not a valid Parquet field name. It can also be used in the rare cases +when the column needs to be standardized into more fields and/or different types. + +### default + +**Supported by types:** [String](#string), [Boolean](#boolean), [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float), [Timestamp](#struct), [Date](#timestamp) + +This is the value to be used in case the input is missing (and nulls are not +allowed) or when the casting (*standardization*) fails. 
You can think of this as a *fallback value*.

It should be noted that this is the only _metadata_ key which accepts the `null` value (written without quotes) in addition to
string values (of course, such a field has to be nullable: `"nullable": true`).

For more about the topic see chapter [Defaults](#defaults).

### pattern

**Supported by types:** [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float), [Timestamp](#timestamp), [Date](#date)

The format the input adheres to. Mostly used for timestamp and date entries but
it can be leveraged for numeric types too. Details for valid patterns are in
the chapter [Parsing](#parsing).

In case a `default` value is specified in _metadata_, it needs to adhere to
the pattern.

If [radix](#radix) is specified and differs from the default 10, the `pattern` value
will be ignored.

### timezone

**Supported by types:** [Timestamp](#timestamp), [Date](#date)

Time zone of the timestamp or date (not recommended for the latter). For details see the chapter
[Parsing timestamps and dates](#parsing-timestamps-and-dates).

In case the [`pattern`](#pattern) already includes information to recognize the time zone, the `timezone` entry in _metadata_ will
be ignored. Namely if the pattern includes the *"z"*, *"Z"* or *"X"* placeholder or the *"epoch"*, *"epochmilli"*,
*"epochmicro"* and *"epochnano"* keywords.

**NB!** Due to a Spark limitation, only time zone IDs are accepted as valid values. To get the full list of supported time
zone denominators see the output of Java's [`TimeZone.getAvailableIDs()` function][oracle-tz-ids].

### decimal_separator

**Supported by types:** [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float)

The character separating the integer and the fractional parts of the number.

For whole numbers which use *"."* as the [`grouping_separator`](#grouping_separator), the `decimal_separator` has to be
redefined to avoid conflict.

### grouping_separator

**Supported by types:** [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float)

Character to mark boundaries between orders of magnitude, usually to mark
thousands, millions etc.

It has to be used in the pattern to be taken into consideration.

### minus_sign

**Supported by types:** [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float)

Character to mark the number is negative. By default, it's the standard minus
sign (*"-"*).

### allow_infinity

**Supported by types:** [Double](#double), [Float](#float)

Flag indicating if the column accepts infinity as a value. When set to true, *infinity*/*-infinity* are recognized as
valid values, instead of failing with a casting error ([see here][errcol]).
The strings representing infinity on input are *"∞"* and *"-∞"* respectively. Positive and negative numbers with values
that are too large are converted to *infinity* and *-infinity*, respectively.

### strict_parsing

**Supported by types:** [Decimal](#decimal)

Flag indicating whether strict parsing should be applied to the input values.
Strict parsing rejects a value with more decimal places than the scale (second number) of the field's Decimal type definition.
This results in a casting error in the error column ([see here][errcol]).
+For example for Decimal(X,2), the values with longer scale (like 10.12345, 0.1234) will be rejected. +Default value has to fit the scale as well in the case of strict parsing. + +### radix + +**Supported by types:** [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte) + +The radix (base) of the numbers entered. Accepted values are numbers between 1 +and 36, as well as the following keywords (case insensitive): *"dec"*, +*"decimal"*, *"hex"*, *"hexadecimal"*, *"bin"*, *"binary"*, *"oct"*, *"octal"*. + +For higher bases, letters (A, B, C etc.) are used for digits (case insensitive). + +For hexadecimal value entries in the form *"0xFF"* are accepted as well. + +If `radix` is specified as anything other than the default 10, [pattern](#pattern) +value will be ignored. + +### encoding + +**Supported by types:** [Binary](#Binary) + +**When a string value is being converted to binary**, the supplied `encoding` indicates how the values are going to be +treated. This applies for the default value, too: + - `none` - the input will get cast as-is to binary. E.g. "abc" -> [97, 98, 99] + - `base64` - the input is considered as Base64-encoded and will get unencoded. Contrary to the basic Spark behavior of + `unbase64` (which skips characters invalid for Base64), this will result in an error. + +If `encoding` is missing altogether when it would be needed (e.g. when default value is given), `ValidationWarning` is + issued and the encoding value is considered to be `none`. + +`encoding` is not considered if BinaryType is already found in the input (no conversion is happening there). + +### width + +**Supported by types:** [String](#string), [Boolean](#boolean), [Decimal](#decimal), [Long](#long), [Integer](#integer), [Short](#short), [Byte](#byte), [Double](#double), [Float](#float), [Timestamp](#struct), [Date](#timestamp) + +Specifically for the Fixed-Width data format. Specifies the width of the column. + +## Parsing + +### Parsing timestamps and dates + +Dates and especially timestamps (date + time) can be tricky. Currently Spark considers all time entries to be in the +system's time zone by default. (For more detailed explanation of possible issues with that see +[Consistent timestamp types in Hadoop SQL engines][timestamp-types].) + +To address this potential source of discrepancies the following has been implemented: + +1. All Enceladus components are set to run in UTC +1. As part of **Standardization** all time related entries are normalized to UTC +1. There are several methods how to ensure that a timestamp entry is normalized as expected +1. We urge users, that all timestamp entries should include time zone information in one of the supported ways +1. While this is all valid for date entries too, it should be noted that UTC normalization of a date can have unexpected +consequences - namely all dates west from UTC would be shifted to a day earlier + +To enable processing of time entries from other systems **Standardization** offers the possibility to convert +string and even numeric values to timestamp or date types. This is done using Spark's ability to convert strings to +timestamp/date with some enhancements. The pattern placeholders and usage is described in Java's +[`SimpleDateFormat` class description][oracle-simple-date-format] with +the addition of recognizing some keywords (like `epoch` and `milliepoch` (case insensitive)) to denote the number of +seconds/milliseconds since epoch (1970/01/01 00:00:00.000 UTC) and some additional placeholders. 
+It should be noted explicitly that *"epoch"*, *"epochmilli"*, *"epochmicro"* and *"epochnano"* are considered a pattern +including time zone. + +Summary: + +| placeholder | Description | Example | +|--------------------------------------------|--------------------------------------------------|----------------------------------------| +| `G` | Era designator | AD | +| `y` | Year | 1996; 96 | +| `Y` | Week year | 2009; 09 | +| `M` | Month in year (context sensitive) | July; Jul; 07 | +| `L` | Month in year (standalone form) | July; Jul; 07 | +| `w` | Week in year | 27 | +| `W` | Week in month | 2 | +| `D` | Day in year | 189 | +| `d` | Day in month | 10 | +| `F` | Day of week in month | 2 | +| `E` | Day name in week | Tuesday; Tue | +| `u` | Day number of week (1 = Monday, ..., 7 = Sunday) | 1 | +| `a` | Am/pm marker | PM | +| `H` | Hour in day (0-23) | 0 | +| `k` | Hour in day (1-24) | 24 | +| `K` | Hour in am/pm (0-11) | 0 | +| `h` | Hour in am/pm (1-12) | 12 | +| `m` | Minute in hour | 30 | +| `s` | Second in minute | 55 | +| `S` | Millisecond | 978 | +| `z` | General time zone | Pacific Standard Time; PST; GMT-08:00 | +| `Z` | RFC 822 time zone | -0800 | +| `X` | ISO 8601 time zone | -08; -0800; -08:00 | +| `epoch` | Seconds since 1970/01/01 00:00:00 | 1557136493, 1557136493.136 | +| `epochmilli` | Milliseconds since 1970/01/01 00:00:00.0000 | 1557136493128, 1557136493128.001 | +| `epochmicro` | Microseconds since 1970/01/01 00:00:00.0000 | 1557136493128789, 1557136493128789.999 | +| `epochnano`[^4] | Nanoseconds since 1970/01/01 00:00:00.0000 | 1557136493128789101 | +| `i` | Microsecond | 111, 321001 | +| `n`[*](#parsing-star) | Nanosecond | 999, 542113879 | + +**NB!** Spark uses US Locale and because on-the-fly conversion would be complicated, at the moment we stick to this +hardcoded locale as well. E.g. `am/pm` for `a` placeholder, English names of days and months etc. + +**NB!** The keywords are case **insensitive**. Therefore, there is no difference between `epoch` and `EpoCH`. + +#### Time Zone support + +As mentioned, it is highly recommended to use timestamps with time zone but it's not unlikely that the +source for standardization doesn't provide time zone information. On the other hand, these times are usually within +one time zone. To ensure proper standardization, the schema's _metadata_ can include the `timezone` value. +When set, all timestamps in the column will be standardized as belonging to the particular time zone. + +E.g. _2019-05-04 11:31:10_ with `timzone` specified as _CET_ will be standardized to _2019-05-04 10:31:10_ (UTC of +course) + +### Parsing numbers + +When converting *string* to any of the numeric types there are two standard formats accepted: + +1. the usual string of digits with the eventual minus or plus sign in front and optional decimal separator (e.g *3.14*) +1. the scientific notation, where the numbers are expressed as the product of a mantissa and a power of ten +(e.g. 1234 can be expressed as 1.234 x 10^3 = 1.234E3) + +Note, that for whole numbers ([Long](#long), [Integer](#integer), [Short](#short) and [Byte](#byte)), the decimal +separator must not be present. + +If the string is being parsed to [decimal type](#decimal) and the input has more decimal places than is the *scale* of +the decimal type, the result will be rounded to the number of decimal places allowed by *scale*. + +#### Radix usage + +For whole numbers, the numbers can be entered using a different radix (base) than the usual 10. 
For radices smaller than +10, the appropriate subset of numeric digits are accepted. For radices above 10, letters are used for the digit +representation. The letters are case insensitive, therefore 1Fa = 1fA. + +To specify a non-standard radix (different from 10), use the [`Radix` *metadata* key](#radix). The radix has to be between 1 +and 36. + +#### Pattern parsing + +When the number is formatted in some non-standard way you can use a pattern. The parsing is executed using +the *Java* class `DecimalFormat`, whose [documentation][oracle-decimal-format] +provides the most comprehensive explanation of patterns and their usage. + +Pattern contains a positive and negative subpattern, for example, `#,##0.00;(#,##0.00)`. Each subpattern has a prefix, +numeric part, and suffix. The negative subpattern is optional; if absent, the positive subpattern prefixed with the +minus sign ('-' in most locales) is used as the negative subpattern. That is, `0.00` alone is equivalent to `0.00;-0.00`. +If there is an explicit negative subpattern, it serves only to specify the negative prefix and suffix; the number of +digits, minimal digits, and other characteristics are all the same as the positive pattern. That means that `#,##0.0#;(#)` +produces precisely the same behavior as `#,##0.0#;(#,##0.0#)`. + +The prefixes, suffixes, and various symbols used for infinity, digits, thousands separators, decimal separators, etc. may +be set to arbitrary values. However, care must be taken that the symbols and strings do not conflict, or parsing will be +unreliable. For example, either the positive and negative prefixes or the suffixes must be distinct for parsing to be able +to distinguish positive from negative values. Another example is that the decimal separator and thousands separator should +be distinct characters, or parsing will be impossible. + +| Symbol | Location | Meaning | Metadata to change | +|--------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| +| `0` | Number | Digit | | +| `#` | Number | Digit, zero shows as absent | | +| `.` | Number | Decimal separator | [Decimal separator](#decimal_separator)[^5] NB! | +| `-` | Number | Minus sign | [Minus sign](#minus_sign)[^5] NB! | +| `,` | Number | Grouping separator | [Grouping separator](#grouping_separator)[^5] NB! | +| `E` | Number | Separates mantissa and exponent in scientific notation. Need not be quoted in prefix or suffix. | | +| `;` | Subpattern boundary | Separates positive and negative subpatterns | | +| `%` | Prefix or suffix | Divide by 100 on parsing | | +| `‰` | Prefix or suffix | Divide by 1000 on parsing | | +| `'` | Prefix or suffix | Used to quote special characters in a prefix or suffix, for example, the "'#'#" pattern allows the value `#123` to be read in as the number `123`. To create a single quote itself, use two in a row: "# o''clock". | | +| `∞` | *not part of pattern* | String to represent infinity | | + +**NB!** If there's no special format of the input, it's advised to avoid the usage of patterns due to the added +performance hit (parsing using a pattern adds computational overhead). It's still possible to redefine the special +symbols like [decimal separator](#decimal_separator) or [minus sign](#minus_sign) even without the pattern. 
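To tie the pattern and separator settings together, here is a sketch of a single schema field (the field name, precision and the particular symbols are illustrative only, not taken from the samples) that redefines the input separators while the pattern itself keeps the standard *","* and *"."* symbols:

```json
{
  "name": "amount",
  "type": "decimal(15, 2)",
  "nullable": true,
  "metadata": {
    "pattern": "#,##0.#",
    "decimal_separator": ",",
    "grouping_separator": " "
  }
}
```

With such a definition, an input value like _1 234,5_ would be standardized to _1234.5_; note that the redefined symbols are used only in the data, while the pattern still uses the standard ones.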
#### Number parsing peculiarities

- number of `#` and `0` in the pattern is not reliable, the input is parsed regardless (e.g. *"0.0"* produces the same
result as *"##.#"*)
- Grouping separator placement is "flexible". If the grouping separator has been added to the pattern the number is
parsed regardless of the placement(s) of the grouping separator in the parsed strings (e.g. pattern `#,##0.#` will parse
_1,000.0_ and _1,1234.5_ as well as _2000._)
- Grouping separator is not accepted in decimal places
- _"+"_ is not accepted when a pattern is specified, unless explicitly present in the pattern
- `%` and `‰` in fractional and decimal values and patterns divide the number by 100 and 1000 (in integral values as well,
but then the results are usually not whole numbers)
- Even if redefined, the standard grouping and decimal separators **need to be used** in the pattern
- If a pattern is used, `e` is not accepted, only `E`, in the exponential expression (without a pattern both are recognized)

## Defaults

As described, when a field fails to standardize, either because of missing data in a non-nullable column
or because the cast to the target type failed, the field is populated with a default value and an error is added to
the [error column][errcol].

### Explicit default

The value optionally specified in the dataset's schema for **Standardization** via the _metadata_ [default](#default)
property, to be used for the particular column.

### Global default values

The value used when an _explicit default_ was not defined in the schema:

- `null` for nullable column
- `0` for numeric column
- `false` for Boolean column
- `""` (empty string) for string column
- `Array.empty[Byte]` (empty array of bytes) for binary column
- `1970/01/01` for date column
- `1970/01/01 00:00:00` for timestamp column

- default timezone if not specified for both timestamps and dates is *UTC*; this can be changed via the application settings
`defaultTimestampTimeZone` and `defaultDateTimeZone`
- default locale, and the decimal symbols based on it, is *US*. Therefore:
  - minus sign is *"-"*
  - decimal separator is *"."*
  - grouping separator is *","*

### Explicit default values restrictions

- The value has to be a string convertible to the field's type and fitting within its size limitations (e.g. _"200"_
cannot be a `default` for the type [`Byte`](#byte), or _"∞"_ if `allow_infinity` is _"false"_ for [`Double`](#double)/
[`Float`](#float))
- If it's a type supporting [`pattern`](#pattern) and it is defined, the default value has to adhere to the `pattern`
- If `strict_parsing` is enabled for [`Decimal`](#decimal), the number of decimal places has to fit into the type's scale
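As an illustration of the rules above, a schema fragment along these lines (the field names are made up for the example) combines a `null` default on a nullable field with a pattern-conforming explicit default:

```json
[
  {
    "name": "nickname",
    "type": "string",
    "nullable": true,
    "metadata": { "default": null }
  },
  {
    "name": "batchDate",
    "type": "date",
    "nullable": false,
    "metadata": {
      "pattern": "dd.MM.yyyy",
      "default": "01.01.1970"
    }
  }
]
```

If `batchDate` is missing or fails to parse in a row, the output row gets _01.01.1970_ together with an entry in the [error column][errcol], instead of the whole job failing.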
## Notes

[^1]: Value used if nothing is specified in _metadata_

[^2]: No default exists (as not needed)

[^3]: Unless a different default time zone is specified via the `defaultTimestampTimeZone` and `defaultDateTimeZone` application settings

[^4]: While _nanoseconds_ designation is supported on input, it's not supported in storage or further usage. So any value beyond microsecond precision will be truncated.

[^5]: While the decimal and grouping separators and minus sign symbols can be changed, in the [`pattern`](#pattern) the default values *must* be used. E.g. *"."*, *","* and *"-"*.

[test-samples]: https://github.com/AbsaOSS/enceladus/blob/master/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/samples/TestSamples.scala
[oracle-tz-ids]: https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html#getAvailableIDs--
[timestamp-types]: https://docs.google.com/document/d/1gNRww9mZJcHvUDCXklzjFEQGpefsuR_akCDfWsdE35Q/edit#heading=h.n699ftkvhjlo
[oracle-simple-date-format]: https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html
[oracle-decimal-format]: https://docs.oracle.com/javase/7/docs/api/java/text/DecimalFormat.html
[errcol]: {{ docs_path }}/usage/errcol

diff --git a/_docs/3.0.0/usage/spark-jobs-quick-start.md b/_docs/3.0.0/usage/spark-jobs-quick-start.md
new file mode 100644
index 000000000..fa1bbc506
--- /dev/null
+++ b/_docs/3.0.0/usage/spark-jobs-quick-start.md
@@ -0,0 +1,186 @@
---
layout: docs
title: Usage - Spark Job Quick Start
version: '3.0.0'
categories:
  - '3.0.0'
  - 'usage'
redirect_from: /docs/usage/spark-jobs-quick-start
---
{% capture docs_path %}{{ site.baseurl }}/docs/{{ page.version }}{% endcapture %}

## Prerequisites

This quick start guide presumes that you have gone through:

- [Menas Quick Start]({{ docs_path }}/usage/menas-quick-start) guide
- [Data & Data Quality Quick Start]({{ docs_path }}/usage/data-quick-start) guide

## Running with spark-submit

### Running Standardization

```shell
/spark-submit \
--num-executors 2 \
--executor-memory 2G \
--master yarn \
--deploy-mode cluster \
--driver-cores 2 \
--driver-memory 2G \
--class za.co.absa.enceladus.standardization.StandardizationJob \
spark-jobs_.jar \
--menas-auth-keytab \
--dataset-name \
--dataset-version \
--report-date \
--report-version \
--raw-format \
--row-tag
```

where:

- `dataset_name` is the name given to the dataset per the [Menas Quick Start]({{ docs_path }}/usage/menas-quick-start) guide
- `dataset_version` is a version of the dataset to use, which should have the correct schema and all the desired conformance rules
- `report_date` represents the date on which the data landed in HDFS (in raw) and also the suffix part of the data output path. So if in Menas raw was specified as `/path/on/hdfs/raw` and the input parameter `report_date` as `2020-12-24`, then the path where standardization will look for input files will be `/path/on/hdfs/raw/2020/12/24`. For the final part we are missing the report version.
- `report_version` is the final part of the path on HDFS. With `report_date` we finished with `/path/on/hdfs/raw/2020/12/24/v`. This is the location where standardization will look for raw data.
- `raw-format` and its specifics. The raw format tells Standardization which format the data on HDFS is in and what its specifics are. CSV might have a header, XML has a row-tag, etc. Here in the example, we use the `row-tag`.
For more options for different types and run parameters see our [run documentation](run) or just run `--help` + + +### Running Conformance + +```shell +/spark-submit \ +--num-executors 2 \ +--executor-memory 2G \ +--master yarn \ +--deploy-mode cluster \ +--driver-cores 2 \ +--driver-memory 2G \ +--class za.co.absa.enceladus.conformance.DynamicConformanceJob \ +spark-jobs_.jar \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version +``` + +### Running Standardization and Conformance together + +```shell +/spark-submit \ +--num-executors 2 \ +--executor-memory 2G \ +--master yarn \ +--deploy-mode cluster \ +--driver-cores 2 \ +--driver-memory 2G \ +--class za.co.absa.enceladus.standardization_conformance.StandardizationAndConformanceJob \ +spark-jobs_.jar \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +Here, nothing new is added for the quick run. Of course, there might be special options which are all documented in the [run documentation](https://absaoss.github.io/enceladus/docs/3.0.0/usage/run) + +## Running with helper scripts + +If your local DevOps/SysAdmin set up helper scripts for you, then it is even easier. You can omit all the spark options if sensible defaults are provided or Dynamic Resource Allocation is enabled. For more about this ask the people who set up your environment. + +Steps to configure the scripts are as follows (_Linux_/_Windows_): +* Copy all the scripts in `scripts/bash`/`scripts/cmd` directory to a location in your environment. +* Copy `enceladus_env.template.sh`/`enceladus_env.template.cmd` to `enceladus_env.sh`/`enceladus_env.cmd`. +* Change `enceladus_env.sh`/`enceladus_env.cmd` according to your environment settings. +* Use `run_standardization.sh`/`run_standardization.cmd` and `run_conformance.sh`/`run_conformance.cmd` or `run_standardization_conformance.sh`/`run_standardization_conformance.cmd` scripts instead of directly invoking `spark-submit` to run your jobs. + +When scripts are properly set up, then only a few parameters need to be specified. 
+ +### Linux + +The basic command to run Standardization becomes: + +```shell +/run_standardization.sh \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +The basic command to run Conformance becomes: + +```shell +/run_conformance.sh \ +--deploy-mode \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version +``` + +The basic command to run Standardization and Conformance together becomes: + +```shell +/run_standardization_conformance.sh \ +--menas-auth-keytab \ +--dataset-name \ +--dataset-version \ +--report-date \ +--report-version \ +--raw-format \ +--row-tag +``` + +### Windows + +The basic command to run Standardization becomes: + +```cmd +/run_standardization.cmd ^ +--menas-auth-keytab ^ +--dataset-name ^ +--dataset-version ^ +--report-date ^ +--report-version ^ +--raw-format ^ +--row-tag +``` + +The basic command to run Conformance becomes: + +```cmd +/run_conformance.cmd ^ +--deploy-mode ^ +--menas-auth-keytab ^ +--dataset-name ^ +--dataset-version ^ +--report-date ^ +--report-version +``` + +The basic command to run Standardization and Conformance together becomes: + +```cmd +/run_standardization_conformance.cmd ^ +--menas-auth-keytab ^ +--dataset-name ^ +--dataset-version ^ +--report-date ^ +--report-version ^ +--raw-format ^ +--row-tag +``` + + +For more options and arguments check the [run documentation](run) diff --git a/_docs/3.0.0/usage/standardization-formats.md b/_docs/3.0.0/usage/standardization-formats.md new file mode 100644 index 000000000..26b181ae7 --- /dev/null +++ b/_docs/3.0.0/usage/standardization-formats.md @@ -0,0 +1,86 @@ +--- +layout: docs +title: Usage - Standardization Input Formats +version: '3.0.0' +categories: + - '3.0.0' + - usage +redirect_from: /docs/usage/standardization-formats +--- + +Currently, Standardization supports these formats of input files + +- [Cobol](#cobol) (see [Cobrix GitHub](https://github.com/AbsaOSS/cobrix/)) +- [CSV](#csv) (see [rfc4180](https://tools.ietf.org/html/rfc4180)) +- [FixedWidth](#fixed-width) (see `Link to be added repo does not exist yet`) +- [JSON](#json) (see [json.org](https://www.json.org/json-en.html)) +- [Parquet](#parquet) (see [Apache Parquet](https://parquet.apache.org/documentation/latest/)) +- [XML](#xml) (see [xml.com](https://www.xml.com/)) + +When running standardization one of the formats of the list has to be specified. + +```shell +...standardization options... +--format +--format-specific-optionX valueY +``` + +## Cobol + +Cobol `format` value is `cobol`. Format options are + +| Option | Values domain | Description | Default | +|---|---|---|---| +| charset | Any valid charset name | The character set of the input. | `UTF-8` | +| cobol-encoding | `ascii` or `ebcdic` | Specifies encoding of mainframe files | - | +| cobol-is-text | Boolean | Specifies if the mainframe file is ASCII text file | `false` | +| cobol-trimming-policy | `none`, `left`, `right`, `both` | Specify string trimming policy for mainframe files | `none` | +| copybook | String | Path to a copybook for COBOL data format | - | +| is-xcom | Boolean | Does a mainframe file in COBOL format contain XCOM record headers | `false` | + +## CSV + +CSV `format` value is `csv`. Format options are + +| Option | Values domain | Description | Default | +|---|---|---|---| +| charset | Any valid charset names | The character set. | `UTF-8` | +| csv-escape | Any char | Escape character. Escaped quote characters are ignored. 
| `\` |
| csv-quote | Any char | Quote character. Delimiters inside quotes are ignored. | `"` |
| delimiter | Any char or unicode such as `U+00A1` | Delimits the column values on a row | `,` |
| header | Boolean | Specifies if the input data has a CSV-style header | `false` |
| null-value | String | Defines how null values are represented in a `fixed-width` file format | `""` _(empty string)_ |

## Fixed Width

Fixed Width is a custom, in-house format. It requires width metadata; more in [Usage - Schema]({{ docs_path }}/usage/schema#width).

Fixed Width `format` value is `fixed-width`. Format options are

| Option | Values domain | Description | Default |
|---|---|---|---|
| charset | Any valid charset name | The character set. | `UTF-8` |
| empty-values-as-nulls | Boolean | If `true` treats empty values as `null`s | `false` |
| null-value | String | Defines how null values are represented in a `fixed-width` file format | `""` _(empty string)_ |
| trimValues | Boolean | Uses Java's String `.trim` method. Removes whitespace from the left and right ends. Required if the data is to be cast to any numeric type | `false` |

## JSON

JSON `format` value is `json`. Format options are

| Option | Values domain | Description | Default |
|---|---|---|---|
| charset | Any valid charset name | The character set. | `UTF-8` |

## Parquet

Has no extra options. Only `--format parquet`.

## XML

XML `format` value is `xml`. Format options are

| Option | Values domain | Description | Default |
|---|---|---|---|
| charset | Any valid charset name | The character set. | `UTF-8` |
| row-tag | String | The tag of the xml file to treat as a row. For example, in the following xml ` ...`, the appropriate value would be `book`. | - |