diff --git a/docs/setup/configuration.md b/docs/setup/configuration.md index bf19597d..2140c929 100644 --- a/docs/setup/configuration.md +++ b/docs/setup/configuration.md @@ -10,17 +10,17 @@ seen [here](../sample/docker/data/custom/application.conf). Flags are used to control which processes are executed when you run Data Caterer. -| Config | Default | Paid | Description | -|------------------------------|---------|------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| enableGenerateData | true | N | Enable/disable data generation | -| enableCount | true | N | Count the number of records generated. Can be disabled to improve performance | -| enableFailOnError | true | N | Whilst saving generated data, if there is an error, it will stop any further data from being generated | -| enableSaveSinkMetadata | true | N | Enable/disable HTML reports summarising data generated, metadata of data generated (if `enableSinkMetadata` is enabled) and validation results (if `enableValidation` is enabled) | -| enableSinkMetadata | true | N | Run data profiling for the generated data. Shown in HTML reports if `enableSaveSinkMetadata` is enabled | -| enableValidation | false | N | Run validations as described in plan. Results can be viewed from logs or from HTML report if `enableSaveSinkMetadata` is enabled | -| enableGeneratePlanAndTasks | false | Y | Enable/disable plan and task auto generation based off data source connections | -| enableRecordTracking | false | Y | Enable/disable which data records have been generated for any data source | -| enableDeleteGeneratedRecords | false | Y | Delete all generated records based off record tracking (if `enableRecordTracking` has been set to true) | +| Config | Default | Paid | Description | +|--------------------------------|---------|------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `enableGenerateData` | true | N | Enable/disable data generation | +| `enableCount` | true | N | Count the number of records generated. Can be disabled to improve performance | +| `enableFailOnError` | true | N | Whilst saving generated data, if there is an error, it will stop any further data from being generated | +| `enableSaveSinkMetadata` | true | N | Enable/disable HTML reports summarising data generated, metadata of data generated (if `enableSinkMetadata` is enabled) and validation results (if `enableValidation` is enabled) | +| `enableSinkMetadata` | true | N | Run data profiling for the generated data. Shown in HTML reports if `enableSaveSinkMetadata` is enabled | +| `enableValidation` | false | N | Run validations as described in plan. Results can be viewed from logs or from HTML report if `enableSaveSinkMetadata` is enabled | +| `enableGeneratePlanAndTasks` | false | Y | Enable/disable plan and task auto generation based off data source connections | +| `enableRecordTracking` | false | Y | Enable/disable which data records have been generated for any data source | +| `enableDeleteGeneratedRecords` | false | Y | Delete all generated records based off record tracking (if `enableRecordTracking` has been set to true) | ## Folders @@ -29,14 +29,14 @@ records generated. These folder pathways can be defined as a cloud storage pathway (i.e. `s3a://my-bucket/task`). 
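For example, pointing these folders at cloud storage can be set in `application.conf` along the lines of the snippet below; the bucket name is illustrative, and placing `recordTrackingFolderPath` in the same `folders` block as the other paths is an assumption based on the documented folder settings:

```
folders {
  planFilePath = "s3a://my-bucket/data-caterer/plan/customer-create-plan.yaml"
  taskFolderPath = "s3a://my-bucket/data-caterer/task"
  generatedPlanAndTaskFolderPath = "s3a://my-bucket/data-caterer/generated"
  #assumed to sit alongside the other folder paths
  recordTrackingFolderPath = "s3a://my-bucket/data-caterer/record-tracking"
}
```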
-| Config | Default | Paid | Description | -|--------------------------------|-----------------------------------------|------|---------------------------------------------------------------------------------------------------------------------| -| planFilePath | /opt/app/plan/customer-create-plan.yaml | N | Plan file path to use when generating and/or validating data | -| taskFolderPath | /opt/app/task | N | Task folder path that contains all the task files (can have nested directories) | -| validationFolderPath | /opt/app/validation | N | Validation folder path that contains all the validation files (can have nested directories) | -| generatedDataResultsFolderPath | /opt/app/html | N | Where HTML reports get generated that contain information about data generated along with any validations performed | -| generatedPlanAndTaskFolderPath | /tmp | Y | Folder path where generated plan and task files will be saved | -| recordTrackingFolderPath | /opt/app/record-tracking | Y | Where record tracking parquet files get saved | +| Config | Default | Paid | Description | +|----------------------------------|-----------------------------------------|------|---------------------------------------------------------------------------------------------------------------------| +| `planFilePath` | /opt/app/plan/customer-create-plan.yaml | N | Plan file path to use when generating and/or validating data | +| `taskFolderPath` | /opt/app/task | N | Task folder path that contains all the task files (can have nested directories) | +| `validationFolderPath` | /opt/app/validation | N | Validation folder path that contains all the validation files (can have nested directories) | +| `generatedDataResultsFolderPath` | /opt/app/html | N | Where HTML reports get generated that contain information about data generated along with any validations performed | +| `generatedPlanAndTaskFolderPath` | /tmp | Y | Folder path where generated plan and task files will be saved | +| `recordTrackingFolderPath` | /opt/app/record-tracking | Y | Where record tracking parquet files get saved | ## Metadata @@ -51,11 +51,11 @@ You may face issues if the number of records in the data source is large as data Similarly, it can be expensive when analysing the generated data if the number of records generated is large. -| Config | Default | Paid | Description | -|------------------------------------|---------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| numRecordsFromDataSource | 10000 | Y | Number of records read in from the data source | -| numRecordsForAnalysis | 10000 | Y | Number of records used for data profiling from the records gathered in `numRecordsFromDataSource` | -| oneOfDistinctCountVsCountThreshold | 0.1 | Y | Threshold ratio to determine if a field is of type `oneOf` (i.e. a field called `status` that only contains `open` or `closed`. 
Distinct count = 2, total count = 10, ratio = 2 / 10 = 0.2 therefore marked as `oneOf`) | +| Config | Default | Paid | Description | +|--------------------------------------|---------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `numRecordsFromDataSource` | 10000 | Y | Number of records read in from the data source | +| `numRecordsForAnalysis` | 10000 | Y | Number of records used for data profiling from the records gathered in `numRecordsFromDataSource` | +| `oneOfDistinctCountVsCountThreshold` | 0.1 | Y | Threshold ratio to determine if a field is of type `oneOf` (i.e. a field called `status` that only contains `open` or `closed`. Distinct count = 2, total count = 10, ratio = 2 / 10 = 0.2 therefore marked as `oneOf`) | ## Generation @@ -64,6 +64,6 @@ sources prone to failure under load. To help alleviate these issues or speed up performance, you can control the number of records that get generated in each batch. -| Config | Default | Paid | Description | -|--------------------|---------|------|-----------------------------------------------------------------| -| numRecordsPerBatch | 100000 | N | Number of records across all data sources to generate per batch | +| Config | Default | Paid | Description | +|----------------------|---------|------|-----------------------------------------------------------------| +| `numRecordsPerBatch` | 100000 | N | Number of records across all data sources to generate per batch | diff --git a/docs/setup/connection/connection.md b/docs/setup/connection/connection.md index 2fc4c409..8d34004c 100644 --- a/docs/setup/connection/connection.md +++ b/docs/setup/connection/connection.md @@ -22,16 +22,17 @@ All connection details follow the same pattern. } ``` -When defining a configuration value that can be defined by a system property or environment variable at runtime, you can -define that via the following: - -``` -url = "localhost" -url = ${?POSTGRES_URL} -``` - -The above defines that if there is a system property or environment variable named `POSTGRES_URL`, then that value will -be used for the `url`, otherwise, it will default to `localhost`. +!!! info "Overriding configuration" + When defining a configuration value that can be defined by a system property or environment variable at runtime, you can + define that via the following: + + ``` + url = "localhost" + url = ${?POSTGRES_URL} + ``` + + The above defines that if there is a system property or environment variable named `POSTGRES_URL`, then that value will + be used for the `url`, otherwise, it will default to `localhost`. ### Example task per data source @@ -126,46 +127,43 @@ jdbc { } ``` -Ensure that the user has write permission so it is able to save the table to the target tables. -
+Ensure that the user has write permission so that it is able to save data to the target tables.

-```sql
-GRANT INSERT ON <schema>.<table> TO <user>;
-```
+??? tip "SQL Permission Statements"
+
+    ```sql
+    GRANT INSERT ON <schema>.<table> TO <user>;
+    ```
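As a concrete instance of the statement above, using the `account.accounts` table from the sample Docker setup and a hypothetical `data_caterer` user, the grant would look like:

```sql
-- hypothetical user name; substitute your own schema, table and user
GRANT INSERT ON account.accounts TO data_caterer;
```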
#### Postgres

##### Permissions

Following permissions are required when generating plan and tasks:

-```sql
-GRANT SELECT ON information_schema.tables TO < user >;
-GRANT SELECT ON information_schema.columns TO < user >;
-GRANT SELECT ON information_schema.key_column_usage TO < user >;
-GRANT SELECT ON information_schema.table_constraints TO < user >;
-GRANT SELECT ON information_schema.constraint_column_usage TO < user >;
-```
+??? tip "SQL Permission Statements"
+
+    ```sql
+    GRANT SELECT ON information_schema.tables TO < user >;
+    GRANT SELECT ON information_schema.columns TO < user >;
+    GRANT SELECT ON information_schema.key_column_usage TO < user >;
+    GRANT SELECT ON information_schema.table_constraints TO < user >;
+    GRANT SELECT ON information_schema.constraint_column_usage TO < user >;
+    ```
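In the sample Docker environment, a grant like this could be applied through `psql`; the container and database names below follow the sample compose setup, and the `data_caterer` role is hypothetical:

```sql
-- applied via the sample Postgres container, e.g.:
-- docker exec docker-postgres-1 psql -Upostgres -d customer -c "GRANT SELECT ON information_schema.tables TO data_caterer;"
GRANT SELECT ON information_schema.tables TO data_caterer;
```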
#### MySQL

##### Permissions

Following permissions are required when generating plan and tasks:

-```sql
-GRANT SELECT ON information_schema.columns TO < user >;
-GRANT SELECT ON information_schema.statistics TO < user >;
-GRANT SELECT ON information_schema.key_column_usage TO < user >;
-```
+??? tip "SQL Permission Statements"
+
+    ```sql
+    GRANT SELECT ON information_schema.columns TO < user >;
+    GRANT SELECT ON information_schema.statistics TO < user >;
+    GRANT SELECT ON information_schema.key_column_usage TO < user >;
+    ```

### Cassandra

Follows same configuration as defined by the Spark Cassandra Connector as found here.

```
org.apache.spark.sql.cassandra {
    cassandra {
        spark.cassandra.connection.host = "localhost"
        spark.cassandra.connection.host = ${?CASSANDRA_HOST}
        spark.cassandra.connection.port = "9042"
        spark.cassandra.connection.port = ${?CASSANDRA_PORT}
        spark.cassandra.auth.username = "cassandra"
        spark.cassandra.auth.username = ${?CASSANDRA_USERNAME}
        spark.cassandra.auth.password = "cassandra"
        spark.cassandra.auth.password = ${?CASSANDRA_PASSWORD}
    }
}
```

##### Permissions

-Ensure that the user has write permission so it is able to save the table to the target tables.
+Ensure that the user has write permission so that it is able to save data to the target tables.

-```sql
-GRANT INSERT ON <schema>.<table> TO <user>;
-```
+??? tip "CQL Permission Statements"
+
+    ```sql
+    GRANT INSERT ON <schema>.<table> TO <user>;
+    ```

Following permissions are required when generating plan and tasks:
-```sql
-GRANT SELECT ON system_schema.tables TO <user>;
-GRANT SELECT ON system_schema.columns TO <user>;
-```
+??? tip "CQL Permission Statements"
+
+    ```sql
+    GRANT SELECT ON system_schema.tables TO <user>;
+    GRANT SELECT ON system_schema.columns TO <user>;
+    ```

### Kafka

diff --git a/mkdocs.yml b/mkdocs.yml
index 5c6dfdc3..c0a70064 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -40,6 +40,9 @@ markdown_extensions:
   - attr_list
   - def_list
   - md_in_html
+  - admonition
+  - pymdownx.details
+  - pymdownx.superfences
   - pymdownx.emoji:
       emoji_index: !!python/name:materialx.emoji.twemoji
      emoji_generator: !!python/name:materialx.emoji.to_svg
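The three added extensions (`admonition`, `pymdownx.details`, `pymdownx.superfences`) are what enable the collapsible `??? tip` blocks used in the connection docs above. A minimal sketch of that syntax, with an arbitrary title and body:

```markdown
??? tip "Example collapsible note"

    Content indented by four spaces sits inside the block and is
    hidden until the reader expands it.
```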

    "},{"location":"use-case/use-case/#data-debugging","title":"Data debugging","text":"

    When data related issues occur in production, it may be difficult to replicate in a lower or local environment. It could be related to specific fields not containing expected results, size of data is too large or missing corresponding referenced data. This becomes key to resolving the issue as you can directly code against the exact data scenario and have confidence that your code changes will fix the problem. Data Caterer can be used to generate the appropriate data in whichever environment you want to test your changes against.

    "},{"location":"use-case/use-case/#data-profiling","title":"Data profiling","text":"

    When using Data Caterer with the feature flag enableGeneratePlanAndTasks enabled (see here), metadata relating all the fields defined in the data sources you have configured will be generated via data profiling. You can run this as a standalone job (can disable enableGenerateData) so that you can focus on the profile of the data you are utilising. This can be run against your production data sources to ensure the metadata can be used to accurately generate data in other environments. This is a key feature of Data Caterer as no direct production connections need to be maintained to generate data in other environments (which can lead to serious concerns about data security as seen here).

    "},{"location":"use-case/use-case/#schema-gathering","title":"Schema gathering","text":"

    When using Data Caterer with the feature flag enableGeneratePlanAndTasks enabled (see here), all schemas of the data sources defined will be tracked in a common format (as tasks). This data, along with the data profiling metadata, could then feed back into your schema registries to help keep them up to date with your system.

    "},{"location":"use-case/use-case/#comparison-to-similar-tools","title":"Comparison to similar tools","text":"Tool Description Features Pros Cons DBLDatagen Python based data generation tool - Scalable and predictable data generation across data scenarios- Plugin third-party libraries- Generate from existing data- Generate based on combination of other fields - Open source- Good documentation- Customisable and scalable- Generate from existing data/schemas - Limited support if issues- Code required- No clean up- No validation DataCebo Synthetic Data Vault Python based data generation tool with focus on ML generation, evaluating generated data - Create synthetic data using machine learning- Evaluate and visualize synthetic data- Preprocess, anonymize and define constraints Tonic Platform solution for generating data - Integration with many data sources- UI with RBAC- Quality and security checks- Auditing and alerting- Dashboards and reporting Datafaker Realistic data generation library - Generate realistic data- Push to CSV/JSON format- Create your own data providers- Performant Gatling HTTP API load testing tool - Load testing- Validating data and responses- Scenario testing- Reporting- Extensive API support- Integration with CI/CD tools Tricentis - Data integrity Testing tool that focuses on data integrity - Data testing- Pre-screening data- Reconciliation, profiling and report testing- Support SQL DB, noSQL DB, files, API Broadcom - Test data manager Test data provisioning tool with PII detection and reusable datasets - Identify sensitive data- Generate synthetic data- Store and reuse existing data- Create virtual copies of data"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

    Using Data Caterer, you can generate production-like data based on any source/target system, whether it be a CSV file, database table, or something else, anywhere you want the data to be, from a test environment to your local laptop. Just define your data source connections and data will be generated. The generated data can also be manually altered to produce the data or scenarios you want.

    Main features of the data generator include:

    • Metadata discovery
    • Batch or event data generation
    • Maintain referential integrity across any dataset
    • Create custom data generation scenarios
    • Clean up generated data
    • Validate data
    "},{"location":"advanced/advanced/","title":"Advanced use cases","text":""},{"location":"advanced/advanced/#special-data-formats","title":"Special data formats","text":"

    There are many options available for you to use when data has to be in a certain format; a short sketch after the list below shows how they can be applied.

    1. Create a datafaker expression
      1. Can be used to create names, addresses, or anything else that can be found here
    2. Create a regex
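    A rough sketch of both options on illustrative fields. The expression option is documented under the string generator options; the regex generator type and its regex option are an assumption based on the samples, so check the linked examples for the exact keys.

    ```yaml
    fields:
      - name: "customer_name"
        type: "string"
        generator:
          type: "random"
          options:
            # datafaker expression, in the format #{<faker expression name>}
            expression: "#{Name.name}"
      - name: "account_id"
        type: "string"
        generator:
          # assumption: a regex-based generator type with a regex option
          type: "regex"
          options:
            regex: "ACC[0-9]{10}"
    ```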
    "},{"location":"advanced/advanced/#foreign-keys-across-data-sets","title":"Foreign keys across data sets","text":"

    If you have a use case where you require a column's value to match a column in another data set, this can be achieved in the plan definition. For example, if you have the column account_number in a data source named customer-postgres and the column account_id in transaction-cassandra:

    sinkOptions:\n  foreignKeys:\n    #The foreign key name with naming convention [dataSourceName].[taskName].[columnName]\n    \"customer-postgres.accounts.account_number\":\n      #List of columns to match with same naming convention\n      - \"transaction-cassandra.transactions.account_id\"\n

    A sample can be found here. You can define any number of foreign key relationships.
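    A hedged sketch of defining more than one relationship, extending the example above; the additional target data source and column names are purely illustrative:

    ```yaml
    sinkOptions:
      foreignKeys:
        "customer-postgres.accounts.account_number":
          - "transaction-cassandra.transactions.account_id"
          # hypothetical additional target data source
          - "transactions-json.transactions.account_id"
        "customer-postgres.accounts.name":
          # hypothetical second relationship
          - "transaction-cassandra.transactions.full_name"
    ```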

    "},{"location":"advanced/advanced/#edge-cases","title":"Edge cases","text":"

    For each given data type, there are edge cases which can cause issues when your application processes the data. This can be controlled at a column level by including the following flag in the generator options:

    fields:\n  - name: \"amount\"\n    type: \"double\"\n    generator:\n      type: \"random\"\n      options:\n        enableEdgeCases: \"true\"

    If you want to know all the possible edge cases for each data type, you can check the documentation here.

    "},{"location":"advanced/advanced/#scenario-testing","title":"Scenario testing","text":"

    You can create specific scenarios by adjusting the metadata found in the plan and tasks to your liking. For example, say you had two data sources, a Postgres database and a Parquet file, and you wanted to save account data into Postgres and transactions related to those accounts into the Parquet file. You could alter the status column in the account data to only generate open accounts and define a foreign key between Postgres and Parquet to ensure the same account_id is being used. Then, in the Parquet task, define 1 to 10 transactions per account_id to be generated (a rough sketch follows below).

    Postgres account generation example task Parquet transaction generation example task Plan
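    A rough sketch of the Parquet transaction step, using the per-column count options described later in this documentation; the task name, path and column names are illustrative:

    ```yaml
    name: "parquet_transaction_file"
    steps:
      - name: "transactions"
        type: "parquet"
        options:
          path: "/data/parquet/transactions"
        count:
          perColumn:
            columnNames:
              - "account_id"
            generator:
              type: "random"
              options:
                minValue: 1
                maxValue: 10
    ```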

    "},{"location":"advanced/advanced/#storing-plantasks-in-cloud-storage","title":"Storing plan/task(s) in cloud storage","text":"

    You can generate and store the plan/task files inside either AWS S3, Azure Blob Storage or Google GCS. This can be controlled via configuration set in the application.conf file, where you can set something like the following:

    folders {\n  generatedPlanAndTaskFolderPath = \"s3a://my-bucket/data-caterer/generated\"\n  planFilePath = \"s3a://my-bucket/data-caterer/generated/plan/customer-create-plan.yaml\"\n  taskFolderPath = \"s3a://my-bucket/data-caterer/generated/task\"\n}\n\nspark {\n  config {\n    ...\n    #S3\n    \"spark.hadoop.fs.s3a.directory.marker.retention\" = \"keep\"\n    \"spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled\" = \"true\"\n    \"spark.hadoop.fs.defaultFS\" = \"s3a://my-bucket\"\n    #can change to other credential providers as shown here\n    #https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Changing_Authentication_Providers\n    \"spark.hadoop.fs.s3a.aws.credentials.provider\" = \"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider\"\n    \"spark.hadoop.fs.s3a.access.key\" = \"access_key\"\n    \"spark.hadoop.fs.s3a.secret.key\" = \"secret_key\"\n  }\n}\n
    "},{"location":"get-started/docker/","title":"Run Data Caterer","text":""},{"location":"get-started/docker/#docker","title":"Docker","text":""},{"location":"get-started/docker/#quick-start","title":"Quick start","text":"
    git clone git@github.com:pflooky/data-caterer-docs.git\ncd data-caterer-docs/docs/sample/docker\ndocker-compose up -d datacaterer\n

    To run for another data source, you can set DATA_SOURCE like below:

    DATA_SOURCE=postgres docker-compose up -d datacaterer\n

    You can set it to one of the following:

    • postgres
    • mysql
    • cassandra
    • solace
    • kafka
    • http

    If you want to test it out with your own setup, you can alter the corresponding files under docs/sample/docker/data.

    "},{"location":"get-started/docker/#report","title":"Report","text":"

    Check the report generated under docs/sample/docker/data/custom/report/index.html

    "},{"location":"get-started/docker/#run-with-multiple-sub-data-sources","title":"Run with multiple sub data sources","text":"

    In the context of Postgres data sources, tables are sub data sources that data can be generated for.

    Try to run the following command:

    PLAN=plan/postgres-multiple-tables docker-compose up -d datacaterer\n

    "},{"location":"get-started/docker/#run-with-multiple-data-sources","title":"Run with multiple data sources","text":""},{"location":"get-started/docker/#postgres-and-csv-file","title":"Postgres and CSV File","text":"
    PLAN=plan/scenario-based docker-compose up -d datacaterer\nhead data/custom/csv/transactions/part-00000*\nsample_account=$(head -1 data/custom/csv/transactions/part-00000* | awk -F \",\" '{print $1}')\ndocker exec docker-postgres-1 psql -Upostgres -d customer -c \"SELECT * FROM account.accounts WHERE account_number='$sample_account'\"\n

    You should be able to see the linked data between Postgres and the CSV file, along with 1 to 10 records per account_id and name combination in the CSV file.

    "},{"location":"get-started/docker/#run-with-custom-data-sources","title":"Run with custom data sources","text":"
    1. Create/alter plan under data/custom/plan
    2. Create/alter tasks under data/custom/task
      1. Define your schemas and generator configurations such as record count
    3. Create/alter application configuration data/custom/application.conf
      1. This is where you define your connection properties and other flags/configurations
    DATA_SOURCE=<data source name> docker-compose up -d datacaterer\n
    "},{"location":"get-started/docker/#generate-plan-and-tasks","title":"Generate plan and tasks","text":"
    APPLICATION_CONFIG_PATH=/opt/app/custom/application-dvd.conf ENABLE_GENERATE_DATA=false ENABLE_GENERATE_PLAN_AND_TASKS=true DATA_SOURCE=postgresdvd docker-compose up -d datacaterer\ncat data/custom/generated/plan/plan_*\n
    "},{"location":"get-started/docker/#generate-data-with-record-tracking","title":"Generate data with record tracking","text":"
    APPLICATION_CONFIG_PATH=/opt/app/custom/application-dvd.conf ENABLE_GENERATE_DATA=true ENABLE_GENERATE_PLAN_AND_TASKS=false ENABLE_RECORD_TRACKING=true DATA_SOURCE=postgresdvd PLAN=generated/plan/$(ls data/custom/generated/plan/ | grep plan | head -1 | awk -F \" \" '{print $NF}' | sed 's/\\.yaml//g') docker-compose up -d datacaterer\n
    "},{"location":"get-started/docker/#delete-the-generated-data","title":"Delete the generated data","text":"
    APPLICATION_CONFIG_PATH=/opt/app/custom/application-dvd.conf ENABLE_GENERATE_DATA=false ENABLE_GENERATE_PLAN_AND_TASKS=false ENABLE_DELETE_GENERATED_RECORDS=true DATA_SOURCE=postgresdvd PLAN=generated/plan/$(ls data/custom/generated/plan/ | grep plan | head -1 | awk -F \" \" '{print $NF}' | sed 's/\\.yaml//g') docker-compose up -d datacaterer\n
    "},{"location":"get-started/docker/#helm","title":"Helm","text":"

    Link to sample helm on GitHub here

    Update the configuration to your own data connections and configuration.

    git clone git@github.com:pflooky/data-caterer-docs.git\nhelm install data-caterer ./data-caterer-docs/helm/data-caterer\n
    "},{"location":"roadmap/roadmap/","title":"Roadmap","text":"
    • Support for other data sources
      • GCP and Azure related data services, Delta, RabbitMQ, ActiveMQ
    • Metadata discovery for HTTP, JMS
    • API for developers and testers
    • UI for metadata and data generation
    • Report for data generated and validation rules
    • Metadata stored in database
    • Integration with existing metadata services (i.e. amundsen, datahub)
    • Data dictionary
      • Business definitions
    • Verification rules after data generation
    • Alerting
    • Overriding tasks
      • Can customise tasks without copying whole schema definitions
      • Easier to create scenarios
    "},{"location":"sample/","title":"Samples","text":"

    Below are examples of different types of plans and tasks that can be helpful when trying to create your own. You can use these as a template or to search for something related to your particular use case.

    "},{"location":"sample/#base-concept","title":"Base Concept","text":"

    The execution of the data generator is based on the concept of plans and tasks. A plan represents the set of tasks that need to be executed, along with other information that spans across tasks, such as foreign keys between data sources. A task represents the component(s) of a data source and its associated metadata so that it understands what the data should look like and how many steps (sub data sources) there are (i.e. tables in a database, topics in Kafka). Tasks can define one or more steps.
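    A minimal sketch of how a plan ties tasks to data sources; the task and data source names here are illustrative, so see the linked sample plans for the exact structure:

    ```yaml
    name: "customer_create_plan"
    description: "Create accounts in Postgres and related transactions in CSV"
    tasks:
      - name: "postgres_account_task"
        dataSourceName: "customer-postgres"
      - name: "csv_transaction_task"
        dataSourceName: "transaction-csv"
    ```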

    "},{"location":"sample/#plan","title":"Plan","text":""},{"location":"sample/#foreign-keys","title":"Foreign Keys","text":"

    Define foreign keys across data sources in your plan to ensure generated data can match. Link to associated task 1. Link to associated task 2.

    "},{"location":"sample/#task","title":"Task","text":"Data Source Type Data Source Sample Task Notes Database Postgres Sample Database MySQL Sample Database Cassandra Sample File CSV Sample File JSON Sample Contains nested schemas and use of SQL for generated values File Parquet Sample Partition by year column Kafka Kafka Sample Specific base schema to be used, define headers, key, value, etc. JMS Solace Sample JSON formatted message HTTP PUT Sample JSON formatted PUT body"},{"location":"sample/#configuration","title":"Configuration","text":"

    Basic configuration

    "},{"location":"sample/docker/","title":"Data Caterer - Docker Compose","text":"

    If you want to try out Data Caterer generating data for various data sources, you can use the following docker-compose file.

    All you need to do is define which data source you want to run with via a command like below:

    DATA_SOURCE=postgres docker-compose up -d datacaterer\n

    You can change DATA_SOURCE to one of the following: - postgres - mysql - cassandra - solace - kafka - http

    "},{"location":"setup/","title":"Setup","text":"

    All the configurations and customisation related to Data Caterer can be found under here.

    • Configurations - Configurations relating to feature flags, folder pathways, metadata analysis
    • Connections - Explore the data source connections available
    • Generators - Choose and configure the type of generator you want used for fields
    • Validations - How to validate data to ensure your system is performing as expected
    "},{"location":"setup/configuration/","title":"Configuration","text":"

    A number of configurations can be made and customised within Data Caterer to help control what gets run and/or where any metadata gets saved.

    These configurations are defined from within your application.conf file as seen here.

    "},{"location":"setup/configuration/#flags","title":"Flags","text":"

    Flags are used to control which processes are executed when you run Data Caterer.

    Config Default Paid Description enableGenerateData true N Enable/disable data generation enableCount true N Count the number of records generated. Can be disabled to improve performance enableFailOnError true N Whilst saving generated data, if there is an error, it will stop any further data from being generated enableSaveSinkMetadata true N Enable/disable HTML reports summarising data generated, metadata of data generated (if enableSinkMetadata is enabled) and validation results (if enableValidation is enabled) enableSinkMetadata true N Run data profiling for the generated data. Shown in HTML reports if enableSaveSinkMetadata is enabled enableValidation false N Run validations as described in plan. Results can be viewed from logs or from HTML report if enableSaveSinkMetadata is enabled enableGeneratePlanAndTasks false Y Enable/disable plan and task auto generation based off data source connections enableRecordTracking false Y Enable/disable which data records have been generated for any data source enableDeleteGeneratedRecords false Y Delete all generated records based off record tracking (if enableRecordTracking has been set to true)"},{"location":"setup/configuration/#folders","title":"Folders","text":"

    Depending on which flags are enabled, there are folders that get used to save metadata, store HTML reports or track the records generated.

    These folder pathways can be defined as a cloud storage pathway (i.e. s3a://my-bucket/task).

    Config Default Paid Description planFilePath /opt/app/plan/customer-create-plan.yaml N Plan file path to use when generating and/or validating data taskFolderPath /opt/app/task N Task folder path that contains all the task files (can have nested directories) validationFolderPath /opt/app/validation N Validation folder path that contains all the validation files (can have nested directories) generatedDataResultsFolderPath /opt/app/html N Where HTML reports get generated that contain information about data generated along with any validations performed generatedPlanAndTaskFolderPath /tmp Y Folder path where generated plan and task files will be saved recordTrackingFolderPath /opt/app/record-tracking Y Where record tracking parquet files get saved"},{"location":"setup/configuration/#metadata","title":"Metadata","text":"

    When metadata gets generated, there are some configurations that can be altered to help with performance or accuracy-related issues. Metadata gets generated from two processes: 1) when enableGeneratePlanAndTasks is enabled or 2) when enableSinkMetadata is enabled.
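    A hedged sketch of toggling these processes in application.conf. The flag names come from the flags table; grouping them under a flags block is an assumption based on the sample configuration, and the environment variable overrides mirror the ones used in the Docker examples:

    ```hocon
    flags {
      # generate metadata (plan/tasks) from existing data source connections
      enableGeneratePlanAndTasks = true
      enableGeneratePlanAndTasks = ${?ENABLE_GENERATE_PLAN_AND_TASKS}
      # profile the generated data for the HTML report
      enableSinkMetadata = true
      # optionally turn off data generation to run profiling only
      enableGenerateData = true
      enableGenerateData = ${?ENABLE_GENERATE_DATA}
    }
    ```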

    During the generation of plan and tasks, data profiling is used to create the metadata for each of the fields defined in the data source. You may face issues if the number of records in the data source is large as data profiling is an expensive task. Similarly, it can be expensive when analysing the generated data if the number of records generated is large.

    Config Default Paid Description numRecordsFromDataSource 10000 Y Number of records read in from the data source numRecordsForAnalysis 10000 Y Number of records used for data profiling from the records gathered in numRecordsFromDataSource oneOfDistinctCountVsCountThreshold 0.1 Y Threshold ratio to determine if a field is of type oneOf (i.e. a field called status that only contains open or closed. Distinct count = 2, total count = 10, ratio = 2 / 10 = 0.2 therefore marked as oneOf)"},{"location":"setup/configuration/#generation","title":"Generation","text":"

    When generating data, you may have some limitations such as limited CPU or memory, a large number of data sources, or data sources prone to failure under load. To help alleviate these issues or speed up performance, you can control the number of records that get generated in each batch.
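    A hedged sketch of tuning this in application.conf; the config name comes from the table below, but the generation block name is an assumption, so check the sample configuration for the exact grouping:

    ```hocon
    generation {
      # lower this if data sources struggle under load; raise it if you have spare memory
      numRecordsPerBatch = 100000
    }
    ```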

    Config Default Paid Description numRecordsPerBatch 100000 N Number of records across all data sources to generate per batch"},{"location":"setup/connection/connection/","title":"Data Source Connections","text":"

    Details of all the supported connection configuration can be found in the subsections below for each type of connection.

    "},{"location":"setup/connection/connection/#supported-data-connections","title":"Supported Data Connections","text":"Data Source Type Data Source Database Postgres, MySQL, Cassandra File CSV, JSON, ORC, Parquet Kafka Kafka JMS Solace HTTP GET, PUT, POST, DELETE, PATCH, HEAD, TRACE, OPTIONS

    All connection details follow the same pattern.

    <connection format> {\n    <connection name> {\n        <key> = <value>\n    }\n}\n

    Overriding configuration

    When a configuration value can be set by a system property or environment variable at runtime, you can define it via the following:

    url = \"localhost\"\nurl = ${?POSTGRES_URL}\n

    The above defines that if there is a system property or environment variable named POSTGRES_URL, then that value will be used for the url; otherwise, it will default to localhost.

    "},{"location":"setup/connection/connection/#example-task-per-data-source","title":"Example task per data source","text":"

    To find examples of a task for each type of data source, please check out this page.

    "},{"location":"setup/connection/connection/#file","title":"File","text":"

    Linked here is a list of generic options that can be included as part of your file data source configuration if required. Links to specific file type configurations can be found below.

    "},{"location":"setup/connection/connection/#csv","title":"CSV","text":"
    csv {\n  customer_transactions {\n    path = \"/data/customer/transaction\"\n    path = ${?CSV_PATH}\n  }\n}\n

    Other available configuration for CSV can be found here

    "},{"location":"setup/connection/connection/#json","title":"JSON","text":"
    json {\n  customer_transactions {\n    path = \"/data/customer/transaction\"\n    path = ${?JSON_PATH}\n  }\n}\n

    Other available configuration for JSON can be found here

    "},{"location":"setup/connection/connection/#orc","title":"ORC","text":"
    orc {\n  customer_transactions {\n    path = \"/data/customer/transaction\"\n    path = ${?ORC_PATH}\n  }\n}\n

    Other available configuration for ORC can be found here

    "},{"location":"setup/connection/connection/#parquet","title":"Parquet","text":"
    parquet {\n  customer_transactions {\n    path = \"/data/customer/transaction\"\n    path = ${?PARQUET_PATH}\n  }\n}\n

    Other available configuration for Parquet can be found here

    "},{"location":"setup/connection/connection/#delta-not-supported-yet","title":"Delta (not supported yet)","text":"
    delta {\n  customer_transactions {\n    path = \"/data/customer/transaction\"\n    path = ${?DELTA_PATH}\n  }\n}\n
    "},{"location":"setup/connection/connection/#jdbc","title":"JDBC","text":"

    Follows the same configuration used by Spark as found here. A sample can be found below.

    jdbc {\n    postgres {\n        url = \"jdbc:postgresql://localhost:5432/customer\"\n        url = ${?POSTGRES_URL}\n        user = \"postgres\"\n        user = ${?POSTGRES_USERNAME}\n        password = \"postgres\"\n        password = ${?POSTGRES_PASSWORD}\n        driver = \"org.postgresql.Driver\"\n    }\n}\n

    Ensure that the user has write permission, so it is able to save data to the target tables.

    SQL Permission Statements
    GRANT INSERT ON <schema>.<table> TO <user>;\n
    "},{"location":"setup/connection/connection/#postgres","title":"Postgres","text":""},{"location":"setup/connection/connection/#permissions","title":"Permissions","text":"

    The following permissions are required when generating plan and tasks:

    SQL Permission Statements
    GRANT SELECT ON information_schema.tables TO < user >;\nGRANT SELECT ON information_schema.columns TO < user >;\nGRANT SELECT ON information_schema.key_column_usage TO < user >;\nGRANT SELECT ON information_schema.table_constraints TO < user >;\nGRANT SELECT ON information_schema.constraint_column_usage TO < user >;\n
    "},{"location":"setup/connection/connection/#mysql","title":"MySQL","text":""},{"location":"setup/connection/connection/#permissions_1","title":"Permissions","text":"

    The following permissions are required when generating plan and tasks:

    SQL Permission Statements
    GRANT SELECT ON information_schema.columns TO < user >;\nGRANT SELECT ON information_schema.statistics TO < user >;\nGRANT SELECT ON information_schema.key_column_usage TO < user >;\n
    "},{"location":"setup/connection/connection/#cassandra","title":"Cassandra","text":"

    Follows the same configuration as defined by the Spark Cassandra Connector as found here.

    org.apache.spark.sql.cassandra {\n    cassandra {\n        spark.cassandra.connection.host = \"localhost\"\n        spark.cassandra.connection.host = ${?CASSANDRA_HOST}\n        spark.cassandra.connection.port = \"9042\"\n        spark.cassandra.connection.port = ${?CASSANDRA_PORT}\n        spark.cassandra.auth.username = \"cassandra\"\n        spark.cassandra.auth.username = ${?CASSANDRA_USERNAME}\n        spark.cassandra.auth.password = \"cassandra\"\n        spark.cassandra.auth.password = ${?CASSANDRA_PASSWORD}\n    }\n}\n
    "},{"location":"setup/connection/connection/#permissions_2","title":"Permissions","text":"

    Ensure that the user has write permission, so it is able to save data to the target tables.

    CQL Permission Statements
    GRANT INSERT ON <schema>.<table> TO <user>;\n

    The following permissions are required when generating plan and tasks:

    CQL Permission Statements
    GRANT SELECT ON system_schema.tables TO <user>;\nGRANT SELECT ON system_schema.columns TO <user>;\n
    "},{"location":"setup/connection/connection/#kafka","title":"Kafka","text":"

    Define your Kafka bootstrap server to connect and send generated data to the corresponding topics. The topic gets set at a step level. Further details can be found here.

    kafka {\n    kafka {\n        kafka.bootstrap.servers = \"localhost:9092\"\n        kafka.bootstrap.servers = ${?KAFKA_BOOTSTRAP_SERVERS}\n    }\n}\n

    When defining your schema for pushing data to Kafka, it follows a specific top level schema. An example can be found here. You can define the key, value, headers, partition or topic by following the linked schema.

    "},{"location":"setup/connection/connection/#jms","title":"JMS","text":"

    Uses JNDI lookup to send messages to a JMS queue. Ensure that the messaging system you are using has your queue/topic registered via JNDI, otherwise a connection cannot be created.

    jms {\n    solace {\n        initialContextFactory = \"com.solacesystems.jndi.SolJNDIInitialContextFactory\"\n        connectionFactory = \"/jms/cf/default\"\n        url = \"smf://localhost:55555\"\n        url = ${?SOLACE_URL}\n        user = \"admin\"\n        user = ${?SOLACE_USER}\n        password = \"admin\"\n        password = ${?SOLACE_PASSWORD}\n        vpnName = \"default\"\n        vpnName = ${?SOLACE_VPN}\n    }\n}\n
    "},{"location":"setup/connection/connection/#http","title":"HTTP","text":"

    Define any username and/or password needed for the HTTP requests. The url is defined in the tasks to allow for generated data to be populated in the url.

    http {\n    customer_api {\n        user = \"admin\"\n        user = ${?HTTP_USER}\n        password = \"admin\"\n        password = ${?HTTP_PASSWORD}\n    }\n}\n
    "},{"location":"setup/generator/count/","title":"Record Count","text":"

    There are options for controlling the number of records generated, which can help in generating the scenarios or data required.

    "},{"location":"setup/generator/count/#total-count","title":"Total Count","text":"

    Total count is the simplest, as you define the total number of records you require for that particular step. For example, the step below will generate 1000 records for the CSV file.

    name: \"csv_file\"\nsteps:\n- name: \"transactions\"\ntype: \"csv\"\noptions:\npath: \"app/src/test/resources/sample/csv/transactions\"\ncount:\ntotal: 1000\n
    "},{"location":"setup/generator/count/#generated-count","title":"Generated Count","text":"

    Like most things in data-caterer, the count can be generated based on some metadata. For example, if you wanted to generate between 1000 and 2000 records, you could define that with the configuration below:

    name: \"csv_file\"\nsteps:\n- name: \"transactions\"\ntype: \"csv\"\noptions:\npath: \"app/src/test/resources/sample/csv/transactions\"\ncount:\ngenerator:\ntype: \"random\"\noptions:\nmin: 1000\nmax: 2000\n
    "},{"location":"setup/generator/count/#per-column-count","title":"Per Column Count","text":"

    When defining a per column count, this allows you to generate records \"per set of columns\". This means that for a given set of columns, it will generate a particular number of records per combination of values for those columns.

    One example of this would be when generating transactions relating to a customer. A customer may be defined by the columns account_id and name. A number of transactions would be generated per account_id, name combination.

    You can also use a combination of the above two methods to generate the number of records per column.

    "},{"location":"setup/generator/count/#total","title":"Total","text":"

    When defining a total count within the perColumn configuration, it translates to only creating (count.total * count.perColumn.total) records. This is a fixed number of records that will be generated each time, with no variation between runs.

    In the example below, we have count.total = 1000 and count.perColumn.total = 2, which means that 1000 * 2 = 2000 records will be generated for this CSV file every time data gets generated.

    name: \"csv_file\"\nsteps:\n- name: \"transactions\"\ntype: \"csv\"\noptions:\npath: \"app/src/test/resources/sample/csv/transactions\"\ncount:\ntotal: 1000\nperColumn:\ntotal: 2\ncolumnNames:\n- \"account_id\"\n- \"name\"\n
    "},{"location":"setup/generator/count/#generated","title":"Generated","text":"

    You can also define a generator for the count per column. This can be used in scenarios where you want a variable number of records per set of columns.

    In the example below, it will generate between (count.total * count.perColumn.generator.options.minValue) = (1000 * 1) = 1000 and (count.total * count.perColumn.generator.options.maxValue) = (1000 * 2) = 2000 records.

    name: \"csv_file\"\nsteps:\n- name: \"transactions\"\ntype: \"csv\"\noptions:\npath: \"app/src/test/resources/sample/csv/transactions\"\ncount:\ntotal: 1000\nperColumn:\ncolumnNames:\n- \"account_id\"\n- \"name\"\ngenerator:\ntype: \"random\"\noptions:\nmaxValue: 2\nminValue: 1\n
    "},{"location":"setup/generator/generator/","title":"Data Generators","text":""},{"location":"setup/generator/generator/#data-types","title":"Data Types","text":"

    Below is a list of all supported data types for generating data:

    Data Type Spark Data Type Options Description string StringType minLen, maxLen, expression, enableNull integer IntegerType min, minValue, max, maxValue long LongType min, minValue, max, maxValue short ShortType min, minValue, max, maxValue decimal(precision, scale) DecimalType(precision, scale) min, minValue, max, maxValue double DoubleType min, minValue, max, maxValue float FloatType min, minValue, max, maxValue date DateType min, max, enableNull timestamp TimestampType min, max, enableNull boolean BooleanType binary BinaryType minLen, maxLen, enableNull byte ByteType array ArrayType listMinLen, listMaxLen _ StructType Implicitly supported when a schema is defined for a field"},{"location":"setup/generator/generator/#options","title":"Options","text":""},{"location":"setup/generator/generator/#all-data-types","title":"All data types","text":"

    Some options are available to use for all types of data generators. Below is the list, along with examples and descriptions:

    Option Default Example Description enableEdgeCases false enableEdgeCases: \"true\" Enable/disable generated data to contain edge cases based on the data type. For example, integer data type has edge cases of (Int.MaxValue, Int.MinValue and 0) isUnique false isUnique: \"true\" Enable/disable generated data to be unique for that column. Errors will be thrown when it is unable to generate unique data seed seed: \"1\" Defines the random seed for generating data for that particular column. It will override any seed defined at a global level sql sql: \"CASE WHEN amount < 10 THEN true ELSE false END\" Define any SQL statement for generating that column's value. Computation occurs after all non-SQL fields are generated. This means any columns used in the SQL cannot be based on other SQL-generated columns. The data type of the generated value from SQL needs to match the data type defined for the field"},{"location":"setup/generator/generator/#string","title":"String","text":"Option Default Example Description minLen 1 minLen: \"2\" Ensures that all generated strings have at least length minLen maxLen 10 maxLen: \"15\" Ensures that all generated strings have at most length maxLen expression expression: \"#{Name.name}\" expression:\"#{Address.city}/#{Demographic.maritalStatus}\" Will generate a string based on the faker expression provided. All possible faker expressions can be found here. The expression has to be in the format #{<faker expression name>} enableNull false enableNull: \"true\" Enable/disable null values being generated

    Edge cases: (\"\", \"\\n\", \"\\r\", \"\\t\", \" \", \"\\u0000\", \"\\ufff\")

    "},{"location":"setup/generator/generator/#numeric","title":"Numeric","text":"

    For all the numeric data types, there are 4 options to choose from: min, minValue, max and maxValue. Generally speaking, you only need to define one of min or minValue, and similarly one of max or maxValue. The reason there are 2 options for each is that, when metadata is automatically gathered, we gather the statistics of the observed min and max values. It will also attempt to gather any restriction on the min or max value as defined by the data source (i.e. max value as per database type).
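    For example, a double field constrained to a range could look like the sketch below (the field name and bounds are illustrative):

    ```yaml
    fields:
      - name: "amount"
        type: "double"
        generator:
          type: "random"
          options:
            min: "1.0"
            max: "999.99"
    ```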

    "},{"location":"setup/generator/generator/#integerlongshortdecimal","title":"Integer/Long/Short/Decimal","text":"Option Default Example Description minValue 0 minValue: \"2\" Ensures that all generated values are greater than or equal to minValue min 0 min: \"2\" Ensures that all generated values are greater than or equal to min. If minValue is defined, minValue will define the lowest possible generated value maxValue 1000 maxValue: \"25\" Ensures that all generated values are less than or equal to maxValue max 1000 max: \"25\" Ensures that all generated values are less than or equal to maxValue. If maxValue is defined, maxValue will define the largest possible generated value

    Edge cases Integer: (2147483647, -2147483648, 0) Edge cases Long/Decimal: (9223372036854775807, -9223372036854775808, 0) Edge cases Short: (32767, -32768, 0)

    "},{"location":"setup/generator/generator/#doublefloat","title":"Double/Float","text":"Option Default Example Description minValue 0.0 minValue: \"2.1\" Ensures that all generated values are greater than or equal to minValue min 0.0 min: \"2.1\" Ensures that all generated values are greater than or equal to min. If minValue is defined, minValue will define the lowest possible generated value maxValue 1000.0 maxValue: \"25.9\" Ensures that all generated values are less than or equal to maxValue max 1000.0 max: \"25.9\" Ensures that all generated values are less than or equal to maxValue. If maxValue is defined, maxValue will define the largest possible generated value

    Edge cases Double: (+infinity, 1.7976931348623157e+308, 4.9e-324, 0.0, -0.0, -1.7976931348623157e+308, -infinity, NaN) Edge cases Float: (+infinity, 3.4028235e+38, 1.4e-45, 0.0, -0.0, -3.4028235e+38, -infinity, NaN)

    "},{"location":"setup/generator/generator/#date","title":"Date","text":"Option Default Example Description min now() - 365 days min: \"2023-01-31\" Ensures that all generated values are greater than or equal to min max now() max: \"2023-12-31\" Ensures that all generated values are less than or equal to max enableNull false enableNull: \"true\" Enable/disable null values being generated

    Edge cases: (0001-01-01, 1582-10-15, 1970-01-01, 9999-12-31) (reference)

    "},{"location":"setup/generator/generator/#timestamp","title":"Timestamp","text":"Option Default Example Description min now() - 365 days min: \"2023-01-31 23:10:10\" Ensures that all generated values are greater than or equal to min max now() max: \"2023-12-31 23:10:10\" Ensures that all generated values are less than or equal to max enableNull false enableNull: \"true\" Enable/disable null values being generated

    Edge cases: (0001-01-01 00:00:00, 1582-10-15 23:59:59, 1970-01-01 00:00:00, 9999-12-31 23:59:59)

    "},{"location":"setup/generator/generator/#binary","title":"Binary","text":"Option Default Example Description minLen 1 minLen: \"2\" Ensures that all generated array of bytes have at least length minLen maxLen 20 maxLen: \"15\" Ensures that all generated array of bytes have at most length maxLen enableNull false enableNull: \"true\" Enable/disable null values being generated

    Edge cases: (\"\", \"\\n\", \"\\r\", \"\\t\", \" \", \"\\u0000\", \"\\ufff\", -128, 127)

    "},{"location":"setup/generator/generator/#list","title":"List","text":"Option Default Example Description listMinLen 0 listMinLen: \"2\" Ensures that all generated lists have at least length listMinLen listMaxLen 5 listMaxLen: \"15\" Ensures that all generated lists have at most length listMaxLen enableNull false enableNull: \"true\" Enable/disable null values being generated"},{"location":"setup/validation/validation/","title":"Validations","text":"

    Validations can be used to run data checks after you have run the data generator, or even as a standalone task. A report summarising the success or failure of the validations is produced and can be examined for further investigation.

    "},{"location":"setup/validation/validation/#sample","title":"Sample","text":"
    ---\nname: \"account_checks\"\ndescription: \"Check account related fields have gone through system correctly\"\ndataSources:\naccountJson:\noptions:\npath: \"sample/json/txn-gen\"\nvalidations:\n- expr: \"amount < 100\"\n- expr: \"year == 2021\"\nerrorThreshold: 0.1\n- expr: \"regexp_like(name, 'Peter .*')\"\nerrorThreshold: 200\n
    "},{"location":"use-case/business-value/","title":"Business Value","text":"

    Below is a list of the business-related benefits of using Data Caterer which may be applicable to your use case.

    Problem Data Caterer Solution Resources Effects Reliable test data creation - Profile existing data- Create scenarios- Generate data Software Engineers, QA, Testers Cost reduction in labor, more time spent on development, more bugs caught before production Faster development cycles - Generate data in local, test, UAT, pre-prod- Run different scenarios Software Engineers, QA, Testers More defects caught in lower environments, features pushed to production faster, common framework used across all environments Data compliance - Profiling existing data- Generate based on metadata- No complex masking- No production data used in lower environments Audit and compliance No chance for production data breaches Storage costs - Delete generated data- Test specific scenarios Infrastructure Lower data storage costs, less time spent on data management and clean up Schema evolution - Create metadata from data sources- Generate data based off fresh metadata Software Engineers, QA, Testers Less time spent altering tests due to schema changes, ease of use between environments and application versions"},{"location":"use-case/use-case/","title":"Use cases","text":""},{"location":"use-case/use-case/#replicate-production-in-lower-environment","title":"Replicate production in lower environment","text":"

    Having a stable and reliable test environment is a challenge for a number of companies, especially where teams are asynchronously deploying and testing changes at faster rates. Data Caterer can help alleviate these issues by doing the following:

    1. Generate data with the latest schema changes and production-like field values
    2. Run as a job on a daily/regular basis to replicate production traffic or data flows
    3. Validate data to ensure your system runs as expected
    4. Clean up data to avoid a build up of generated data

    "},{"location":"use-case/use-case/#local-development","title":"Local development","text":"

    Similar to the above, being able to replicate production-like data in your local environment can be key to developing more reliable code, as you can test directly against data on your local computer. This has a number of benefits, including:

    1. Fewer assumptions or ambiguities when the developer codes
    2. Direct feedback loop in local computer rather than waiting for test environment for more reliable test data
    3. No domain expertise required to understand the data
    4. Easy for new developers to be onboarded and developing/testing code for jobs/services
    "},{"location":"use-case/use-case/#systemintegration-testing","title":"System/integration testing","text":"

    When working with third-party, external or internal data providers, it can be difficult to have everything set up and ready to produce reliable data that abides by relationship contracts between each of the systems. You have to rely on these data providers in order for you to run your tests, which may not align with their priorities. With Data Caterer, you can generate the same data that they would produce, along with maintaining referential integrity across the data providers, so that you can run your tests without relying on their systems being up and reliable in their corresponding lower environments.

    "},{"location":"use-case/use-case/#scenario-testing","title":"Scenario testing","text":"

    If you want to set up particular data scenarios, you can customise the generated data to fit your scenario. Once the data gets generated and is consumed, you can also run validations to ensure your system has consumed the data correctly. These scenarios can be put together from existing tasks, or data sources can be enabled/disabled based on your requirements. Built into Data Caterer and controlled via feature flags is the ability to test edge cases based on the data type of the fields used for data generation (the enableEdgeCases flag within <field>.generator.options, see more here).

    "},{"location":"use-case/use-case/#data-debugging","title":"Data debugging","text":"

    When data-related issues occur in production, it may be difficult to replicate them in a lower or local environment. They could be related to specific fields not containing expected results, the size of the data being too large, or missing corresponding referenced data. Being able to replicate the data becomes key to resolving the issue, as you can directly code against the exact data scenario and have confidence that your code changes will fix the problem. Data Caterer can be used to generate the appropriate data in whichever environment you want to test your changes against.

    "},{"location":"use-case/use-case/#data-profiling","title":"Data profiling","text":"

    When using Data Caterer with the feature flag enableGeneratePlanAndTasks enabled (see here), metadata relating to all the fields defined in the data sources you have configured will be generated via data profiling. You can run this as a standalone job (you can disable enableGenerateData) so that you can focus on the profile of the data you are utilising. This can be run against your production data sources to ensure the metadata can be used to accurately generate data in other environments. This is a key feature of Data Caterer as no direct production connections need to be maintained to generate data in other environments (which can lead to serious concerns about data security as seen here).

    "},{"location":"use-case/use-case/#schema-gathering","title":"Schema gathering","text":"

    When using Data Caterer with the feature flag enableGeneratePlanAndTasks enabled (see here), all schemas of the data sources defined will be tracked in a common format (as tasks). This data, along with the data profiling metadata, could then feed back into your schema registries to help keep them up to date with your system.

    "},{"location":"use-case/use-case/#comparison-to-similar-tools","title":"Comparison to similar tools","text":"Tool Description Features Pros Cons DBLDatagen Python based data generation tool - Scalable and predictable data generation across data scenarios- Plugin third-party libraries- Generate from existing data- Generate based on combination of other fields - Open source- Good documentation- Customisable and scalable- Generate from existing data/schemas - Limited support if issues- Code required- No clean up- No validation DataCebo Synthetic Data Vault Python based data generation tool with focus on ML generation, evaluating generated data - Create synthetic data using machine learning- Evaluate and visualize synthetic data- Preprocess, anonymize and define constraints Tonic Platform solution for generating data - Integration with many data sources- UI with RBAC- Quality and security checks- Auditing and alerting- Dashboards and reporting Datafaker Realistic data generation library - Generate realistic data- Push to CSV/JSON format- Create your own data providers- Performant Gatling HTTP API load testing tool - Load testing- Validating data and responses- Scenario testing- Reporting- Extensive API support- Integration with CI/CD tools Tricentis - Data integrity Testing tool that focuses on data integrity - Data testing- Pre-screening data- Reconciliation, profiling and report testing- Support SQL DB, noSQL DB, files, API Broadcom - Test data manager Test data provisioning tool with PII detection and reusable datasets - Identify sensitive data- Generate synthetic data- Store and reuse existing data- Create virtual copies of data"}]} \ No newline at end of file diff --git a/site/setup/configuration/index.html b/site/setup/configuration/index.html index 3cf9d2bb..34113331 100644 --- a/site/setup/configuration/index.html +++ b/site/setup/configuration/index.html @@ -704,55 +704,55 @@

    Flags

- + - + - + - + - + - + - + - + - + @@ -774,37 +774,37 @@

Folders

- + - + - + - + - + - + @@ -832,19 +832,19 @@

Metadata

- + - + - + @@ -867,7 +867,7 @@

Generation

- + diff --git a/site/setup/connection/connection/index.html b/site/setup/connection/connection/index.html index 3861dc7f..6f3ded85 100644 --- a/site/setup/connection/connection/index.html +++ b/site/setup/connection/connection/index.html @@ -854,19 +854,22 @@

Supported Data Connections

enableGenerateDataenableGenerateData true N Enable/disable data generation
enableCountenableCount true N Count the number of records generated. Can be disabled to improve performance
enableFailOnErrorenableFailOnError true N Whilst saving generated data, if there is an error, it will stop any further data from being generated
enableSaveSinkMetadataenableSaveSinkMetadata true N Enable/disable HTML reports summarising data generated, metadata of data generated (if enableSinkMetadata is enabled) and validation results (if enableValidation is enabled)
enableSinkMetadataenableSinkMetadata true N Run data profiling for the generated data. Shown in HTML reports if enableSaveSinkMetadata is enabled
enableValidationenableValidation false N Run validations as described in plan. Results can be viewed from logs or from HTML report if enableSaveSinkMetadata is enabled
enableGeneratePlanAndTasksenableGeneratePlanAndTasks false Y Enable/disable plan and task auto generation based off data source connections
enableRecordTrackingenableRecordTracking false Y Enable/disable which data records have been generated for any data source
enableDeleteGeneratedRecordsenableDeleteGeneratedRecords false Y Delete all generated records based off record tracking (if enableRecordTracking has been set to true)
planFilePathplanFilePath /opt/app/plan/customer-create-plan.yaml N Plan file path to use when generating and/or validating data
taskFolderPathtaskFolderPath /opt/app/task N Task folder path that contains all the task files (can have nested directories)
validationFolderPathvalidationFolderPath /opt/app/validation N Validation folder path that contains all the validation files (can have nested directories)
generatedDataResultsFolderPathgeneratedDataResultsFolderPath /opt/app/html N Where HTML reports get generated that contain information about data generated along with any validations performed
generatedPlanAndTaskFolderPathgeneratedPlanAndTaskFolderPath /tmp Y Folder path where generated plan and task files will be saved
recordTrackingFolderPathrecordTrackingFolderPath /opt/app/record-tracking Y Where record tracking parquet files get saved
numRecordsFromDataSourcenumRecordsFromDataSource 10000 Y Number of records read in from the data source
numRecordsForAnalysisnumRecordsForAnalysis 10000 Y Number of records used for data profiling from the records gathered in numRecordsFromDataSource
oneOfDistinctCountVsCountThresholdoneOfDistinctCountVsCountThreshold 0.1 Y Threshold ratio to determine if a field is of type oneOf (i.e. a field called status that only contains open or closed. Distinct count = 2, total count = 10, ratio = 2 / 10 = 0.2 therefore marked as oneOf)
numRecordsPerBatchnumRecordsPerBatch 100000 N Number of records across all data sources to generate per batch

All connection details follow the same pattern.

-
<connection format> {
+
<connection format> {
     <connection name> {
         <key> = <value>
     }
 }
-
+
+
+

Overriding configuration

When defining a configuration value that can be defined by a system property or environment variable at runtime, you can define that via the following:

-
url = "localhost"
+
url = "localhost"
 url = ${?POSTGRES_URL}
-
+

The above defines that if there is a system property or environment variable named POSTGRES_URL, then that value will be used for the url, otherwise, it will default to localhost.

+

Example task per data source

To find examples of a task for each type of data source, please check out this page.

File

@@ -874,54 +877,54 @@

File

that can be included as part of your file data source configuration if required. Links to specific file type configurations can be found below.

CSV

-
csv {
+
csv {
   customer_transactions {
     path = "/data/customer/transaction"
     path = ${?CSV_PATH}
   }
 }
-
+

Other available configuration for CSV can be found here

JSON

-
json {
+
json {
   customer_transactions {
     path = "/data/customer/transaction"
     path = ${?JSON_PATH}
   }
 }
-
+

Other available configuration for JSON can be found here

ORC

-
orc {
+
orc {
   customer_transactions {
     path = "/data/customer/transaction"
     path = ${?ORC_PATH}
   }
 }
-
+

Other available configuration for ORC can be found here

Parquet

-
parquet {
+
parquet {
   customer_transactions {
     path = "/data/customer/transaction"
     path = ${?PARQUET_PATH}
   }
 }
-
+

Other available configuration for Parquet can be found here

Delta (not supported yet)

-
delta {
+
delta {
   customer_transactions {
     path = "/data/customer/transaction"
     path = ${?DELTA_PATH}
   }
 }
-
+

JDBC

Follows the same configuration used by Spark as found here.
Sample can be found below

-
jdbc {
+
jdbc {
     postgres {
         url = "jdbc:postgresql://localhost:5432/customer"
         url = ${?POSTGRES_URL}
@@ -932,51 +935,39 @@ 

JDBC

driver = "org.postgresql.Driver" } } -
-

Ensure that the user has write permission so it is able to save the table to the target tables.

-
- - -
GRANT INSERT ON <schema>.<table> TO <user>;
-
- - +
+

Ensure that the user has write permission, so it is able to save the table to the target tables.

+
+SQL Permission Statements +
GRANT INSERT ON <schema>.<table> TO <user>;
+
-

Postgres

Permissions

Following permissions are required when generating plan and tasks:

**SQL Permission Statements**

```
GRANT SELECT ON information_schema.tables TO <user>;
GRANT SELECT ON information_schema.columns TO <user>;
GRANT SELECT ON information_schema.key_column_usage TO <user>;
GRANT SELECT ON information_schema.table_constraints TO <user>;
GRANT SELECT ON information_schema.constraint_column_usage TO <user>;
```

#### MySQL

##### Permissions

The following permissions are required when generating plans and tasks:

**SQL Permission Statements**

```
GRANT SELECT ON information_schema.columns TO <user>;
GRANT SELECT ON information_schema.statistics TO <user>;
GRANT SELECT ON information_schema.key_column_usage TO <user>;
```
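
There is no MySQL connection sample in this section, but it would presumably follow the same JDBC pattern as the Postgres example above. The sketch below uses the MySQL Connector/J driver class; the connection name, URL and environment variable are illustrative:

```
jdbc {
    mysql {
        url = "jdbc:mysql://localhost:3306/customer"
        url = ${?MYSQL_URL}
        driver = "com.mysql.cj.jdbc.Driver"
    }
}
```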

### Cassandra

Follows the same configuration as defined by the Spark Cassandra Connector as found here.

```
org.apache.spark.sql.cassandra {
    cassandra {
        spark.cassandra.connection.host = "localhost"
        spark.cassandra.connection.host = ${?CASSANDRA_HOST}
        ...
        spark.cassandra.auth.password = ${?CASSANDRA_PASSWORD}
    }
}
```
#### Permissions

Ensure that the user has write permission so that it is able to save data to the target tables.

**CQL Permission Statements**

```
GRANT INSERT ON <schema>.<table> TO <user>;
```

The following permissions are required when generating plans and tasks:

**CQL Permission Statements**

```
GRANT SELECT ON system_schema.tables TO <user>;
GRANT SELECT ON system_schema.columns TO <user>;
```

### Kafka

Define your Kafka bootstrap servers to connect to and send generated data to the corresponding topics. The topic is set at the step level.
Further details can be found here.

```
kafka {
    kafka {
        kafka.bootstrap.servers = "localhost:9092"
        kafka.bootstrap.servers = ${?KAFKA_BOOTSTRAP_SERVERS}
    }
}
```

The schema for pushing data to Kafka follows a specific top-level structure.
An example can be found here.
You can define the key, value, headers, partition or topic by following the linked schema.


### JMS

Uses a JNDI lookup to send messages to a JMS queue. Ensure that the messaging system you are using has your queue/topic registered via JNDI, otherwise a connection cannot be created.

```
jms {
    solace {
        initialContextFactory = "com.solacesystems.jndi.SolJNDIInitialContextFactory"
        connectionFactory = "/jms/cf/default"
        ...
        vpnName = ${?SOLACE_VPN}
    }
}
```

### HTTP

Define any username and/or password needed for the HTTP requests.
The URL is defined in the tasks so that generated data can be populated into the URL.

```
http {
    customer_api {
        user = "admin"
        user = ${?HTTP_USER}
        ...
        password = ${?HTTP_PASSWORD}
    }
}
```
diff --git a/site/setup/generator/count/index.html b/site/setup/generator/count/index.html

# Record Count

## Total Count

Total count is the simplest, as you define the total number of records you require for that particular step. For example, the step below will generate 1000 records for the CSV file.

```
name: "csv_file"
steps:
  - name: "transactions"
    type: "csv"
    options:
      path: "app/src/test/resources/sample/csv/transactions"
    count:
      total: 1000
```

## Generated Count

As with most things in data-caterer, the count can be generated based on some metadata. For example, to generate between 1000 and 2000 records, you could use the below configuration:

```
name: "csv_file"
steps:
  - name: "transactions"
    type: "csv"
    options:
      path: "app/src/test/resources/sample/csv/transactions"
    count:
      generator:
        type: "random"
        options:
          min: 1000
          max: 2000
```

## Per Column Count

Defining a per column count allows you to generate records "per set of columns". This means that, for a given set of columns, a particular number of records will be generated per combination of values for those columns.


### Total

This is a fixed number of records that will be generated each time, with no variation between runs.

In the example below, we have `count.total = 1000` and `count.perColumn.total = 2`, which means that 1000 * 2 = 2000 records will be generated for this CSV file every time data gets generated.

```
name: "csv_file"
steps:
  - name: "transactions"
    type: "csv"
    options:
      path: "app/src/test/resources/sample/csv/transactions"
    count:
      total: 1000
      perColumn:
        total: 2
        columnNames:
          - "account_id"
          - "name"
```

### Generated

You can also define a generator for the count per column. This can be used in scenarios where you want a variable number of records per set of columns.

In the example below, it will generate between `count.total * count.perColumn.generator.options.minValue` = (1000 * 1) = 1000 and `count.total * count.perColumn.generator.options.maxValue` = (1000 * 2) = 2000 records.

```
name: "csv_file"
steps:
  - name: "transactions"
    type: "csv"
    options:
      path: "app/src/test/resources/sample/csv/transactions"
    count:
      total: 1000
      perColumn:
        columnNames:
          - "account_id"
          - "name"
        generator:
          type: "random"
          options:
            maxValue: 2
            minValue: 1
```
diff --git a/site/setup/validation/validation/index.html b/site/setup/validation/validation/index.html

# Validations

Validations can be used to run data checks after you have run the data generator or even as a standalone task. A report summarising the success or failure of the validations is produced and can be examined for further investigation.

## Sample

```
---
name: "account_checks"
description: "Check account related fields have gone through system correctly"
dataSources:
  accountJson:
    options:
      path: "sample/json/txn-gen"
    validations:
      - expr: "amount < 100"
      - expr: "year == 2021"
        errorThreshold: 0.1
      - expr: "regexp_like(name, 'Peter .*')"
        errorThreshold: 200
```
diff --git a/site/sitemap.xml.gz b/site/sitemap.xml.gz
Binary files a/site/sitemap.xml.gz and b/site/sitemap.xml.gz differ