Update with version 0.3.7 changes to API, update home statement
pflooky committed Sep 13, 2023
1 parent c9dd693 commit 1792aff
Showing 55 changed files with 731 additions and 375 deletions.
7 changes: 5 additions & 2 deletions docs/get-started/docker.md
@@ -42,6 +42,10 @@ You can check out the example project found [here](https://github.com/pflooky/data-caterer-example)

```shell
git clone git@github.com:pflooky/data-caterer-example.git
#for Scala example
#src/main/scala/com/github/pflooky/plan/DocumentationPlanRun.scala
#for Java example
#src/main/java/com/github/pflooky/plan/DocumentationJavaPlanRun.java
```

=== "Java"
@@ -58,7 +62,6 @@ git clone git@github.com:pflooky/data-caterer-example.git
public class DocumentationJavaPlanRun extends PlanRun {
{
var myJson = json("account_info", "/tmp/data-caterer/json")
.numPartitions(1)
.schema(
field().name("account_id").regex("ACC[0-9]{8}"),
field().name("year").type(IntegerType.instance()).min(2022).max(2023),
Expand All @@ -85,7 +88,7 @@ git clone [email protected]:pflooky/data-caterer-example.git

```scala
import com.github.pflooky.datacaterer.api.PlanRun
import com.github.pflooky.datacaterer.api.model.{ArrayType, DateType, DoubleType}
import com.github.pflooky.datacaterer.api.model.{ArrayType, DateType, DoubleType, IntegerType}
import java.sql.Date
8 changes: 3 additions & 5 deletions docs/index.md
@@ -1,10 +1,8 @@
# Home

Using Data Caterer, you have the ability to generate production like data based on any source/target system whether it
be a CSV file, database table, etc. anywhere you want the data to be. Whether it be in a test environment or even in
your local laptop.
Just define your data source connections and data will be generated.
It can also be manually altered to produce data or scenarios the way you want.
Data Caterer is a data generation tool that helps you create production-like data across batch and event data systems.
You can then clean up the generated data or run data validations to ensure your systems have ingested it as expected.
Use the Java or Scala API, or YAML files, to help with setup and customisation; these are all run via Docker.
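
Since the updated description points at Docker as the way to run everything, here is a minimal, hedged sketch of that workflow based on the sample docker-compose.yaml updated later in this commit; the working directory and the ENABLE_RECORD_TRACKING override are illustrative, not prescribed by the docs.

```shell
# Run the sample setup shipped with these docs; the compose file lives under
# docs/sample/docker and starts the datacaterer service with the default postgres plan.
cd docs/sample/docker
docker-compose up datacaterer

# The compose file forwards a handful of flags as environment variables, for example:
ENABLE_RECORD_TRACKING=true docker-compose up datacaterer
```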

Main features of the data generator include:

14 changes: 10 additions & 4 deletions docs/sample/docker/data/custom/application.conf
@@ -9,6 +9,12 @@ flags {
enableRecordTracking = ${?ENABLE_RECORD_TRACKING}
enableDeleteGeneratedRecords = false
enableDeleteGeneratedRecords = ${?ENABLE_DELETE_GENERATED_RECORDS}
enableSinkMetadata = true
enableSinkMetadata = ${?ENABLED_SINK_METADATA}
enableSaveReports = true
enableSaveReports = ${?ENABLED_SAVE_REPORTS}
enableValidation = false
enableValidation = ${?ENABLED_VALIDATION}
}
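
The three flags added above can be toggled through their `${?...}` environment overrides. Below is a hedged sketch of flipping them when running the published image directly; the `docker run` form, the config path, and the volume mount mirror the sample docker-compose.yaml in this commit rather than anything prescribed here, and the flag values are illustrative.

```shell
# Toggle the new flags via the environment variable names declared above.
docker run --rm \
  -e APPLICATION_CONFIG_PATH=/opt/app/custom/application.conf \
  -e ENABLED_SINK_METADATA=false \
  -e ENABLED_SAVE_REPORTS=true \
  -e ENABLED_VALIDATION=true \
  -v "$(pwd)/data/custom:/opt/app/custom" \
  datacatering/data-caterer-basic:0.3.7
```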

folders {
@@ -20,8 +26,8 @@ folders {
taskFolderPath = ${?TASK_FOLDER_PATH}
recordTrackingFolderPath = "/opt/app/custom/recordTracking"
recordTrackingFolderPath = ${?RECORD_TRACKING_FOLDER_PATH}
generatedDataResultsFolderPath = "/opt/app/custom/report"
generatedDataResultsFolderPath = ${?GENERATED_DATA_METADATA_FOLDER_PATH}
generatedReportsFolderPath = "/opt/app/custom/report"
generatedReportsFolderPath = ${?GENERATED_REPORTS_FOLDER_PATH}
}

metadata {
@@ -34,9 +40,9 @@ generation {
numRecordsPerBatch = 1000000
}

spark {
runtime {
master = "local[*]"
master = ${?SPARK_MASTER}
master = ${?DATA_CATERER_MASTER}
config {
"spark.driver.memory" = "2g"
"spark.executor.memory" = "2g"
@@ -3,7 +3,7 @@ steps:
- name: "accounts"
type: "cassandra"
count:
total: 10
records: 10
options:
keyspace: "account"
table: "accounts"
@@ -49,7 +49,7 @@ steps:
- name: "account_status_history"
type: "cassandra"
count:
total: 10
records: 10
perColumn:
columnNames:
- "account_id"
@@ -5,7 +5,7 @@ steps:
options:
path: "/opt/app/custom/csv/transactions"
count:
total: 100
records: 100
perColumn:
columnNames:
- "account_id"
@@ -3,7 +3,7 @@ steps:
- name: "account"
type: "json"
count:
total: 1000
records: 1000
options:
path: "/opt/app/custom/json/account"
schema:
@@ -6,7 +6,7 @@ steps:
path: "/opt/app/custom/parquet/transactions"
partitionBy: "year"
count:
total: 1000
records: 1000
perColumn:
columnNames:
- "account_id"
@@ -2,7 +2,7 @@ name: "json_account_http"
steps:
- name: "account"
count:
total: 50
records: 50
schema:
fields:
- name: "url"
@@ -3,7 +3,7 @@ steps:
- name: "accounts"
type: "mysql"
count:
total: 50
records: 50
options:
dbtable: "customer.accounts"
schema:
@@ -3,7 +3,7 @@ steps:
- name: "accounts"
type: "postgres"
count:
total: "1000000"
records: "1000"
options:
dbtable: "account.accounts"
schema:
@@ -3,7 +3,7 @@ steps:
- name: "balances"
type: "postgres"
count:
total: 1000000
records: 1000000
options:
dbtable: "account.balances"
schema:
@@ -31,7 +31,7 @@ steps:
- name: "transactions"
type: "postgres"
count:
total: 400000
records: 400000
perColumn:
columnNames:
- "account_number"
@@ -3,7 +3,7 @@ steps:
- name: "jms_account"
type: "json"
count:
total: 50
records: 50
options:
# destinationName: "/JNDI/Q/test_queue"
destinationName: "/JNDI/T/test_topic"
@@ -3,7 +3,7 @@ steps:
- name: "kafka_account"
type: "json"
count:
total: 10
records: 10
options:
topic: "account-topic"
schema:
7 changes: 3 additions & 4 deletions docs/sample/docker/docker-compose.yaml
@@ -1,17 +1,16 @@
version: "3.9"
services:
datacaterer:
image: "datacatering/data-caterer-basic:0.3.3"
# image: "datacatering/data-caterer:0.2.2"
image: "datacatering/data-caterer-basic:0.3.7"
environment:
- "APPLICATION_CONFIG_PATH=${APPLICATION_CONFIG_PATH:-/opt/app/custom/application.conf}"
- "PLAN_FILE_PATH=/opt/app/custom/${PLAN:-plan/${DATA_SOURCE:-postgres}}.yaml"
- "ENABLE_GENERATE_DATA=${ENABLE_GENERATE_DATA:-true}"
- "ENABLE_RECORD_TRACKING=${ENABLE_RECORD_TRACKING:-false}"
- "ENABLE_GENERATE_PLAN_AND_TASKS=${ENABLE_GENERATE_PLAN_AND_TASKS:-false}"
- "ENABLE_DELETE_GENERATED_RECORDS=${ENABLE_DELETE_GENERATED_RECORDS:-false}"
- "SPARK_DRIVER_MEMORY=2g"
- "SPARK_EXECUTOR_MEMORY=2g"
- "DRIVER_MEMORY=2g"
- "EXECUTOR_MEMORY=2g"
volumes:
- "./data/custom:/opt/app/custom"
depends_on:
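
With the rename to generatedReportsFolderPath earlier in this commit (pointing at /opt/app/custom/report) and the ./data/custom volume mount above, the HTML report should end up on the host after a run. Below is a hedged way to check it; the index.html file name is an assumption, not something confirmed by this commit.

```shell
# /opt/app/custom/report inside the container maps to ./data/custom/report on the host
# via the volume mount defined in this docker-compose.yaml.
ls docs/sample/docker/data/custom/report
open docs/sample/docker/data/custom/report/index.html   # macOS; use xdg-open on Linux
```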
2 changes: 2 additions & 0 deletions docs/sample/index.md
@@ -2,6 +2,8 @@

Below are examples of different types of plans and tasks that can be helpful when trying to create your own. You can use
these as a template or to search for something related to your particular use case.

Check out this [repo](https://github.com/pflooky/data-caterer-example) for example Java and Scala API usage.

## Base Concept

34 changes: 12 additions & 22 deletions docs/setup/configuration.md
@@ -147,7 +147,7 @@ when analysing the generated data if the number of records generated is large.
|--------------------------------------|---------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `numRecordsFromDataSource` | 10000 | Y | Number of records read in from the data source that could be used for data profiling |
| `numRecordsForAnalysis` | 10000 | Y | Number of records used for data profiling from the records gathered in `numRecordsFromDataSource` |
| `oneOfMinCount` | 1000 | N | Minimum number of records required before considering if a field can be of type `oneOf` |
| `oneOfMinCount` | 1000 | Y | Minimum number of records required before considering if a field can be of type `oneOf` |
| `oneOfDistinctCountVsCountThreshold` | 0.2 | Y | Threshold ratio to determine if a field is of type `oneOf` (i.e. a field called `status` that only contains `open` or `closed`. Distinct count = 2, total count = 10, ratio = 2 / 10 = 0.2 therefore marked as `oneOf`) |
| `numGeneratedSamples` | 10 | N | Number of sample records from generated data to take. Shown in HTML report |

@@ -222,7 +222,7 @@ batch.
}
```

## Spark
## Runtime

Given Data Caterer uses Spark as the base framework for data processing, you can configure the job to your
specifications via configuration as seen [here](https://spark.apache.org/docs/latest/configuration.html).
@@ -231,39 +231,29 @@ specifications via configuration as seen [here](https://spark.apache.org/docs/latest/configuration.html).

```java
configuration()
.sparkMaster("local[*]")
.sparkConfig(Map.of("spark.driver.cores", "5"))
.addSparkConfig("spark.driver.memory", "10g");
.master("local[*]")
.runtimeConfig(Map.of("spark.driver.cores", "5"))
.addRuntimeConfig("spark.driver.memory", "10g");
```

=== "Scala"

```scala
configuration
.sparkMaster("local[*]")
.sparkConfig(Map("spark.driver.cores" -> "5"))
.addSparkConfig("spark.driver.memory" -> "10g")
.master("local[*]")
.runtimeConfig(Map("spark.driver.cores" -> "5"))
.addRuntimeConfig("spark.driver.memory" -> "10g")
```

=== "application.conf"

```
spark {
runtime {
master = "local[*]"
master = ${?SPARK_MASTER}
master = ${?DATA_CATERER_MASTER}
config {
"spark.driver.memory" = "2g"
"spark.executor.memory" = "2g"
"spark.sql.cbo.enabled" = "true"
"spark.sql.adaptive.enabled" = "true"
"spark.sql.cbo.planStats.enabled" = "true"
"spark.sql.legacy.allowUntypedScalaUDF" = "true"
"spark.sql.statistics.histogram.enabled" = "true"
"spark.sql.shuffle.partitions" = "4"
"spark.sql.catalog.postgres" = ""
"spark.sql.catalog.cassandra" = "com.datastax.spark.connector.datasource.CassandraCatalog"
"spark.hadoop.fs.s3a.directory.marker.retention" = "keep"
"spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled" = "true"
"spark.driver.cores" = "5"
"spark.driver.memory" = "10g"
}
}
```
