Clean up docker compose to ensure services are healthy before generating data
pflooky committed Aug 9, 2023
1 parent d5f8e04 commit 6c71ca3
Showing 32 changed files with 427 additions and 5,329 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
docs/sample/docker/data/custom/csv
docs/sample/docker/data/custom/json
docs/sample/docker/data/custom/parquet
+docs/sample/docker/data/custom/generated
33 changes: 16 additions & 17 deletions docs/connections/connections.md
@@ -4,13 +4,13 @@ Details of all the connection configuration supported can be found in the below

## Supported Data Connections

-| Data Source Type | Data Source                |
-|------------------|----------------------------|
-| Database         | Postgres, MySQL, Cassandra |
-| File             | CSV, JSON, ORC, Parquet    |
-| Kafka            | Kafka                      |
-| JMS              | Solace                     |
-| HTTP             | GET, PUT, POST             |
+| Data Source Type | Data Source                                         |
+|------------------|-----------------------------------------------------|
+| Database         | Postgres, MySQL, Cassandra                          |
+| File             | CSV, JSON, ORC, Parquet                             |
+| Kafka            | Kafka                                               |
+| JMS              | Solace                                              |
+| HTTP             | GET, PUT, POST, DELETE, PATCH, HEAD, TRACE, OPTIONS |

All connection details follow the same pattern.

@@ -198,8 +198,10 @@ GRANT INSERT ON <schema>.<table> TO <user>;

### Kafka

-Define your Kafka bootstrap server to connect and send generated data to corresponding topics. Topic gets set at a step level.
-Further details can be found [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#writing-data-to-kafka)
+Define your Kafka bootstrap server to connect and send generated data to corresponding topics. Topic gets set at a step
+level.
+Further details can be
+found [here](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#writing-data-to-kafka)

```
kafka {
@@ -209,12 +211,11 @@
}
}
```

When defining your schema for pushing data to Kafka, it follows a specific top level schema.
An example can be found [here](../sample/docker/data/custom/task/kafka/kafka-account-task.yaml).
You can define the key, value, headers, partition or topic by following the linked schema.
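For illustration, a minimal sketch of what such a task file could look like (the step layout and field names are assumptions based on the description above, not copied from the linked `kafka-account-task.yaml`):

```yaml
# Hypothetical Kafka task sketch; see the linked example for the real schema
name: "kafka_account_task"
steps:
  - name: "kafka_account"
    options:
      topic: "account-topic"    # topic is set at the step level
    schema:
      fields:
        - name: "key"           # message key
        - name: "value"         # message payload
        - name: "headers"       # optional message headers
        - name: "partition"     # optional target partition
```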


### JMS

Uses JNDI lookup to send messages to JMS queue. Ensure that the messaging system you are using has your queue/topic
Expand All @@ -240,17 +241,15 @@ jms {

### HTTP

-Define a URL to connect to when sending HTTP requests.
-Later, can have the ability to define generated data as part of the URL.
-Define any username and/or password needed for the HTTP requests.
+The url is defined in the tasks to allow for generated data to be populated in the url.

```
http {
customer_api {
url = "http://localhost:80/get"
url = ${?HTTP_URL}
user = "admin" #optional
user = "admin"
user = ${?HTTP_USER}
password = "admin" #optional
password = "admin"
password = ${?HTTP_PASSWORD}
}
}
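In snippets like the one above, the repeated assignment with `${?HTTP_USER}` is HOCON's optional substitution: when the named environment variable is set it overrides the default on the line before it, otherwise the default value stands.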
2 changes: 1 addition & 1 deletion docs/generators/count.md
@@ -53,7 +53,7 @@ You can also use a combination of the above two methods to generate the number o
When defining a total count within the `perColumn` configuration, it translates to only creating `(count.total * count.perColumn.total)` records.
This is a fixed number of records that will be generated each time, with no variation between runs.

-In the example below, we have `count.total=1000` and `count.perColumn.total=2`. Which means that `1000 * 2=2000` records will be generated
+In the example below, we have `count.total = 1000` and `count.perColumn.total = 2`. Which means that `1000 * 2 = 2000` records will be generated
for this CSV file every time data gets generated.

```yaml
# NOTE: the original example is truncated in this view; this is a hypothetical
# reconstruction consistent with the counts described above (column name assumed)
count:
  total: 1000
  perColumn:
    columnNames:
      - "account_id"
    total: 2
```
12 changes: 6 additions & 6 deletions docs/generators/generators.md
@@ -37,12 +37,12 @@ descriptions:

### String

-| Option     | Default | Example                                                                                    | Description                                                                                                                                                                                                              |
-|------------|---------|--------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| minLen     | 1       | minLen: "2"                                                                                | Ensures that all generated strings have at least length `minLen`                                                                                                                                                         |
-| maxLen     | 10      | maxLen: "15"                                                                               | Ensures that all generated strings have at most length `maxLen`                                                                                                                                                          |
-| expression | <empty> | expression: "#{Name.name}"<br/> expression:"#{Address.city}/#{Demographic.maritalStatus}" | Will generate a string based on the faker expression provided. All possible faker expressions can be found [here](../sample/datafaker/expressions.txt)<br/> Expression has to be in format `#{<faker expression name>}` |
-| enableNull | false   | enableNull: "true"                                                                         | Enable/disable null values being generated                                                                                                                                                                               |
+| Option     | Default | Example                                                                                    | Description                                                                                                                                                                                                              |
+|------------|---------|--------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| minLen     | 1       | minLen: "2"                                                                                | Ensures that all generated strings have at least length `minLen`                                                                                                                                                         |
+| maxLen     | 10      | maxLen: "15"                                                                               | Ensures that all generated strings have at most length `maxLen`                                                                                                                                                          |
+| expression | <empty> | expression: "#{Name.name}"<br/> expression:"#{Address.city}/#{Demographic.maritalStatus}" | Will generate a string based on the faker expression provided. All possible faker expressions can be found [here](../sample/datafaker/expressions.txt)<br/> Expression has to be in format `#{<faker expression name>}` |
+| enableNull | false   | enableNull: "true"                                                                         | Enable/disable null values being generated                                                                                                                                                                               |

**Edge cases**: ("", "\n", "\r", "\t", " ", "\\u0000", "\\ufff")
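As an illustration, a hypothetical field definition using these options (the surrounding `fields` layout is an assumption, not taken from this page):

```yaml
# Hypothetical string field sketch; option values mirror the table above
fields:
  - name: "customer_name"
    type: "string"
    generator:
      type: "random"
      options:
        expression: "#{Name.name}"   # Datafaker expression
        enableNull: "true"           # allow null values to be generated
```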

2 changes: 1 addition & 1 deletion docs/get-started/docker.md
@@ -55,7 +55,7 @@ cat data/custom/generated/plan/plan_*
#### Generate data with record tracking

```shell
-APPLICATION_CONFIG_PATH=/opt/app/custom/application-dvd.conf ENABLE_GENERATE_DATA=true ENABLE_GENERATE_PLAN_AND_TASKS=false ENABLE_RECORD_TRACKING=true DATA_SOURCE=postgresdvd PLAN=generated/plan/plan_20230803T040203Z docker-compose up -d datacaterer
+APPLICATION_CONFIG_PATH=/opt/app/custom/application-dvd.conf ENABLE_GENERATE_DATA=true ENABLE_GENERATE_PLAN_AND_TASKS=false ENABLE_RECORD_TRACKING=true DATA_SOURCE=postgresdvd PLAN=generated/plan/$(ls data/custom/generated/plan/ | grep plan | head -1 | awk -F " " '{print $NF}' | sed 's/\.yaml//g') docker-compose up -d datacaterer
```
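The command substitution in the new version takes the first file listed under `data/custom/generated/plan/` that matches `plan`, keeps only the filename via `awk '{print $NF}'`, and strips the `.yaml` suffix with `sed`, so the run references a generated plan file without hard-coding its timestamped name.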

#### Delete the generated data
7 changes: 7 additions & 0 deletions docs/index.md
@@ -5,6 +5,13 @@ be a CSV file, database table, etc. anywhere you want the data to be. Whether it
your local laptop.
Just define your data source connections and data will be generated.
It can also be manually altered to produce data or scenarios the way you want.

+Main features of the data generator include:
+- Ability to gather metadata about data sources
+- Generate data in either batch or real-time
+- Maintain referential integrity across generated data
+- Create custom data generation scenarios
+- Delete generated data

<video src="https://github.com/pflooky/data-caterer-docs/assets/26299147/d853241b-7c7e-4943-aefe-4002b848edf5" controls="controls" style="max-width: 730px;">
</video>
3 changes: 2 additions & 1 deletion docs/sample/docker/README.md
@@ -8,9 +8,10 @@ All you need to do is define which data source you want to run with via a comman
DATA_SOURCE=postgres docker-compose up -d datacaterer
```

-You can change `DATA_SOURCE` to one of the following:
+You can change `DATA_SOURCE` to one of the following:
- postgres
- mysql
- cassandra
- solace
- kafka
+- http
4 changes: 3 additions & 1 deletion docs/sample/docker/data/custom/application-dvd.conf
@@ -9,6 +9,8 @@ flags {
enableRecordTracking = ${?ENABLE_RECORD_TRACKING}
enableDeleteGeneratedRecords = false
enableDeleteGeneratedRecords = ${?ENABLE_DELETE_GENERATED_RECORDS}
+enableFailOnError = true
+enableFailOnError = ${?ENABLE_FAIL_ON_ERROR}
}

folders {
@@ -46,7 +48,7 @@ spark {

jdbc {
postgresDvd {
url = "jdbc:postgresql://postgresdvd:5432/dvdrental"
url = "jdbc:postgresql://localhost:5432/dvdrental"
url = ${?POSTGRES_URL}
user = "postgres"
user = ${?POSTGRES_USER}
8 changes: 3 additions & 5 deletions docs/sample/docker/data/custom/application.conf
@@ -61,7 +61,7 @@ parquet {

jdbc {
postgresCustomer {
url = "jdbc:postgresql://postgres:5432/customer"
url = "jdbc:postgresql://postgresserver:5432/customer"
url = ${?POSTGRES_URL}
user = "postgres"
user = ${?POSTGRES_USER}
@@ -70,7 +70,7 @@
driver = "org.postgresql.Driver"
}
mysql {
url = "jdbc:mysql://mysql:3306/customer"
url = "jdbc:mysql://mysqlserver:3306/customer"
url = ${?MYSQL_URL}
user = "root"
user = ${?MYSQL_USERNAME}
@@ -83,7 +83,7 @@

org.apache.spark.sql.cassandra {
cassandra {
spark.cassandra.connection.host = "cassandra"
spark.cassandra.connection.host = "cassandraserver"
spark.cassandra.connection.host = ${?CASSANDRA_HOST}
spark.cassandra.connection.port = "9042"
spark.cassandra.connection.port = ${?CASSANDRA_PORT}
@@ -96,8 +96,6 @@

http {
httpbin {
url = "http://httpbin:80/put"
url = ${?HTTP_URL}
}
}

6 changes: 6 additions & 0 deletions docs/sample/docker/data/custom/clean_dvd.sql
@@ -1,3 +1,9 @@
+delete from payment where payment_id > 27502;
+delete from rental where rental_id > 10005;
+delete from customer where customer_id > 599;
+delete from store where store_id > 2;
+delete from staff where staff_id > 2;
+delete from address where address_id > 605;
delete from film_category where category_id > 16;
delete from inventory where film_id > 1000;
delete from film where language_id > 16;
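The added deletes run child tables before their parents (`payment` and `rental` before `customer`, `store`, `staff`, and `address`) so foreign-key constraints are not violated, and each id threshold appears to match the highest id in the seed `dvdrental` data, so only generated rows get removed.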
