# GDB-10266 Set the asg_enable_instance_refresh to default to false #46

Merged · 3 commits · May 22, 2024
**README.md** (22 additions, 15 deletions)
@@ -157,7 +157,7 @@ Before you begin using this Terraform module, ensure you meet the following prerequisites
| lb\_enable\_access\_logs | Enable or disable access logs for the NLB | `bool` | `false` | no |
| lb\_access\_logs\_expiration\_days | Define the days after which the LB access logs should be deleted. | `number` | `14` | no |
| bucket\_replication\_destination\_region | Define in which Region should the bucket be replicated | `string` | `null` | no |
| asg\_enable\_instance\_refresh | Enables instance refresh for the GraphDB Auto scaling group. A refresh is started when any of the following Auto Scaling Group properties change: launch\_configuration, launch\_template, mixed\_instances\_policy | `bool` | `true` | no |
| asg\_enable\_instance\_refresh | Enables instance refresh for the GraphDB Auto scaling group. A refresh is started when any of the following Auto Scaling Group properties change: launch\_configuration, launch\_template, mixed\_instances\_policy | `bool` | `false` | no |
| asg\_instance\_refresh\_checkpoint\_delay | Number of seconds to wait after a checkpoint. | `number` | `3600` | no |
| graphdb\_enable\_userdata\_scripts\_on\_reboot | (Experimental) Modifies cloud-config to always run user data scripts on EC2 boot | `bool` | `false` | no |
<!-- END_TF_DOCS -->
@@ -343,29 +343,36 @@ Support for this will be introduced in the future.

### Upgrading GraphDB Version

To automatically update the GraphDB version with `terraform apply`, you need to set `enable_instance_refresh` to `true`
in your `tfvars` file. This configuration will refresh your already running instances with new ones,
replacing them one at a time.
To automatically update the GraphDB version with `terraform apply`, you could set `enable_instance_refresh` to `true`
in your `tfvars` file. This configuration will enable [instance refresh](https://docs.aws.amazon.com/autoscaling/ec2/userguide/instance-refresh-overview.html)
for the ASG and will replace your already running instances with new ones, one at a time.

Please note that by default, the instance refresh process will wait for one hour before moving on to update the next instance.
This is a precautionary measure as GraphDB may need time to sync with the other nodes.
You can control this delay by updating the value of `instance_refresh_checkpoint_delay`.
By default, the instance refresh process waits for one hour before updating the next instance.
This delay allows GraphDB time to sync with other nodes.
You can adjust this delay by changing the `instance_refresh_checkpoint_delay` value.
If there are many writes to the cluster, consider increasing this delay.
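
For example, a minimal `tfvars` snippet for this could look like the following sketch. It uses the `asg_`-prefixed input names from the inputs table above (the prose refers to the same inputs without the prefix); the values are illustrative only:

```hcl
# Illustrative values only
asg_enable_instance_refresh           = true
# Wait two hours between node replacements, e.g. for a write-heavy cluster
asg_instance_refresh_checkpoint_delay = 7200
```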

It's important to note that if you have made changes to any GraphDB configurations,
they will be applied during the instance refresh process with the exception for the `graphdb_admin_password`.
Support for this will be introduced in the future.

**Important:** Having `enable_instance_refresh` enabled when scaling up the GraphDB cluster may lead to data
replication issues, as existing instances will still undergo the refresh process.
Depending on the data size, the new nodes might fail in joining the cluster due to the instance refresh.
Note that any changes to GraphDB configurations will be applied during the instance refresh process,
except for the `graphdb_admin_password`.
Support for updating the admin password will be introduced in a future release.

### ⚠️ **WARNING**
Enabling `enable_instance_refresh` while scaling out the GraphDB cluster may lead to data replication issues or a broken cluster configuration.
Existing instances could still undergo the refresh process and might change their original Availability Zone,
and, depending on the data size, new nodes might fail to join the cluster while the refresh is in progress.

**We strongly recommend disabling enable_instance_refresh when scaling up the cluster.**

**We strongly recommend setting `enable_instance_refresh` to `false` when scaling up the cluster.**
To work around this issue, you can manually set "Scale-in protection" on the existing nodes, scale out the cluster,
and then remove the "Scale-in protection".
However, any configuration changes will not be applied to the old instances, which could cause their configuration to drift from that of the new nodes.
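
As a sketch of that workaround with the AWS CLI (the ASG name and instance IDs below are placeholders, substitute your own):

```bash
# Protect the existing nodes before scaling out (placeholder IDs)
aws autoscaling set-instance-protection \
  --auto-scaling-group-name my-graphdb-asg \
  --instance-ids i-0123456789abcdef0 i-0fedcba9876543210 \
  --protected-from-scale-in

# ...scale out the cluster and wait for the new nodes to join...

# Remove the protection afterwards
aws autoscaling set-instance-protection \
  --auto-scaling-group-name my-graphdb-asg \
  --instance-ids i-0123456789abcdef0 i-0fedcba9876543210 \
  --no-protected-from-scale-in
```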

## Local Development

Instead of using the module dependency, you can create a local variables file named `terraform.tfvars` and provide
configuration overrides there.
Here's an example of a terraform.tfvars file:
Here's an example of a `terraform.tfvars` file:

### terraform.tfvars
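
The example file itself is collapsed in this diff; a minimal sketch, using only input names from the table above with placeholder values, might look like:

```hcl
# Placeholder values; see the full inputs table above
lb_enable_access_logs                 = true
lb_access_logs_expiration_days        = 30
asg_enable_instance_refresh           = false
asg_instance_refresh_checkpoint_delay = 3600
```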

**modules/graphdb/templates/00_functions.sh** (8 additions, 0 deletions)
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

# Generic helper functions

# Function to print messages with timestamps
log_with_timestamp() {
echo "$(date '+%Y-%m-%d %H:%M:%S'): $1"
}
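
A quick usage sketch; the output format follows the `date` pattern above:

```bash
log_with_timestamp "Provisioning started"
# Example output: 2024-05-22 10:15:00: Provisioning started
```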
@@ -11,6 +11,13 @@ set -o errexit
set -o nounset
set -o pipefail

# Imports helper functions
source /var/lib/cloud/instance/scripts/part-002
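# Note: cloud-init stores user-data parts under /var/lib/cloud/instance/scripts/,
# numbered in order; part-002 is assumed to be 00_functions.sh from above.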

echo "#####################################################"
echo "# Please be patient, these scripts take time #"
echo "#####################################################"

# This handles instance refreshing where new and old nodes are both present.
# Waiting until the ASG nodes are equal to the expected node count and proceeding with the provisioning afterwards.
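# IMDSv2 requires a session token (PUT to /latest/api/token) before any metadata reads.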
IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
@@ -20,13 +27,13 @@ ASG_NAME=${name}
instance_refresh_status=$(aws autoscaling describe-instance-refreshes --auto-scaling-group-name "$ASG_NAME" --query 'InstanceRefreshes[?Status==`InProgress`]' --output json)

if [ "$instance_refresh_status" != "[]" ]; then
echo "An instance refresh is currently in progress for the ASG: $ASG_NAME"
log_with_timestamp "An instance refresh is currently in progress for the ASG: $ASG_NAME"
echo "$instance_refresh_status" | jq '.'

IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
INSTANCE_ID=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/instance-id)

echo "Waiting for default EC2 status check to pass for instance $INSTANCE_ID..."
log_with_timestamp "Waiting for default EC2 status check to pass for instance $INSTANCE_ID..."

# Loop until the default status check passes
while true; do
@@ -35,7 +42,7 @@ if [ "$instance_refresh_status" != "[]" ]; then
system_status=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID --query 'InstanceStatuses[0].SystemStatus.Status' --output text)

if [[ "$instance_status" == "ok" && $system_status == "ok" ]]; then
echo "Default EC2 status check passed for instance $INSTANCE_ID."
log_with_timestamp "Default EC2 status check passed for instance $INSTANCE_ID."
break
fi

@@ -49,23 +56,34 @@ if [ "$instance_refresh_status" != "[]" ]; then

# Find out if the current instance was created in response to an instance refresh
if [ "$matching_activities" != "[]" ]; then
echo "Current instance was created in response to instance refresh:"
log_with_timestamp "Current instance was created in response to instance refresh:"
echo "$matching_activities" | jq '.'

echo "Waiting for an available volume in $AZ"
while true; do
log_with_timestamp "Waiting for an available volume in $AZ"

TIMEOUT=600 # Timeout in seconds (10 minutes)
ELAPSED=0

while [ $ELAPSED -lt $TIMEOUT ]; do
# Get the list of volumes in the current availability zone
available_volumes=$(aws ec2 describe-volumes --filters Name=availability-zone,Values=$AZ Name=status,Values=available Name=volume-type,Values=gp3 --query "Volumes[*].VolumeId" --output text)
# Check if any volumes are available
if [ -n "$available_volumes" ]; then
echo "Found an available volume in $AZ."
log_with_timestamp "Found an available volume in $AZ."
break
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done

if [ $ELAPSED -ge $TIMEOUT ]; then
log_with_timestamp "Timeout reached while waiting for an available volume in $AZ. Exiting..."
exit 1
fi

else
echo "Current instance was not created in response to instance refresh. Proceeding with the disk provisioning script"
log_with_timestamp "Current instance was not created in response to instance refresh. Proceeding with the volume provisioning."
fi
else
echo "No instance refresh is currently in progress for the ASG: $ASG_NAME"
log_with_timestamp "No instance refresh is currently in progress for the ASG: $ASG_NAME"
fi
@@ -12,6 +12,9 @@ set -o errexit
set -o nounset
set -o pipefail

# Imports helper functions
source /var/lib/cloud/instance/scripts/part-002

echo "###########################################"
echo "# Creating/Attaching managed disks #"
echo "###########################################"
@@ -25,7 +28,7 @@ AVAILABLE_VOLUMES=()

# Function to create a volume
create_volume() {
echo "Creating new volume"
log_with_timestamp "Creating new volume"
VOLUME_ID=$(aws ec2 create-volume \
--availability-zone "$AVAILABILITY_ZONE" \
--encrypted \
@@ -41,30 +44,30 @@ create_volume() {

# Wait for the volume to be available
aws ec2 wait volume-available --volume-ids "$VOLUME_ID"
echo "Successfully created volume: $VOLUME_ID"
log_with_timestamp "Successfully created volume: $VOLUME_ID"
}

# Function to attach volumes
attach_volumes() {
for volume in "$${AVAILABLE_VOLUMES[@]}"; do
echo "Trying to attach volume: $volume"
log_with_timestamp "Trying to attach volume: $volume"
if aws ec2 attach-volume --volume-id "$volume" --instance-id "$INSTANCE_ID" --device "${device_name}"; then
echo "Volume $volume attached successfully"
log_with_timestamp "Volume $volume attached successfully"
return
else
echo "Failed to attach volume $volume. Trying the next volume..."
log_with_timestamp "Failed to attach volume $volume. Trying the next volume..."
fi
done

echo "No available volumes to attach. Creating a new volume..."
log_with_timestamp "No available volumes to attach. Creating a new volume..."
AVAILABLE_VOLUMES=()
create_volume
attach_volumes
}

# Check if the device is already mounted
if mount | grep -q "on $disk_mount_point"; then
echo "Device is already mounted at $disk_mount_point"
log_with_timestamp "Device is already mounted at $disk_mount_point"
else
for _ in {1..6}; do
VOLUME_ID=$(aws ec2 describe-volumes \
@@ -73,18 +76,18 @@

if [ -n "$VOLUME_ID" ]; then
AVAILABLE_VOLUMES=($VOLUME_ID)
echo "Found volumes: $${AVAILABLE_VOLUMES[*]}"
log_with_timestamp "Found volumes: $${AVAILABLE_VOLUMES[*]}"
break
else
echo "EBS volume not yet available. Retrying..."
log_with_timestamp "EBS volume not yet available. Retrying..."
sleep 10
fi
done

if [ -z "$${AVAILABLE_VOLUMES[*]}" ]; then
create_volume
fi
echo "No device is mounted at $disk_mount_point"
log_with_timestamp "No device is mounted at $disk_mount_point"
attach_volumes

# Handle the EBS volume used for the GraphDB data directory.
@@ -105,17 +108,17 @@
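# On Nitro instances, the EBS device name is embedded in the vendor-specific
# area of the NVMe Identify Controller data; bytes 3073-3104 hold it here.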
real_device=$(nvme id-ctrl --raw-binary "$volume" | cut -c3073-3104 | tr -s ' ' | sed 's/ $//')
if [[ "$device_mapping_full" == "$real_device" || "$device_mapping_short" == "$real_device" ]]; then
graphdb_device="$volume"
echo "Device found: $graphdb_device"
log_with_timestamp "Device found: $graphdb_device"
break 2
fi
fi
done
echo "Device not available, retrying ..."
log_with_timestamp "Device not available, retrying ..."
sleep 5
done

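# "file -s" reports "<device>: data" when no filesystem is present,
# so the volume is formatted with ext4 only on first use.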
if [ "$graphdb_device: data" == "$(file -s "$graphdb_device")" ]; then
echo "Creating file system for $graphdb_device"
log_with_timestamp "Creating file system for $graphdb_device"
mkfs -t ext4 "$graphdb_device"
fi

@@ -125,9 +128,9 @@
fi

mount "$disk_mount_point"
echo "The disk at $graphdb_device is now mounted at $disk_mount_point."
log_with_timestamp "The disk at $graphdb_device is now mounted at $disk_mount_point."

echo "Creating data folders"
log_with_timestamp "Creating data folders"
mkdir -p "$disk_mount_point/node" "$disk_mount_point/cluster-proxy"
chown -R graphdb:graphdb "$disk_mount_point"
fi
**modules/graphdb/templates/02_dns_provisioning.sh.tpl** (0 additions, 80 deletions)

This file was deleted.
