GDB-9900 Cluster rejoin #33

Merged
merged 1 commit on Apr 9, 2024
GDB-9900 Cluster rejoin
Added a check: if only 1 node is deployed, don't set up DNS records or attempt to create a cluster
Variable renaming and tidying up
Refactored the user data scripts
simonzhekoff authored and viktor-ribchev committed Apr 9, 2024
commit 0d4dc1fa80924e784c62057ff21976d57d21b175
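The rejoin behavior described in the commit message hinges on whether the node still has Raft state on its data volume. A minimal sketch of that condition, assuming a hypothetical Raft directory path (the actual path and rejoin call are not part of this diff):

```bash
#!/usr/bin/env bash
# Illustrative only: RAFT_DIR is an assumed location, not taken from this PR.
RAFT_DIR="/var/opt/graphdb/node/data/cluster"

if [ ! -d "$RAFT_DIR" ] || [ -z "$(ls -A "$RAFT_DIR" 2>/dev/null)" ]; then
  # Empty or missing Raft folder: treat the node as a fresh member and rejoin.
  echo "Raft folder missing or empty - node will attempt to rejoin the cluster"
else
  # Existing Raft state: start normally and let the node catch up on its own.
  echo "Raft folder present - node starts with its existing cluster state"
fi
```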
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,9 @@
# GraphDB AWS Terraform Module Changelog

## 1.0.0
Updated the user data scripts to allow setup of a multi-node cluster based on the `node_count` variable.
Added the ability for a node to rejoin the cluster if the raft folder is empty or missing.
Added stable network names based on AZ deployment.

## 0.1.0

3 changes: 3 additions & 0 deletions main.tf
@@ -134,4 +134,7 @@ module "graphdb" {

zone_id = module.graphdb.zone_id
zone_dns_name = var.zone_dns_name

# User data scripts
deploy_monitoring = var.deploy_monitoring
}
70 changes: 58 additions & 12 deletions modules/graphdb/templates/01_disk_management.sh.tpl
@@ -17,15 +17,15 @@ echo "# Creating/Attaching managed disks #"
echo "###########################################"

# Set common variables used throughout the script.
IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
INSTANCE_ID=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/instance-id)
AVAILABILITY_ZONE=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone)
VOLUME_ID=""
AVAILABLE_VOLUMES=()

# Search for an available EBS volume to attach to the instance. Wait one minute for a volume to become available,
# if no volume is found - create new one, attach, format and mount the volume.
for i in $(seq 1 6); do

VOLUME_ID=$(
aws --cli-connect-timeout 300 ec2 describe-volumes \
--filters "Name=status,Values=available" "Name=availability-zone,Values=$AVAILABILITY_ZONE" "Name=tag:Name,Values=${name}-graphdb-data" \
@@ -42,8 +42,20 @@ for i in $(seq 1 6); do
fi
done

if [ -z "$${VOLUME_ID:-}" ]; then
# Transforms the returned result to an AVAILABLE_VOLUMES
if [ -n "$VOLUME_ID" ]; then
yaskoo marked this conversation as resolved.
Show resolved Hide resolved
# Loop through each element in VOLUME_ID and add it to the AVAILABLE_VOLUMES
while read -r element; do
AVAILABLE_VOLUMES+=("$element")
done <<< "$VOLUME_ID"
echo "Found volumes: $${AVAILABLE_VOLUMES[@]}"
else
echo "No volumes found"
fi

# Function which creates a volume
create_volume() {
echo "Creating new volume"
VOLUME_ID=$(
aws --cli-connect-timeout 300 ec2 create-volume \
--availability-zone "$AVAILABILITY_ZONE" \
@@ -56,17 +68,51 @@ if [ -z "$${VOLUME_ID:-}" ]; then
--tag-specifications "ResourceType=volume,Tags=[{Key=Name,Value=${name}-graphdb-data}]" | \
jq -r .VolumeId
)
# Transform the returned result into the AVAILABLE_VOLUMES array
while read -r element; do
AVAILABLE_VOLUMES+=("$element")
done <<< "$VOLUME_ID"

# Wait for the volume to become available
aws --cli-connect-timeout 300 ec2 wait volume-available --volume-ids "$VOLUME_ID"
echo "Successfully created volume: $VOLUME_ID"
}

attach_volumes() {
local volume total_volumes
total_volumes=$${#AVAILABLE_VOLUMES[@]}
for ((index = 0; index < total_volumes; index++)); do
volume=$${AVAILABLE_VOLUMES[index]}
echo "Trying to attach volume: $volume"

if aws --cli-connect-timeout 300 ec2 attach-volume \
--volume-id "$volume" \
--instance-id "$INSTANCE_ID" \
--device "${device_name}"; then
echo "Volume $volume attached successfully"
break
else
echo "Failed to attach volume $volume"
echo "Will try again with the next volume"

# Check if this is the last available volume
if ((index == total_volumes - 1)); then
echo "Attempting to create a new volume..."
# Reset the AVAILABLE_VOLUMES array
AVAILABLE_VOLUMES=()
create_volume
attach_volumes # Retry attaching volumes including the newly created one
break
fi
fi
done
}

# If no available volumes were found, create a new one
if [[ $${#AVAILABLE_VOLUMES[@]} -eq 0 ]]; then
create_volume
fi

attach_volumes

# Handle the EBS volume used for the GraphDB data directory.
# beware, here be dragons...
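Taken together, the flow added in this script is: discover available volumes, try attaching each in turn, and mint a fresh volume only when none exist or every candidate fails to attach. A condensed, illustrative sketch of that control flow, in plain bash (the template itself escapes shell expansion as `$${...}`), with `try_attach` as a hypothetical helper standing in for the real `aws ec2 attach-volume` call:

```bash
# Hypothetical condensed version of the attach/create fallback above.
try_attach() {
  aws ec2 attach-volume --volume-id "$1" --instance-id "$INSTANCE_ID" --device "$DEVICE"
}

attach_with_fallback() {
  local v
  for v in "${AVAILABLE_VOLUMES[@]}"; do
    try_attach "$v" && return 0  # first successful attach wins
  done
  AVAILABLE_VOLUMES=()           # mirror the reset in the script above
  create_volume                  # appends the new volume ID to AVAILABLE_VOLUMES
  try_attach "${AVAILABLE_VOLUMES[0]}"
}
```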
66 changes: 52 additions & 14 deletions modules/graphdb/templates/02_dns_provisioning.sh.tpl
@@ -15,23 +15,61 @@ echo "########################"
echo "# DNS Provisioning #"
echo "########################"

IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
LOCAL_IPv4=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/local-ipv4)
AVAILABILITY_ZONE_ID=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone-id)
NODE_DNS_PATH="/var/opt/graphdb/node_dns"
# Extract only the numeric part from the AVAILABILITY_ZONE_ID
AVAILABILITY_ZONE_ID_NUMBER="$${AVAILABILITY_ZONE_ID//*-az}"
NODE_NUMBER=0

# Handles instance reboots or recreations when the node has already been part of a cluster
if [ -f $NODE_DNS_PATH ]; then
echo "Found $NODE_DNS_PATH"
NODE_DNS_RECORD=$(cat $NODE_DNS_PATH)

# Update the NODE_DNS record on file with the new IP.
echo "Updating IP address for $NODE_DNS_RECORD"

aws --cli-connect-timeout 300 route53 change-resource-record-sets \
--hosted-zone-id "${zone_id}" \
--change-batch '{"Changes": [{"Action": "UPSERT","ResourceRecordSet": {"Name": "'"$NODE_DNS_RECORD"'","Type": "A","TTL": 60,"ResourceRecords": [{"Value": "'"$LOCAL_IPv4"'"}]}}]}'

echo "DNS record for $NODE_DNS has been created"
hostnamectl set-hostname "$NODE_DNS_RECORD"
echo "DNS record for $NODE_DNS_RECORD has been updated"
else
echo "$NODE_DNS_PATH does not exist. New DNS record will be created."

hostnamectl set-hostname "$NODE_DNS"
while true; do
# Build the node name from the node number and the AZ number
NODE_NAME="node-$NODE_NUMBER-zone-$AVAILABILITY_ZONE_ID_NUMBER"

# Check if the Route 53 record exists for the node name
DNS_RECORD_TAKEN=$(aws route53 list-resource-record-sets --hosted-zone-id ${zone_id} --query "ResourceRecordSets[?contains(Name, '$NODE_NAME')]" --output text)

if [ "$DNS_RECORD_TAKEN" ]; then
echo "Record $NODE_NAME is taken in hosted zone ${zone_id}"
# Increment node number for the next iteration
NODE_NUMBER=$((NODE_NUMBER + 1))
else
echo "Record $NODE_NAME does not exist in hosted zone ${zone_id}"
# Forms the full DNS address for the current node
NODE_DNS_RECORD="$NODE_NAME.${zone_dns_name}"

# Attempt to create the DNS record
if aws --cli-connect-timeout 300 route53 change-resource-record-sets \
--hosted-zone-id "${zone_id}" \
--change-batch '{"Changes": [{"Action": "CREATE","ResourceRecordSet": {"Name": "'"$NODE_DNS_RECORD"'","Type": "A","TTL": 60,"ResourceRecords": [{"Value": "'"$LOCAL_IPv4"'"}]}}]}' &>/dev/null; then
echo "DNS record for $NODE_DNS_RECORD has been created"
hostnamectl set-hostname "$NODE_DNS_RECORD"
echo "$NODE_DNS_RECORD" >/var/opt/graphdb/node_dns
break # Exit the loop once a free node name is found
else
echo "Creating DNS record failed for $NODE_NAME, retrying with next available name"
# Retry with the next node number
NODE_NUMBER=$((NODE_NUMBER + 1))
fi
fi
done

fi
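For illustration, this is how the stable naming scheme above plays out for the first node in an AZ whose ID is `use1-az2` (example values; plain bash here, whereas the template escapes shell expansion as `$${...}`):

```bash
AVAILABILITY_ZONE_ID="use1-az2"                              # example AZ ID from IMDS
AVAILABILITY_ZONE_ID_NUMBER="${AVAILABILITY_ZONE_ID//*-az}"  # strips through "-az" -> "2"
NODE_NUMBER=0
NODE_NAME="node-$NODE_NUMBER-zone-$AVAILABILITY_ZONE_ID_NUMBER"
echo "$NODE_NAME"                      # -> node-0-zone-2
echo "$NODE_NAME.graphdb.example.com"  # full record if zone_dns_name were graphdb.example.com
```

If `node-0-zone-2` already has a record in the hosted zone, the loop increments `NODE_NUMBER` and tries `node-1-zone-2`, and so on; that is what keeps the names stable across instance replacements.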
27 changes: 13 additions & 14 deletions modules/graphdb/templates/03_gdb_conf_overrides.sh.tpl
@@ -17,32 +17,33 @@ echo "#######################################"
echo "# GraphDB configuration overrides #"
echo "#######################################"

LB_DNS_RECORD=${graphdb_lb_dns_name}
NODE_DNS_RECORD=$(cat /var/opt/graphdb/node_dns)

# Get and store the GraphDB license
aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/license" --with-decryption | \
jq -r .Parameter.Value | \
base64 -d > /etc/graphdb/graphdb.license

# Get the cluster token
GRAPHDB_CLUSTER_TOKEN="$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/cluster_token" --with-decryption | jq -r .Parameter.Value | base64 -d)"
# Get the list of SSM parameters under "/${name}/graphdb/"
SSM_PARAMETERS=$(aws ssm describe-parameters --cli-connect-timeout 300 --region ${region} --query "Parameters[?starts_with(Name, '/${name}/graphdb/')].Name" --output text)

cat << EOF > /etc/graphdb/graphdb.properties
graphdb.auth.token.secret=$GRAPHDB_CLUSTER_TOKEN
graphdb.connector.port=7201
graphdb.external-url=http://$${NODE_DNS_RECORD}:7201
graphdb.rpc.address=$${NODE_DNS_RECORD}:7301
EOF

cat << EOF > /etc/graphdb-cluster-proxy/graphdb.properties
graphdb.auth.token.secret=$GRAPHDB_CLUSTER_TOKEN
graphdb.connector.port=7200
graphdb.external-url=http://$${LB_DNS_RECORD}
graphdb.vhosts=http://$${LB_DNS_RECORD},http://$${NODE_DNS_RECORD}:7200
graphdb.rpc.address=$${NODE_DNS_RECORD}:7300
graphdb.proxy.hosts=$${NODE_DNS_RECORD}:7301
EOF

mkdir -p /etc/systemd/system/graphdb.service.d/
@@ -60,16 +61,14 @@ cat << EOF > /etc/systemd/system/graphdb.service.d/overrides.conf
Environment="GDB_HEAP_SIZE=$${JVM_MAX_MEMORY}g"
EOF

# Appends configuration overrides to graphdb.properties
if [[ $parameters == *"/${name}/graphdb/graphdb_properties"* ]]; then
if [[ $SSM_PARAMETERS == *"/${name}/graphdb/graphdb_properties"* ]]; then
aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/graphdb_properties" --with-decryption | jq -r .Parameter.Value | \
base64 -d >> /etc/graphdb/graphdb.properties
fi

# Appends environment overrides to GDB_JAVA_OPTS
if [[ $parameters == *"/${name}/graphdb/graphdb_java_options"* ]]; then
if [[ $SSM_PARAMETERS == *"/${name}/graphdb/graphdb_java_options"* ]]; then
extra_graphdb_java_options="$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/graphdb_java_options" --with-decryption | jq -r .Parameter.Value)"
(
source /etc/graphdb/graphdb.env
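As a concrete illustration, with the example node name from the DNS script, the first heredoc above would render `/etc/graphdb/graphdb.properties` roughly as follows (the token value comes decoded from SSM; all values here are examples):

```properties
graphdb.auth.token.secret=<decoded-cluster-token>
graphdb.connector.port=7201
graphdb.external-url=http://node-0-zone-2.graphdb.example.com:7201
graphdb.rpc.address=node-0-zone-2.graphdb.example.com:7301
```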
2 changes: 1 addition & 1 deletion modules/graphdb/templates/04_gdb_backup_conf.sh.tpl
@@ -66,7 +66,7 @@ rotate_backups
EOF

chmod +x /usr/bin/graphdb_backup
echo "${backup_schedule} graphdb /usr/bin/graphdb_backup" > /etc/cron.d/graphdb_backup
echo "${backup_schedule} graphdb /usr/bin/graphdb_backup" >/etc/cron.d/graphdb_backup

echo "Cron job created"
else
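For reference, with a hypothetical `backup_schedule` of `0 2 * * *`, the cron line above renders as follows, running the backup daily at 02:00 as the `graphdb` user:

```
# /etc/cron.d/graphdb_backup (rendered with backup_schedule = "0 2 * * *", an example value)
0 2 * * * graphdb /usr/bin/graphdb_backup
```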
20 changes: 8 additions & 12 deletions modules/graphdb/templates/06_cloudwatch_setup.sh.tpl
@@ -12,26 +12,22 @@ echo "#################################"
echo "# Cloudwatch Provisioning #"
echo "#################################"

if [ ${deploy_monitoring} == "true" ]; then
GRAPHDB_ADMIN_PASSWORD=$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/admin_password" --with-decryption --query "Parameter.Value" --output text | base64 -d)
# Parse the CW Agent Config from SSM Parameter store and put it in a file
CWAGENT_CONFIG=$(aws ssm get-parameter --name "/${name}/graphdb/CWAgent/Config" --query "Parameter.Value" --output text)
echo "$CWAGENT_CONFIG" >/etc/graphdb/cloudwatch-agent-config.json

tmp=$(mktemp)
jq '.logs.metrics_collected.prometheus.log_group_name = "${name}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json >"$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
jq '.logs.metrics_collected.prometheus.emf_processor.metric_namespace = "${name}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json >"$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
cat /etc/prometheus/prometheus.yaml | yq '.scrape_configs[].static_configs[].targets = ["localhost:7201"]' >"$tmp" && mv "$tmp" /etc/prometheus/prometheus.yaml
cat /etc/prometheus/prometheus.yaml | yq '.scrape_configs[].basic_auth.username = "admin"' | yq ".scrape_configs[].basic_auth.password = \"$${GRAPHDB_ADMIN_PASSWORD}\"" >"$tmp" && mv "$tmp" /etc/prometheus/prometheus.yaml

amazon-cloudwatch-agent-ctl -a start
amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/etc/graphdb/cloudwatch-agent-config.json

else
echo "/${name}/graphdb/CWAgent/Config was not found! Check the deployment..."
echo "Monitoring module was not deployed, skipping provisioning..."
fi
