GDB-9900 Cluster rejoin #33

Merged
merged 1 commit on Apr 9, 2024
GDB-9900 Cluster rejoin
Added a check: if only 1 node is deployed, don't set up DNS records or attempt to create a cluster
Variable renaming and tidying up
Refactored the user data scripts
simonzhekoff authored and viktor-ribchev committed Apr 9, 2024
commit 0d4dc1fa80924e784c62057ff21976d57d21b175
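The rejoin behavior described in the commit message hinges on whether the node still has Raft state on its data volume. A minimal sketch of that condition, assuming a hypothetical Raft directory path (the actual path and rejoin call are not part of this diff):

```bash
#!/usr/bin/env bash
# Illustrative only: RAFT_DIR is an assumed location, not taken from this PR.
RAFT_DIR="/var/opt/graphdb/node/data/cluster"

if [ ! -d "$RAFT_DIR" ] || [ -z "$(ls -A "$RAFT_DIR" 2>/dev/null)" ]; then
  # Empty or missing Raft folder: treat the node as a fresh member and rejoin.
  echo "Raft folder missing or empty - node will attempt to rejoin the cluster"
else
  # Existing Raft state: start normally and let the node catch up on its own.
  echo "Raft folder present - node starts with its existing cluster state"
fi
```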
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,9 @@
# GraphDB AWS Terraform Module Changelog

## 1.0.0
Updated the user data scripts to allow setup of a multi-node cluster based on the `node_count` variable.
Added the ability for a node to rejoin the cluster if the raft folder is empty or missing.
Added stable network names based on AZ deployment.

## 0.1.0

3 changes: 3 additions & 0 deletions main.tf
@@ -134,4 +134,7 @@ module "graphdb" {

zone_id = module.graphdb.zone_id
zone_dns_name = var.zone_dns_name

# User data scripts
deploy_monitoring = var.deploy_monitoring
}
70 changes: 58 additions & 12 deletions modules/graphdb/templates/01_disk_management.sh.tpl
@@ -17,15 +17,15 @@ echo "# Creating/Attaching managed disks #"
echo "###########################################"

# Set common variables used throughout the script.
IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
INSTANCE_ID=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/instance-id)
AVAILABILITY_ZONE=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone)
VOLUME_ID=""
AVAILABLE_VOLUMES=()

# Search for an available EBS volume to attach to the instance. Wait one minute for a volume to become available,
# if no volume is found - create new one, attach, format and mount the volume.
for i in $(seq 1 6); do

VOLUME_ID=$(
aws --cli-connect-timeout 300 ec2 describe-volumes \
--filters "Name=status,Values=available" "Name=availability-zone,Values=$AVAILABILITY_ZONE" "Name=tag:Name,Values=${name}-graphdb-data" \
@@ -42,8 +42,20 @@ for i in $(seq 1 6); do
fi
done

if [ -z "$${VOLUME_ID:-}" ]; then
# Transforms the returned result to an AVAILABLE_VOLUMES
if [ -n "$VOLUME_ID" ]; then
yaskoo marked this conversation as resolved.
Show resolved Hide resolved
# Loop through each element in VOLUME_ID and add it to the AVAILABLE_VOLUMES
while read -r element; do
AVAILABLE_VOLUMES+=("$element")
done <<< "$VOLUME_ID"
echo "Found volumes: $${AVAILABLE_VOLUMES[@]}"
else
echo "No volumes found"
fi

# Function which creates a volume
create_volume() {
echo "Creating new volume"
VOLUME_ID=$(
aws --cli-connect-timeout 300 ec2 create-volume \
--availability-zone "$AVAILABILITY_ZONE" \
@@ -56,17 +68,51 @@ if [ -z "$${VOLUME_ID:-}" ]; then
--tag-specifications "ResourceType=volume,Tags=[{Key=Name,Value=${name}-graphdb-data}]" | \
jq -r .VolumeId
)
# Transform the returned result into the AVAILABLE_VOLUMES array
while read -r element; do
AVAILABLE_VOLUMES+=("$element")
done <<< "$VOLUME_ID"

# Wait for the volume to become available
aws --cli-connect-timeout 300 ec2 wait volume-available --volume-ids "$VOLUME_ID"
echo "Successfully created volume: $VOLUME_ID"
}

attach_volumes() {
local volume total_volumes
total_volumes=$${#AVAILABLE_VOLUMES[@]}
for ((index = 0; index < total_volumes; index++)); do
volume=$${AVAILABLE_VOLUMES[index]}
echo "Trying to attach volume: $volume"

if aws --cli-connect-timeout 300 ec2 attach-volume \
--volume-id "$volume" \
--instance-id "$INSTANCE_ID" \
--device "${device_name}"; then
echo "Volume $volume attached successfully"
break
else
echo "Failed to attach volume $volume"
echo "Will try again with the next volume"

# Check if this is the last available volume
if ((index == total_volumes - 1)); then
echo "Attempting to create a new volume..."
# Reset the AVAILABLE_VOLUMES array
AVAILABLE_VOLUMES=()
create_volume
attach_volumes # Retry attaching volumes including the newly created one
break
fi
fi
done
}

# If no available volumes were found, create a new one
if [[ $${#AVAILABLE_VOLUMES[@]} -eq 0 ]]; then
create_volume
fi

attach_volumes

# Handle the EBS volume used for the GraphDB data directory.
# beware, here be dragons...
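Taken together, the flow added in this script is: discover available volumes, try attaching each in turn, and mint a fresh volume only when none exist or every candidate fails to attach. A condensed, illustrative sketch of that control flow, in plain bash (the template itself escapes shell expansion as `$${...}`), with `try_attach` as a hypothetical helper standing in for the real `aws ec2 attach-volume` call:

```bash
# Hypothetical condensed version of the attach/create fallback above.
try_attach() {
  aws ec2 attach-volume --volume-id "$1" --instance-id "$INSTANCE_ID" --device "$DEVICE"
}

attach_with_fallback() {
  local v
  for v in "${AVAILABLE_VOLUMES[@]}"; do
    try_attach "$v" && return 0  # first successful attach wins
  done
  AVAILABLE_VOLUMES=()           # mirror the reset in the script above
  create_volume                  # appends the new volume ID to AVAILABLE_VOLUMES
  try_attach "${AVAILABLE_VOLUMES[0]}"
}
```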
66 changes: 52 additions & 14 deletions modules/graphdb/templates/02_dns_provisioning.sh.tpl
@@ -15,23 +15,61 @@ echo "########################"
echo "# DNS Provisioning #"
echo "########################"

IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
LOCAL_IPv4=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/local-ipv4)
AVAILABILITY_ZONE_ID=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone-id)
NODE_DNS_PATH="/var/opt/graphdb/node_dns"
# Extract only the numeric part from the AVAILABILITY_ZONE_ID
AVAILABILITY_ZONE_ID_NUMBER="$${AVAILABILITY_ZONE_ID//*-az}"
NODE_NUMBER=0

# Handles instance reboots or recreations when the node has already been part of a cluster
if [ -f $NODE_DNS_PATH ]; then
echo "Found $NODE_DNS_PATH"
NODE_DNS_RECORD=$(cat $NODE_DNS_PATH)

# Update the NODE_DNS record on file with the new IP.
echo "Updating IP address for $NODE_DNS_RECORD"

aws --cli-connect-timeout 300 route53 change-resource-record-sets \
--hosted-zone-id "${zone_id}" \
--change-batch '{"Changes": [{"Action": "UPSERT","ResourceRecordSet": {"Name": "'"$NODE_DNS_RECORD"'","Type": "A","TTL": 60,"ResourceRecords": [{"Value": "'"$LOCAL_IPv4"'"}]}}]}'

echo "DNS record for $NODE_DNS has been created"
hostnamectl set-hostname "$NODE_DNS_RECORD"
echo "DNS record for $NODE_DNS_RECORD has been updated"
else
echo "$NODE_DNS_PATH does not exist. New DNS record will be created."

hostnamectl set-hostname "$NODE_DNS"
while true; do
# Build the node name from the node number and the AZ number
NODE_NAME="node-$NODE_NUMBER-zone-$AVAILABILITY_ZONE_ID_NUMBER"

# Check if the Route 53 record exists for the node name
DNS_RECORD_TAKEN=$(aws route53 list-resource-record-sets --hosted-zone-id ${zone_id} --query "ResourceRecordSets[?contains(Name, '$NODE_NAME')]" --output text)

if [ "$DNS_RECORD_TAKEN" ]; then
echo "Record $NODE_NAME is taken in hosted zone ${zone_id}"
# Increment node number for the next iteration
NODE_NUMBER=$((NODE_NUMBER + 1))
else
echo "Record $NODE_NAME does not exist in hosted zone ${zone_id}"
# Forms the full DNS address for the current node
NODE_DNS_RECORD="$NODE_NAME.${zone_dns_name}"

# Attempt to create the DNS record
if aws --cli-connect-timeout 300 route53 change-resource-record-sets \
--hosted-zone-id "${zone_id}" \
--change-batch '{"Changes": [{"Action": "CREATE","ResourceRecordSet": {"Name": "'"$NODE_DNS_RECORD"'","Type": "A","TTL": 60,"ResourceRecords": [{"Value": "'"$LOCAL_IPv4"'"}]}}]}' &>/dev/null; then
echo "DNS record for $NODE_DNS_RECORD has been created"
hostnamectl set-hostname "$NODE_DNS_RECORD"
echo "$NODE_DNS_RECORD" >/var/opt/graphdb/node_dns
break # Exit the loop once a free node name is found
else
echo "Creating DNS record failed for $NODE_NAME, retrying with next available name"
# Retry with the next node number
NODE_NUMBER=$((NODE_NUMBER + 1))
fi
fi
done

fi
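For illustration, this is how the stable naming scheme above plays out for the first node in an AZ whose ID is `use1-az2` (example values; plain bash here, whereas the template escapes shell expansion as `$${...}`):

```bash
AVAILABILITY_ZONE_ID="use1-az2"                              # example AZ ID from IMDS
AVAILABILITY_ZONE_ID_NUMBER="${AVAILABILITY_ZONE_ID//*-az}"  # strips through "-az" -> "2"
NODE_NUMBER=0
NODE_NAME="node-$NODE_NUMBER-zone-$AVAILABILITY_ZONE_ID_NUMBER"
echo "$NODE_NAME"                      # -> node-0-zone-2
echo "$NODE_NAME.graphdb.example.com"  # full record if zone_dns_name were graphdb.example.com
```

If `node-0-zone-2` already has a record in the hosted zone, the loop increments `NODE_NUMBER` and tries `node-1-zone-2`, and so on; that is what keeps the names stable across instance replacements.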
27 changes: 13 additions & 14 deletions modules/graphdb/templates/03_gdb_conf_overrides.sh.tpl
@@ -17,32 +17,33 @@ echo "#######################################"
echo "# GraphDB configuration overrides #"
echo "#######################################"

LB_DNS_RECORD=${graphdb_lb_dns_name}
NODE_DNS_RECORD=$(cat /var/opt/graphdb/node_dns)

# Get and store the GraphDB license
aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/license" --with-decryption | \
jq -r .Parameter.Value | \
base64 -d > /etc/graphdb/graphdb.license

# Get the cluster token
GRAPHDB_CLUSTER_TOKEN="$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/cluster_token" --with-decryption | jq -r .Parameter.Value | base64 -d)"
# Get the list of SSM parameters under "/${name}/graphdb/"
SSM_PARAMETERS=$(aws ssm describe-parameters --cli-connect-timeout 300 --region ${region} --query "Parameters[?starts_with(Name, '/${name}/graphdb/')].Name" --output text)

cat << EOF > /etc/graphdb/graphdb.properties
graphdb.auth.token.secret=$GRAPHDB_CLUSTER_TOKEN
graphdb.connector.port=7201
graphdb.external-url=http://$${NODE_DNS_RECORD}:7201
graphdb.rpc.address=$${NODE_DNS_RECORD}:7301
EOF

cat << EOF > /etc/graphdb-cluster-proxy/graphdb.properties
graphdb.auth.token.secret=$GRAPHDB_CLUSTER_TOKEN
graphdb.connector.port=7200
graphdb.external-url=http://$${LB_DNS_RECORD}
graphdb.vhosts=http://$${LB_DNS_RECORD},http://$${NODE_DNS_RECORD}:7200
graphdb.rpc.address=$${NODE_DNS_RECORD}:7300
graphdb.proxy.hosts=$${NODE_DNS_RECORD}:7301
EOF

mkdir -p /etc/systemd/system/graphdb.service.d/
@@ -60,16 +61,14 @@ cat << EOF > /etc/systemd/system/graphdb.service.d/overrides.conf
Environment="GDB_HEAP_SIZE=$${JVM_MAX_MEMORY}g"
EOF

# Appends configuration overrides to graphdb.properties
if [[ $parameters == *"/${name}/graphdb/graphdb_properties"* ]]; then
if [[ $SSM_PARAMETERS == *"/${name}/graphdb/graphdb_properties"* ]]; then
aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/graphdb_properties" --with-decryption | jq -r .Parameter.Value | \
base64 -d >> /etc/graphdb/graphdb.properties
fi

# Appends environment overrides to GDB_JAVA_OPTS
if [[ $parameters == *"/${name}/graphdb/graphdb_java_options"* ]]; then
if [[ $SSM_PARAMETERS == *"/${name}/graphdb/graphdb_java_options"* ]]; then
extra_graphdb_java_options="$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/graphdb_java_options" --with-decryption | jq -r .Parameter.Value)"
(
source /etc/graphdb/graphdb.env
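As a concrete illustration, with the example node name from the DNS script, the first heredoc above would render `/etc/graphdb/graphdb.properties` roughly as follows (the token value comes decoded from SSM; all values here are examples):

```properties
graphdb.auth.token.secret=<decoded-cluster-token>
graphdb.connector.port=7201
graphdb.external-url=http://node-0-zone-2.graphdb.example.com:7201
graphdb.rpc.address=node-0-zone-2.graphdb.example.com:7301
```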
2 changes: 1 addition & 1 deletion modules/graphdb/templates/04_gdb_backup_conf.sh.tpl
@@ -66,7 +66,7 @@ rotate_backups
EOF

chmod +x /usr/bin/graphdb_backup
echo "${backup_schedule} graphdb /usr/bin/graphdb_backup" > /etc/cron.d/graphdb_backup
echo "${backup_schedule} graphdb /usr/bin/graphdb_backup" >/etc/cron.d/graphdb_backup

echo "Cron job created"
else
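For reference, with a hypothetical `backup_schedule` of `0 2 * * *`, the cron line above renders as follows, running the backup daily at 02:00 as the `graphdb` user:

```
# /etc/cron.d/graphdb_backup (rendered with backup_schedule = "0 2 * * *", an example value)
0 2 * * * graphdb /usr/bin/graphdb_backup
```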
20 changes: 8 additions & 12 deletions modules/graphdb/templates/06_cloudwatch_setup.sh.tpl
@@ -12,26 +12,22 @@ echo "#################################"
echo "# Cloudwatch Provisioning #"
echo "#################################"

if [ ${deploy_monitoring} == "true" ]; then
GRAPHDB_ADMIN_PASSWORD=$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/admin_password" --with-decryption --query "Parameter.Value" --output text | base64 -d)
# Parse the CW Agent Config from SSM Parameter store and put it in a file
CWAGENT_CONFIG=$(aws ssm get-parameter --name "/${name}/graphdb/CWAgent/Config" --query "Parameter.Value" --output text)
echo "$CWAGENT_CONFIG" >/etc/graphdb/cloudwatch-agent-config.json

tmp=$(mktemp)
jq '.logs.metrics_collected.prometheus.log_group_name = "${name}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json >"$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
jq '.logs.metrics_collected.prometheus.emf_processor.metric_namespace = "${name}-graphdb"' /etc/graphdb/cloudwatch-agent-config.json >"$tmp" && mv "$tmp" /etc/graphdb/cloudwatch-agent-config.json
cat /etc/prometheus/prometheus.yaml | yq '.scrape_configs[].static_configs[].targets = ["localhost:7201"]' >"$tmp" && mv "$tmp" /etc/prometheus/prometheus.yaml
cat /etc/prometheus/prometheus.yaml | yq '.scrape_configs[].basic_auth.username = "admin"' | yq ".scrape_configs[].basic_auth.password = \"$${GRAPHDB_ADMIN_PASSWORD}\"" >"$tmp" && mv "$tmp" /etc/prometheus/prometheus.yaml

amazon-cloudwatch-agent-ctl -a start
amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/etc/graphdb/cloudwatch-agent-config.json

else
echo "/${name}/graphdb/CWAgent/Config was not found! Check the deployment..."
echo "Monitoring module was not deployed, skipping provisioning..."
fi
