Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(provider): add cert expiration liveness check #260

Merged
merged 1 commit into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/akash-provider/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ type: application
# Versions are expected to follow Semantic Versioning (https://semver.org/)

# Major version bit highlights the mainnet release (e.g. mainnet4 = 4.x.x, mainnet5 = 5.x.x, ...)
version: 9.0.7
version: 9.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
48 changes: 48 additions & 0 deletions charts/akash-provider/scripts/create_provider.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# Filename: create_provider.sh

set -x

##
# Create Provider
#
# Writes the provider manifest, provider.yaml, into the current working
# directory. The double-curly placeholders below are Helm template expressions
# (domain, attributes, email, website, from); this script is passed through
# Helm's tpl when it is placed into the scripts ConfigMap.
# The manifest is consumed further down by
# `provider-services tx provider create|update provider.yaml`.
# NOTE(review): the heredoc delimiter is unquoted, so after Helm rendering the
# shell still applies parameter/command expansion to the text. Values that
# contain `$` or backticks would be altered -- confirm chart values never do.
##

cat <<EOT > provider.yaml
host: https://provider.{{ .Values.domain }}:8443
attributes:
{{- range $key, $val := .Values.attributes }}
- key: {{ $val.key }}
value: {{ $val.value }}
{{- end }}
info:
email: {{ .Values.email }}
website: {{ .Values.website }}
owner: {{ .Values.from }}
EOT

# Figure the provider address in case the user passes `--from=<key_name>` instead of `--from=<akash1...>` address.
# AKASH_FROM is quoted so a key name containing whitespace or glob characters
# reaches provider-services as a single, unmodified argument.
PROVIDER_ADDRESS="$(provider-services keys show "$AKASH_FROM" -a)"
if [[ -z "$PROVIDER_ADDRESS" ]]; then
  # Nothing below can work without an address, so stop here.
  echo "PROVIDER_ADDRESS variable is empty. Something went wrong"
  exit 1
fi

# Look the provider up on chain. Any non-zero status from the query is treated
# as "not registered yet" and triggers the create transaction with the
# manifest written above.
if ! provider-services query provider get "$PROVIDER_ADDRESS" -o json; then
  # The address variable was previously misspelled here and always printed empty.
  echo "Could not find provider: $PROVIDER_ADDRESS on the blockchain when querying Akash RPC node: $AKASH_NODE"
  echo "Attempting to create a new provider ..."
  provider-services tx provider create provider.yaml
fi

##
# Update Provider
#
# Compares the local manifest with what the chain currently reports and
# submits an update transaction when the two differ.
##

echo "Checking whether provider.yaml needs to be updated on the chain ..."
# Left side:  provider.yaml run through awk and then sorted. The awk program
#             rewrites the leading whitespace of the lines that follow
#             `attributes:` and passes every other line through unchanged.
#             NOTE(review): presumably this aligns the attribute list with the
#             layout of the `-o text` output -- confirm against real output.
# Right side: the on-chain provider record as text, with double quotes removed
#             and `host_uri:` renamed to `host:`, then sorted.
# Both sides are sorted, so only the set of lines matters, not their order.
diff --color -Nur <(cat provider.yaml | awk '/attributes:/{print;flag=1;next}/^ - key:/{if(flag)sub(" ","");print;next}flag&&/^ /{sub(" "," ");print;next}{flag=0;print}' | sort) <(provider-services query provider get $PROVIDER_ADDRESS -o text | sed -e 's/"//g' -e 's/host_uri:/host:/g' | sort)
# Any non-zero status from diff leads to an update: a difference was found, or
# diff itself reported trouble.
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Updating provider info in the blockchain in 10 seconds ..."
# Fixed 10 second pause before the update transaction is sent.
sleep 10s
provider-services tx provider update provider.yaml
fi
136 changes: 9 additions & 127 deletions charts/akash-provider/scripts/init.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
# Filename: init.sh

if [[ $AKASH_DEBUG == "true" ]]; then sleep 5000; fi

Expand All @@ -10,135 +11,16 @@ set -x
cat "$AKASH_BOOT_KEYS/key-pass.txt" | { cat ; echo ; } | provider-services --home="$AKASH_HOME" keys import --keyring-backend="$AKASH_KEYRING_BACKEND" "$AKASH_FROM" "$AKASH_BOOT_KEYS/key.txt"

##
# Check the Akash Node is working
# Wait for RPC
##
apt update && apt -yqq install curl jq bc netcat ca-certificates

# fail fast should there be a problem installing curl, jq, nc packages
type curl || exit 1
type jq || exit 1
type nc || exit 1

solo_ip=$(echo $AKASH_NODE | cut -d":" -f2 | cut -d"/" -f3)
port=$(echo $AKASH_NODE | cut -d":" -f3 | cut -d"/" -f1)
if [[ $AKASH_NODE != "http://akash-node-1:26657" ]]; then
nc -z -v -w5 $solo_ip $port
fi
until [[ $(curl -s $AKASH_NODE/status | jq -r .result.sync_info.catching_up) == "false" ]]; do sleep 15; echo "Akash node not ready. Retrying"; done

# Check Akash RPC node isn't running behind too much and abort if it does.
DATE_AKASH=$(curl -s $AKASH_NODE/status | jq -r '.result.sync_info.latest_block_time')
TS_AKASH=$(date +%s --date "$DATE_AKASH")
TS=$(date +%s)
DIFF=$(echo "$TS - $TS_AKASH" | bc)
if [[ "$DIFF" -gt 30 ]]; then
echo "Akash RPC $AKASH_NODE is running $DIFF seconds behind."
echo "ACTION: Make sure your system time in synchronized and/or check your Akash RPC node."
exit 1
elif [[ "$DIFF" -lt -30 ]]; then
echo "Akash RPC $AKASH_NODE is running $DIFF seconds ahead."
echo "ACTION: Make sure your system time in synchronized and/or check your Akash RPC node."
exit 1
else
echo "Last block Akash RPC $AKASH_NODE seen was $DIFF seconds ago => OK"
fi
/scripts/wait_for_rpc.sh

##
# Create Provider
# Create/Update Provider
##
/scripts/create_provider.sh

cat <<EOT > provider.yaml
host: https://provider.{{ .Values.domain }}:8443
attributes:
{{- range $key, $val := .Values.attributes }}
- key: {{ $val.key }}
value: {{ $val.value }}
{{- end }}
info:
email: {{ .Values.email }}
website: {{ .Values.website }}
owner: {{ .Values.from }}
EOT

# Figure the provider address in case the user passes `--from=<key_name>` instead of `--from=<akash1...>` address.
PROVIDER_ADDRESS="$(provider-services keys show $AKASH_FROM -a)"
if [[ -z "$PROVIDER_ADDRESS" ]]; then
echo "PROVIDER_ADDRESS variable is empty. Something went wrong"
exit 1
fi

provider-services query provider get $PROVIDER_ADDRESS -o json
if [[ $? -ne 0 ]]; then
echo "Could not find provider: $PROVIDER_ADDRES on the blockchain when querying Akash RPC node: $AKASH_NODE"
echo "Attempting to create a new provider ..."
provider-services tx provider create provider.yaml
fi

echo "Checking whether provider.yaml needs to be updated on the chain ..."
diff --color -Nur <(cat provider.yaml | awk '/attributes:/{print;flag=1;next}/^ - key:/{if(flag)sub(" ","");print;next}flag&&/^ /{sub(" "," ");print;next}{flag=0;print}' | sort) <(provider-services query provider get $PROVIDER_ADDRESS -o text | sed -e 's/"//g' -e 's/host_uri:/host:/g' | sort)
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Updating provider info in the blockchain in 10 seconds ..."
sleep 10s
provider-services tx provider update provider.yaml
fi

CERT_SYMLINK="${AKASH_HOME}/${PROVIDER_ADDRESS}.pem"
CERT_REAL_PATH="/config/provider.pem"
rm -vf "$CERT_SYMLINK"
# provider cert is coming from the configmap
ln -sv "$CERT_REAL_PATH" "$CERT_SYMLINK"
# 0 = yes; otherwise do not (re-)generate new provider certificate, unless
GEN_NEW_CERT=1

# Check whether the certificate is present and valid on the blockchain
if [[ -f "${CERT_REAL_PATH}" ]]; then
LOCAL_CERT_SN="$(cat "${CERT_REAL_PATH}" | openssl x509 -serial -noout | cut -d'=' -f2)"
LOCAL_CERT_SN_DECIMAL=$(echo "obase=10; ibase=16; $LOCAL_CERT_SN" | bc)
REMOTE_CERT_STATUS="$(AKASH_OUTPUT=json provider-services query cert list --owner $PROVIDER_ADDRESS --state valid --serial $LOCAL_CERT_SN_DECIMAL --reverse | jq -r '.certificates[0].certificate.state')"
echo "Provider certificate serial number: ${LOCAL_CERT_SN:-unknown}, status on chain: ${REMOTE_CERT_STATUS:-unknown}"
else
echo "${CERT_REAL_PATH} file is missing."
GEN_NEW_CERT=0
fi

if [[ -z "$LOCAL_CERT_SN" ]]; then
echo "LOCAL_CERT_SN variable is empty. Most likely ${CERT_REAL_PATH} file is empty or malformed."
GEN_NEW_CERT=0
fi

if [[ "valid" != "$REMOTE_CERT_STATUS" ]]; then
echo "No valid certificate found for provider: $PROVIDER_ADDRESS"
GEN_NEW_CERT=0

echo "It might as well be that the current certificate was expired/revoked, thus, it should be safe to delete it locally"
fi

# generate a new cert if the current one expires sooner than 7 days
AKASH_OUTPUT=json provider-services query cert list --owner $PROVIDER_ADDRESS --state valid --reverse | jq -r '.certificates[0].certificate.cert' | openssl base64 -A -d | openssl x509 -checkend 604800 -noout 2>/dev/null 1>&2
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Certificate expires in less than 7 days, so going to generate a new one."
GEN_NEW_CERT=0
fi

# check if current local cert has expired
# TODO: should probably add a healthCheck which would keep doing this every 5 minutes to bounce the pod if cert got expired
openssl x509 -checkend 604800 -noout -in "${CERT_REAL_PATH}" 2>/dev/null 1>&2
rc=$?
if [[ $rc -ne 0 ]]; then
echo "Certificate expires in less than 7 days, so going to generate a new one."
GEN_NEW_CERT=0
fi

if [[ "$GEN_NEW_CERT" -eq "0" ]]; then
echo "Removing the old certificate before generating a new one"
# It's also a good idea to delete it as otherwise, we'd have to add `--overwrite` to `provider-services tx cert generate server` command later.
rm -vf "${CERT_REAL_PATH}"

echo "Generating new provider certificate"
provider-services tx cert generate server provider.{{ .Values.domain }}

echo "Publishing new provider certificate"
provider-services tx cert publish server
fi
##
# Create/Update Provider certs
##
/scripts/refresh_provider_cert.sh
6 changes: 6 additions & 0 deletions charts/akash-provider/scripts/liveness_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
# Ensure the script fails if any part of a pipeline fails
set -o pipefail

# Provider certificate probe: report failure (exit 1) when openssl cannot
# confirm that /config/provider.pem stays valid for the next 3600 seconds.
# Only stdout of openssl is discarded; its error output remains visible.
openssl x509 -noout -checkend 3600 -in /config/provider.pem > /dev/null || {
  echo "certificate will expire in 1h, restarting"
  exit 1
}

# Provider API /status check
if ! timeout 5s curl -o /dev/null -fsk https://127.0.0.1:8443/status; then
echo "api /status check failed"
Expand Down
71 changes: 71 additions & 0 deletions charts/akash-provider/scripts/refresh_provider_cert.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# Filename: refresh_provider_cert.sh

set -x

# Figure the provider address in case the user passes `--from=<key_name>` instead of `--from=<akash1...>` address.
# AKASH_FROM is quoted so a key name containing whitespace or glob characters
# reaches provider-services as a single, unmodified argument.
PROVIDER_ADDRESS="$(provider-services keys show "$AKASH_FROM" -a)"
if [[ -z "$PROVIDER_ADDRESS" ]]; then
  # The certificate path and the chain queries below all depend on the address.
  echo "PROVIDER_ADDRESS variable is empty. Something went wrong"
  exit 1
fi

# Regeneration flag with an inverted sense: 0 requests a new provider
# certificate, any other value keeps the current one. It starts as "keep" and
# the checks that follow set it to 0 when a new certificate is needed.
GEN_NEW_CERT=1

# The certificate file lives at a fixed path (the original note says it comes
# from the configmap). Expose it under AKASH_HOME as <address>.pem, dropping
# any previous link first so that ln does not fail on an existing name.
CERT_REAL_PATH="/config/provider.pem"
CERT_SYMLINK="${AKASH_HOME}/${PROVIDER_ADDRESS}.pem"
rm -vf "${CERT_SYMLINK}"
ln -sv "${CERT_REAL_PATH}" "${CERT_SYMLINK}"

# Check whether the certificate is present and valid on the blockchain.
# Both variables start empty so that a missing or unparsable certificate can
# never be masked by values inherited from the environment.
LOCAL_CERT_SN=""
REMOTE_CERT_STATUS=""
if [[ -f "${CERT_REAL_PATH}" ]]; then
  # openssl prints "serial=<HEX>"; keep only the part after the equals sign.
  LOCAL_CERT_SN="$(openssl x509 -serial -noout -in "${CERT_REAL_PATH}" | cut -d'=' -f2)"
  if [[ -n "$LOCAL_CERT_SN" ]]; then
    # Convert the hex serial to decimal before using it as the --serial filter.
    LOCAL_CERT_SN_DECIMAL="$(echo "obase=10; ibase=16; $LOCAL_CERT_SN" | bc)"
    # Only query with a real serial: an empty, unquoted value used to make
    # --serial consume the following --reverse flag as its argument.
    REMOTE_CERT_STATUS="$(AKASH_OUTPUT=json provider-services query cert list --owner "$PROVIDER_ADDRESS" --state valid --serial "$LOCAL_CERT_SN_DECIMAL" --reverse | jq -r '.certificates[0].certificate.state')"
  fi
  echo "Provider certificate serial number: ${LOCAL_CERT_SN:-unknown}, status on chain: ${REMOTE_CERT_STATUS:-unknown}"
else
  echo "${CERT_REAL_PATH} file is missing."
  GEN_NEW_CERT=0
fi

if [[ -z "$LOCAL_CERT_SN" ]]; then
  echo "LOCAL_CERT_SN variable is empty. Most likely ${CERT_REAL_PATH} file is empty or malformed."
  GEN_NEW_CERT=0
fi

# Anything other than the literal state "valid" (including an empty result or
# jq's "null") means the chain does not vouch for the local certificate.
if [[ "valid" != "$REMOTE_CERT_STATUS" ]]; then
  echo "No valid certificate found for provider: $PROVIDER_ADDRESS"
  GEN_NEW_CERT=0

  echo "It might as well be that the current certificate was expired/revoked, thus, it should be safe to delete it locally"
fi

# generate a new cert if the current one expires sooner than 7 days (604800 s)
# The first certificate of the reversed "valid" listing is decoded from base64
# and handed to openssl. Only the status of the last pipeline stage is
# evaluated, so a failed query or an undecodable payload also ends up in the
# regenerate branch.
AKASH_OUTPUT=json provider-services query cert list --owner "$PROVIDER_ADDRESS" --state valid --reverse \
  | jq -r '.certificates[0].certificate.cert' \
  | openssl base64 -A -d \
  | openssl x509 -checkend 604800 -noout >/dev/null 2>&1
rc=$?
if [[ $rc -ne 0 ]]; then
  echo "Certificate expires in less than 7 days, so going to generate a new one."
  GEN_NEW_CERT=0
fi

# Apply the same 7-day window to the local certificate file. This only runs
# when the script runs; the periodic check while the pod is up is the
# certificate probe in liveness_checks.sh.
openssl x509 -checkend 604800 -noout -in "${CERT_REAL_PATH}" >/dev/null 2>&1
rc=$?
if [[ $rc -ne 0 ]]; then
  echo "Certificate expires in less than 7 days, so going to generate a new one."
  GEN_NEW_CERT=0
fi

# GEN_NEW_CERT uses an inverted convention: 0 means "generate a new certificate".
if [[ "$GEN_NEW_CERT" -eq "0" ]]; then
  echo "Removing the old certificate before generating a new one"
  # It's also a good idea to delete it as otherwise, we'd have to add `--overwrite` to `provider-services tx cert generate server` command later.
  rm -vf "${CERT_REAL_PATH}"

  echo "Generating new provider certificate"
  provider-services tx cert generate server provider.{{ .Values.domain }}
  rc=$?
  # The old certificate is already gone at this point, so publishing after a
  # failed generation has nothing valid to publish. Stop with an error instead.
  if [[ $rc -ne 0 ]]; then
    echo "Generating the provider certificate failed (exit code $rc), not publishing"
    exit 1
  fi

  echo "Publishing new provider certificate"
  provider-services tx cert publish server
fi
10 changes: 10 additions & 0 deletions charts/akash-provider/scripts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ type jq || exit 1
type awk || exit 1
type bc || exit 1

##
# Wait for RPC
#
# Runs the helper that installs its tools, waits until the node at AKASH_NODE
# reports catching_up == false and then checks how far the latest block time
# is from the local clock.
# NOTE(review): the helper can exit 1, but its status is not inspected on this
# line; whether that stops this script depends on shell options set outside
# the visible part of the file -- confirm.
##
/scripts/wait_for_rpc.sh

##
# Create/Update Provider certs
#
# Runs the helper that links the provider certificate into AKASH_HOME and
# generates and publishes a new certificate when the current one is missing,
# not valid on chain, or within 7 days of expiry.
##
/scripts/refresh_provider_cert.sh

# Start provider-services and monitor its output
exec provider-services run | while read line; do
echo "$line"
Expand Down
38 changes: 38 additions & 0 deletions charts/akash-provider/scripts/wait_for_rpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Filename: wait_for_rpc.sh

set -x

##
# Check the Akash Node is working
##
# Install the command line tools this script relies on.
apt update && apt -yqq install curl jq bc netcat ca-certificates

# fail fast should there be a problem installing curl, jq, nc, bc packages
# bc is included because the block-time lag below is computed with it; without
# it the difference would come out empty and be evaluated as 0.
for tool in curl jq nc bc; do
  type "$tool" || exit 1
done

# Split AKASH_NODE into host and port. The cut field numbers assume the form
# scheme://host:port; all expansions are quoted so the value is never
# word-split or glob-expanded.
solo_ip="$(cut -d":" -f2 <<<"$AKASH_NODE" | cut -d"/" -f3)"
port="$(cut -d":" -f3 <<<"$AKASH_NODE" | cut -d"/" -f1)"

# TCP reachability probe, skipped for the akash-node-1 address.
# NOTE(review): presumably that is the in-cluster node service -- confirm.
# The result of nc is not checked; it is only visible in the log output.
if [[ "$AKASH_NODE" != "http://akash-node-1:26657" ]]; then
  nc -z -v -w5 "$solo_ip" "$port"
fi

# Poll the /status endpoint every 15 seconds until the node reports that it is
# no longer catching up. An unreachable node yields an empty string and keeps
# the loop going.
until [[ "$(curl -s "$AKASH_NODE/status" | jq -r .result.sync_info.catching_up)" == "false" ]]; do
  sleep 15
  echo "Akash node not ready. Retrying"
done

# Check Akash RPC node isn't running behind too much and abort if it does.
DATE_AKASH="$(curl -s "$AKASH_NODE/status" | jq -r '.result.sync_info.latest_block_time')"

# jq -r prints the literal string "null" for a missing field, and an empty
# string is accepted by GNU date as a valid date. Reject both up front;
# otherwise the comparison below would run on a meaningless or empty value
# and could report "OK".
if [[ -z "$DATE_AKASH" || "$DATE_AKASH" == "null" ]]; then
  echo "Could not read the latest block time from Akash RPC $AKASH_NODE"
  exit 1
fi
if ! TS_AKASH="$(date +%s --date "$DATE_AKASH")"; then
  echo "Could not parse the latest block time '$DATE_AKASH' reported by Akash RPC $AKASH_NODE"
  exit 1
fi

TS="$(date +%s)"
# Positive: the last block is older than the local clock; negative: newer.
DIFF=$(( TS - TS_AKASH ))
if (( DIFF > 30 )); then
  echo "Akash RPC $AKASH_NODE is running $DIFF seconds behind."
  echo "ACTION: Make sure your system time in synchronized and/or check your Akash RPC node."
  exit 1
elif (( DIFF < -30 )); then
  echo "Akash RPC $AKASH_NODE is running $DIFF seconds ahead."
  echo "ACTION: Make sure your system time in synchronized and/or check your Akash RPC node."
  exit 1
else
  echo "Last block Akash RPC $AKASH_NODE seen was $DIFF seconds ago => OK"
fi
12 changes: 0 additions & 12 deletions charts/akash-provider/templates/configmap-boot.yaml

This file was deleted.

16 changes: 12 additions & 4 deletions charts/akash-provider/templates/configmap-scripts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,19 @@ metadata:
name: {{ include "provider.fullname" . }}-scripts
namespace: {{ .Release.Namespace }}
data:
dummy: |
dummy
init.sh: |
{{ tpl (.Files.Get "scripts/init.sh") . | indent 4 }}
wait_for_rpc.sh: |
{{ tpl (.Files.Get "scripts/wait_for_rpc.sh") . | indent 4 }}
create_provider.sh: |
{{ tpl (.Files.Get "scripts/create_provider.sh") . | indent 4 }}
refresh_provider_cert.sh: |
{{ tpl (.Files.Get "scripts/refresh_provider_cert.sh") . | indent 4 }}
run.sh: |
{{ tpl (.Files.Get "scripts/run.sh") . | indent 4 }}
liveness_checks.sh: |
{{ tpl (.Files.Get "scripts/liveness_checks.sh") . | indent 4 }}
{{- if .Values.bidpricescript }}
price_script.sh: |
{{ .Values.bidpricescript | b64dec | indent 4 }}
{{- end }}
liveness_checks.sh: |
{{ tpl (.Files.Get "scripts/liveness_checks.sh") . | indent 4 }}
Loading
Loading