NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Leading indentation and blank-line placement of the context lines in the two
modified files are inferred; confirm against the target tree before applying.
The blob ids on the "index" lines predate the review edits made below.

diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py
index 2f8273851..e6634a70a 100644
--- a/files/scripts/healdata/heal-cedar-data-ingest.py
+++ b/files/scripts/healdata/heal-cedar-data-ingest.py
@@ -169,6 +169,10 @@ def update_filter_metadata(metadata_to_update):
                     print("Metadata is already registered. Updating MDS record")
                 elif mds_res["_guid_type"] == "unregistered_discovery_metadata":
                     print("Metadata has not been registered. Registering it in MDS record")
+                else:
+                    # Any other _guid_type matches neither case above; skip this record.
+                    print(f"This metadata data record has a special GUID type \"{mds_res['_guid_type']}\" and will be skipped")
+                    continue
 
                 if "clinicaltrials_gov" in cedar_record:
                     mds_clinical_trials = cedar_record["clinicaltrials_gov"]
diff --git a/gen3/bin/kube-setup-cohort-middleware.sh b/gen3/bin/kube-setup-cohort-middleware.sh
index 477de064c..a6a024578 100644
--- a/gen3/bin/kube-setup-cohort-middleware.sh
+++ b/gen3/bin/kube-setup-cohort-middleware.sh
@@ -7,6 +7,11 @@ gen3_load "gen3/lib/kube-setup-init"
 setup_secrets() {
   gen3_log_info "Deploying secrets for cohort-middleware"
   # subshell
+  # A non-empty JENKINS_HOME is treated as a non-adminvm environment: skip secrets setup.
+  if [[ -n "$JENKINS_HOME" ]]; then
+    gen3_log_err "skipping secrets setup in non-adminvm environment"
+    return 0
+  fi
 
   (
     if ! dbcreds="$(gen3 db creds ohdsi)"; then
diff --git a/kube/services/node-monitor/application.yaml b/kube/services/node-monitor/application.yaml
new file mode 100644
index 000000000..df41c34b9
--- /dev/null
+++ b/kube/services/node-monitor/application.yaml
@@ -0,0 +1,25 @@
+# Argo CD Application that syncs the manifests under kube/services/node-monitor
+# (master branch of cloud-automation) into the "default" namespace of this cluster.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: node-monitor-application
+  namespace: argocd
+spec:
+  destination:
+    namespace: default
+    server: https://kubernetes.default.svc
+  project: default
+  source:
+    repoURL: https://github.com/uc-cdis/cloud-automation.git
+    targetRevision: master
+    path: kube/services/node-monitor
+    # Do not sync this Application manifest itself as part of the directory.
+    directory:
+      exclude: "application.yaml"
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
diff --git a/kube/services/node-monitor/auth.yaml b/kube/services/node-monitor/auth.yaml
new file mode 100644
index 000000000..72560cddc
--- /dev/null
+++ b/kube/services/node-monitor/auth.yaml
@@ -0,0 +1,31 @@
+# Identity used by the node-monitor CronJob pods.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-monitor
+  namespace: default
+---
+# Least-privilege role: the monitor script only runs `kubectl get nodes`, so
+# read access to nodes is sufficient. Binding the built-in system:node role
+# would also grant kubelet-level permissions (e.g. reading secrets).
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: node-monitor
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: node-monitor-binding
+subjects:
+  - kind: ServiceAccount
+    name: node-monitor
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: node-monitor
+  apiGroup: rbac.authorization.k8s.io
diff --git a/kube/services/node-monitor/cronjob.yaml b/kube/services/node-monitor/cronjob.yaml
new file mode 100644
index 000000000..e53046280
--- /dev/null
+++ b/kube/services/node-monitor/cronjob.yaml
@@ -0,0 +1,60 @@
+# Every 5 minutes, post a Slack alert for each node matching NODE_LABEL whose
+# age exceeds THRESHOLD_TIME seconds.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: node-monitor-cron
+  namespace: default
+spec:
+  schedule: "*/5 * * * *"
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            app: gen3job
+        spec:
+          serviceAccountName: node-monitor
+          containers:
+          - name: kubectl
+            image: quay.io/cdis/awshelper
+            env:
+              # This is the label we want to monitor, probably will never need to change
+              - name: NODE_LABEL
+                value: purpose=workflow
+              # This is 3 * 3600, or 3 hours
+              - name: THRESHOLD_TIME
+                value: "10800"
+              - name: SLACK_WEBHOOK_URL
+                valueFrom:
+                  configMapKeyRef:
+                    name: global
+                    key: slack_webhook
+
+            command: ["/bin/bash"]
+            args:
+              - "-c"
+              - |
+                #!/bin/bash
+                # Get all nodes with specific label and check their age
+                kubectl get nodes -l "$NODE_LABEL" -o json | jq -c '.items[] | {name: .metadata.name, creationTimestamp: .metadata.creationTimestamp}' | while IFS= read -r node_info; do
+                  NODE_NAME=$(echo "$node_info" | jq -r '.name')
+                  CREATION_TIMESTAMP=$(echo "$node_info" | jq -r '.creationTimestamp')
+
+                  # Convert creation timestamp to Unix Epoch time
+                  CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s)
+
+                  # Get current Unix Epoch time
+                  CURRENT_EPOCH=$(date +%s)
+
+                  # Calculate node age in seconds
+                  NODE_AGE=$(($CURRENT_EPOCH - $CREATION_EPOCH))
+
+                  # Check if node age is greater than threshold
+                  if [ "$NODE_AGE" -gt "$THRESHOLD_TIME" ]; then
+                    echo "Node $NODE_NAME has been around too long, sending an alert"
+                    # Send alert to Slack; the limit shown is THRESHOLD_TIME in whole hours (integer division)
+                    curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Node \`${NODE_NAME}\` is older than $((THRESHOLD_TIME / 3600)) hours!\"}" "$SLACK_WEBHOOK_URL"
+                  fi
+                done
+          restartPolicy: OnFailure