Skip to content

Commit

Permalink
Merge branch 'master' into feat/GPE-1122
Browse files Browse the repository at this point in the history
  • Loading branch information
EliseCastle23 authored Oct 16, 2023
2 parents b554ee8 + cac3946 commit 40f637a
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 0 deletions.
3 changes: 3 additions & 0 deletions files/scripts/healdata/heal-cedar-data-ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ def update_filter_metadata(metadata_to_update):
print("Metadata is already registered. Updating MDS record")
elif mds_res["_guid_type"] == "unregistered_discovery_metadata":
print("Metadata has not been registered. Registering it in MDS record")
else:
print(f"This metadata data record has a special GUID type \"{mds_res['_guid_type']}\" and will be skipped")
continue

if "clinicaltrials_gov" in cedar_record:
mds_clinical_trials = cedar_record["clinicaltrials_gov"]
Expand Down
4 changes: 4 additions & 0 deletions gen3/bin/kube-setup-cohort-middleware.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ gen3_load "gen3/lib/kube-setup-init"
setup_secrets() {
gen3_log_info "Deploying secrets for cohort-middleware"
# subshell
if [[ -n "$JENKINS_HOME" ]]; then
gen3_log_err "skipping secrets setup in non-adminvm environment"
return 0
fi

(
if ! dbcreds="$(gen3 db creds ohdsi)"; then
Expand Down
22 changes: 22 additions & 0 deletions kube/services/node-monitor/application.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Argo CD Application that deploys the node-monitor manifests from the
# cloud-automation repository into the local cluster.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: node-monitor-application
  namespace: argocd
spec:
  project: default
  # Where the manifests come from.
  source:
    repoURL: https://github.com/uc-cdis/cloud-automation.git
    targetRevision: master
    path: kube/services/node-monitor
    directory:
      # This Application manifest lives in the same directory as the
      # resources it deploys; exclude it so it is not synced as a resource.
      exclude: "application.yaml"
  # Where the manifests are applied.
  destination:
    server: https://kubernetes.default.svc
    namespace: default
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
18 changes: 18 additions & 0 deletions kube/services/node-monitor/auth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Identity used by the node-monitor CronJob pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-monitor
  namespace: default
---
# Grants the node-monitor ServiceAccount the permissions of the built-in
# "system:node" ClusterRole, cluster-wide.
# NOTE(review): the cron job only appears to run `kubectl get nodes`;
# system:node is considerably broader than that. Consider a dedicated
# ClusterRole limited to get/list on nodes. roleRef is immutable, so such a
# change requires deleting and recreating this binding.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-monitor-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node
subjects:
  - kind: ServiceAccount
    name: node-monitor
    namespace: default
58 changes: 58 additions & 0 deletions kube/services/node-monitor/cronjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# CronJob that runs every 5 minutes, lists the nodes carrying NODE_LABEL and
# posts a Slack warning for every node older than THRESHOLD_TIME seconds.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: node-monitor-cron
  namespace: default
spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: gen3job
        spec:
          serviceAccountName: node-monitor
          containers:
            - name: kubectl
              # NOTE(review): no tag is given, so this resolves to the
              # registry's default tag; consider pinning a tag.
              image: quay.io/cdis/awshelper
              env:
                # This is the label we want to monitor, probably will never need to change
                - name: NODE_LABEL
                  value: purpose=workflow
                # Maximum allowed node age in seconds (3 * 3600 = 3 hours)
                - name: THRESHOLD_TIME
                  value: "10800"
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    configMapKeyRef:
                      name: global
                      key: slack_webhook
              command: ["/bin/bash"]
              args:
                - "-c"
                - |
                  # Fail on unset variables, and let a kubectl/jq failure make
                  # the whole pipeline (and therefore the job) fail instead of
                  # silently reporting "no old nodes".
                  set -uo pipefail

                  # Validate the threshold before using it in arithmetic.
                  if ! [[ "$THRESHOLD_TIME" =~ ^[0-9]+$ ]]; then
                    echo "THRESHOLD_TIME must be a non-negative integer (seconds), got '$THRESHOLD_TIME'" >&2
                    exit 1
                  fi
                  # Force base 10 so a value with leading zeros is not read as octal.
                  THRESHOLD_TIME=$((10#$THRESHOLD_TIME))

                  # Human-readable threshold for the alert text, derived from
                  # THRESHOLD_TIME so the message stays correct if it is changed.
                  if (( THRESHOLD_TIME > 0 && THRESHOLD_TIME % 3600 == 0 )); then
                    THRESHOLD_DESC="$((THRESHOLD_TIME / 3600)) hours"
                  else
                    THRESHOLD_DESC="${THRESHOLD_TIME} seconds"
                  fi

                  # Current Unix Epoch time, taken once for the whole run
                  CURRENT_EPOCH=$(date +%s)

                  # Get all nodes with the specific label as "name<TAB>creationTimestamp"
                  # lines and check their age
                  kubectl get nodes -l "$NODE_LABEL" -o json \
                    | jq -r '.items[] | [.metadata.name, .metadata.creationTimestamp] | @tsv' \
                    | while IFS=$'\t' read -r NODE_NAME CREATION_TIMESTAMP; do
                        # Convert creation timestamp to Unix Epoch time
                        if ! CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s); then
                          echo "Could not parse creation timestamp '$CREATION_TIMESTAMP' of node $NODE_NAME, skipping" >&2
                          continue
                        fi
                        # Calculate node age in seconds
                        NODE_AGE=$((CURRENT_EPOCH - CREATION_EPOCH))
                        # Check if node age is greater than threshold
                        if [[ "$NODE_AGE" -gt "$THRESHOLD_TIME" ]]; then
                          echo "Node $NODE_NAME has been around too long, sending an alert"
                          # Let jq build the JSON so the text is always correctly escaped
                          PAYLOAD=$(jq -cn --arg text "WARNING: Node \`${NODE_NAME}\` is older than ${THRESHOLD_DESC}!" '{text: $text}')
                          # Send alert to Slack; --fail turns HTTP errors into a non-zero exit
                          curl --fail --silent --show-error -X POST \
                            -H 'Content-type: application/json' \
                            --data "$PAYLOAD" "$SLACK_WEBHOOK_URL" \
                            || echo "Failed to send Slack alert for node $NODE_NAME" >&2
                        fi
                      done
          restartPolicy: OnFailure

0 comments on commit 40f637a

Please sign in to comment.