diff --git a/kubernetes/ansible/roles/sunbird-monitoring/templates/additional-scrape-configs.yaml b/kubernetes/ansible/roles/sunbird-monitoring/templates/additional-scrape-configs.yaml index a28d39c9b2..0938226302 100644 --- a/kubernetes/ansible/roles/sunbird-monitoring/templates/additional-scrape-configs.yaml +++ b/kubernetes/ansible/roles/sunbird-monitoring/templates/additional-scrape-configs.yaml @@ -140,6 +140,31 @@ services: es: port: 9200 address: ["{{ groups['es'] | join('","') }}"] + postgres: + port: 5432 + address: ["{{ groups['postgres'] | join('","') }}"] + neo4j: + port: 7687 + address: ["{{ groups['learning-neo4j-node1'] | join('","') }}"] + mongo: + port: 27017 + address: ["{{ groups['mongo'] | join('","') }}"] + druid-coordinator: + port: 8081 + address: ["{{ groups['raw-coordinator'] | join('","') }}"] + druid-broker: + port: 8082 + address: ["{{ groups['raw-broker'] | join('","') }}"] + druid-middlemanager: + port: 8091 + address: ["{{ groups['raw-middlemanager'] | join('","') }}"] + druid-overlord: + port: 8090 + address: ["{{ groups['raw-overlord'] | join('","') }}"] + spark: + port: 22 + address: ["{{ groups['dp-spark-ps'] | join('","') }}"] + {% if additional_blackbox_scrapeconfigs is defined and additional_blackbox_scrapeconfigs %} {{ additional_blackbox_scrapeconfigs | to_nice_yaml(indent=2) | indent( width=2) }} {% endif %} diff --git a/kubernetes/helm_charts/monitoring/dashboards/dashboards/ed_infra_status.json b/kubernetes/helm_charts/monitoring/dashboards/dashboards/ed_infra_status.json new file mode 100644 index 0000000000..9e65086ec0 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/dashboards/dashboards/ed_infra_status.json @@ -0,0 +1,835 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 63, + "iteration": 1715597210775, + "links": [ + { + "$$hashKey": "object:116", + "icon": "dashboard", + "includeVars": true, + "tags": [], + "title": "", + "type": "link", + "url": "https://staging.sunbirded.org/grafana/d/QNgevzV7k/health?orgId=1&refresh=1m&from=1714381645257&to=1714381945258" + }, + { + "$$hashKey": "object:105", + "icon": "external link", + "tags": [ + "http://11.3.16.225/grafana/d/85a562078cdf77779eaa1add43ccec1e/kubernetes-compute-resources-namespace-pods?orgId=1&refresh=10s&var-datasource=Prometheus&var-cluster=&var-namespace=staging&from=1714378436868&to=1714382036868&viewPanel=5" + ], + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 22, + "w": 9, + "x": 0, + "y": 0 + }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "$$hashKey": "object:33", + "aggregation": "Last", + "crit": 0, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 1, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "exemplar": false, + "expr": "kube_deployment_status_replicas{namespace=\"$namespace\"} !=0", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 2 + } + ], + "title": "Pods UP", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "service": true + }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-background" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "deployment" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 9, + "y": 0 + }, + "id": 14, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "kube_deployment_status_replicas{namespace=\"$namespace\"} == 0\n", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Pods DOWN", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "service": true + }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "service" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 13, + "x": 9, + "y": 10 + }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false, + "sortBy": [ + { + "desc": false, + "displayName": "Value" + } + ] + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "probe_success", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Backend VMs health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true + }, + "indexByName": {}, + "renameByName": { + "Value": "" + } + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 16, + "interval": "1m", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [], + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:463", + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hiddenSeries": true, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + }, + { + "$$hashKey": "object:464", + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hiddenSeries": true, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "refId": "A" + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "refId": "B" + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "POD CPU Usage", + "tooltip": { + "shared": false, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": 2, + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.0.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(node_load1) * on (instance) group_left(nodename) node_uname_info{job=~\".*node-exporter\", nodename=~\"$nodename\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{nodename}}", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load avg on VM's", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "", + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": "$datasource", + "definition": "label_values(up{job=\"kube-state-metrics\"}, cluster)", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [], + "query": "label_values(up{job=\"kube-state-metrics\"}, cluster)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "staging", + "value": "staging" + }, + "datasource": "Prometheus", + "definition": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "tags": [], + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(node_uname_info{job=~\".*node-exporter\"},nodename)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "nodename", + "options": [], + "query": "label_values(node_uname_info{job=~\".*node-exporter\"},nodename)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ED Infra Status", + "uid": "q66ZrEfIk", + "version": 24 +} \ No newline at end of file diff --git a/kubernetes/helm_charts/monitoring/dashboards/values.yaml b/kubernetes/helm_charts/monitoring/dashboards/values.yaml index 0717102dd5..7c986427cb 100644 --- a/kubernetes/helm_charts/monitoring/dashboards/values.yaml +++ b/kubernetes/helm_charts/monitoring/dashboards/values.yaml @@ -435,6 +435,9 @@ dashboards2: graylog: graylogmetrics: file: dashboards/graylog-dashboard.json + edinfra: + edinframetrics: + file: dashboards/ed_infra_status.json # prometheus-stats: # gnetId: 2