Enable RHDH nodejs metrics
Enable collecting RHDH nodejs Prometheus metrics.
The environment variable RHDH_METRIC defaults to true, which enables
collecting the metrics.
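
For illustration only (not part of the commit), turning collection on or off when running the setup script could look roughly like this, assuming deploy.sh is invoked directly:

# metrics collection defaults to on; set RHDH_METRIC=false to opt out
export RHDH_METRIC=true
./ci-scripts/rhdh-setup/deploy.sh
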
yogananth-subramanian committed Sep 10, 2024
1 parent cdf411d commit 9f994f5
Showing 7 changed files with 196 additions and 2 deletions.
4 changes: 4 additions & 0 deletions ci-scripts/rhdh-setup/deploy.sh
@@ -57,6 +57,7 @@ export ENABLE_RBAC="${ENABLE_RBAC:-false}"
export ENABLE_PROFILING="${ENABLE_PROFILING:-false}"

export PSQL_LOG="${PSQL_LOG:-true}"
export RHDH_METRIC="${RHDH_METRIC:-true}"
export LOG_MIN_DURATION_STATEMENT="${LOG_MIN_DURATION_STATEMENT:-65}"
export LOG_MIN_DURATION_SAMPLE="${LOG_MIN_DURATION_SAMPLE:-50}"
export LOG_STATEMENT_SAMPLE_RATE="${LOG_STATEMENT_SAMPLE_RATE:-0.7}"
@@ -213,6 +214,8 @@ backstage_install() {
echo "Invalid install method: $INSTALL_METHOD, currently allowed methods are helm or olm"
return 1
fi
if [ "${AUTH_PROVIDER}" == "keycloak" ] && ${RHDH_METRIC}; then $clin create -f template/backstage/rhdh-metrics-service.yaml; fi
if ${RHDH_METRIC}; then envsubst <template/backstage/rhdh-servicemonitor.yaml| $clin create -f -; fi
}

# shellcheck disable=SC2016,SC1004
@@ -253,6 +256,7 @@ install_rhdh_with_helm() {
${RHDH_IMAGE_REPO} \
${RHDH_IMAGE_TAG} \
${RHDH_NAMESPACE} \
${RHDH_METRIC} \
${COOKIE_SECRET} \
' <"$TMP_DIR/chart-values.temp.yaml" >"$TMP_DIR/chart-values.yaml"
if [ -n "${RHDH_RESOURCES_CPU_REQUESTS}" ]; then yq -i '.upstream.backstage.resources.requests.cpu = "'"${RHDH_RESOURCES_CPU_REQUESTS}"'"' "$TMP_DIR/chart-values.yaml"; fi
@@ -110,7 +110,7 @@ upstream:
enabled: false
metrics:
serviceMonitor:
enabled: false
enabled: ${RHDH_METRIC}
path: /metrics
nameOverride: developer-hub
networkPolicy:
@@ -104,7 +104,7 @@ upstream:
enabled: false
metrics:
serviceMonitor:
enabled: false
enabled: ${RHDH_METRIC}
path: /metrics
nameOverride: developer-hub
networkPolicy:
18 changes: 18 additions & 0 deletions ci-scripts/rhdh-setup/template/backstage/rhdh-metrics-service.yaml
@@ -0,0 +1,18 @@
kind: Service
apiVersion: v1
metadata:
  name: rhdh-metrics
  labels:
    app.kubernetes.io/component: backstage
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
spec:
  ports:
    - name: backend
      protocol: TCP
      port: 7007
      targetPort: backend
  selector:
    app.kubernetes.io/component: backstage
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
18 changes: 18 additions & 0 deletions ci-scripts/rhdh-setup/template/backstage/rhdh-servicemonitor.yaml
@@ -0,0 +1,18 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: rhdh
  namespace: ${RHDH_NAMESPACE}
  labels:
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
spec:
  namespaceSelector:
    matchNames:
      - ${RHDH_NAMESPACE}
  selector:
    matchLabels:
      app.kubernetes.io/name: developer-hub
  endpoints:
    - port: backend
      path: '/metrics'
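
As a rough sanity check that the Service above exposes the metrics this ServiceMonitor scrapes on port 7007 at /metrics, something like the following can be used (the port-forward and grep commands are illustrative, not part of the commit):

# forward the rhdh-metrics Service locally and list the exported nodejs/process/catalog series
oc -n "${RHDH_NAMESPACE}" port-forward svc/rhdh-metrics 7007:7007 &
curl -s http://localhost:7007/metrics | grep -E '^(nodejs|process|catalog)_'
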
77 changes: 77 additions & 0 deletions config/cluster_read_config.populate.yaml
@@ -12,3 +12,80 @@
{%- endmacro %}

{{ pv_stats('rhdh-postgresql', 'data-(rhdh|backstage)-(postgresql|psql)-(primary|developer-hub)-0') }}

{% macro rhdh_nodejs_rate( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}
  monitoring_query: sum(rate({{ query }}{ job="rhdh-metrics" }[5m]))
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_cpu_user_seconds_total',
'process_cpu_system_seconds_total',
'process_cpu_seconds_total',
'catalog_stitched_entities_count'
] %}
{{ rhdh_nodejs_rate(query) }}
{% endfor %}


{% macro rhdh_nodejs( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}
  monitoring_query: sum({{ query }}{ job="rhdh-metrics" })
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_resident_memory_bytes',
'process_virtual_memory_bytes',
'process_heap_bytes',
'process_open_fds',
'nodejs_eventloop_lag_seconds',
'nodejs_eventloop_lag_mean_seconds',
'nodejs_eventloop_lag_stddev_seconds',
'nodejs_eventloop_lag_p90_seconds',
'nodejs_active_resources_total',
'nodejs_active_handles_total',
'nodejs_active_requests_total',
'nodejs_heap_size_total_bytes',
'nodejs_heap_size_used_bytes',
'nodejs_external_memory_bytes',
'catalog_registered_locations_count',
'catalog_relations_count',
'catalog_processing_queue_delay_seconds_sum',
'catalog_processing_queue_delay_seconds_count'
] %}
{{ rhdh_nodejs(query) }}
{% endfor %}

{% macro rhdh_nodejs_lst( query, label, valuelst) -%}
{% for value in valuelst %}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}.{{ label }}.{{ value }}
  monitoring_query: sum({{ query }}{ {{ label }}="{{ value }}", job="rhdh-metrics" })
  monitoring_step: 15
{% endfor %}
{%- endmacro %}

{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_sum', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_count', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_sum', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_count', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_sum', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_count', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('catalog_entities_count', 'kind', ['location','user','group']) }}

- name: measurements.nodejs.populate.catalog_processing_queue_delay_seconds_average
  monitoring_query: sum(rate(catalog_processing_queue_delay_seconds_sum{job="rhdh-metrics"}[5m]))/sum(rate(catalog_processing_queue_delay_seconds_count{job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.populate.catalog_processors_duration_seconds_failed_average
  monitoring_query: sum(rate(catalog_processors_duration_seconds_sum{result="failed",job="rhdh-metrics"}[5m]))/sum(rate(catalog_processors_duration_seconds_count{result="failed",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.populate.nodejs_gc_duration_seconds_major_average
  monitoring_query: sum(rate(nodejs_gc_duration_seconds_sum{kind="major",job="rhdh-metrics"}[5m]))/sum(rate(nodejs_gc_duration_seconds_count{kind="major",job="rhdh-metrics"}[5m]))
  monitoring_step: 15
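
For context, each monitoring_query above is plain PromQL, so the same expressions can be checked by hand against the cluster's Prometheus/Thanos endpoint. A hedged sketch on OpenShift (route and token handling are illustrative, not part of the commit):

# query the in-cluster Thanos querier for one of the averages defined above
THANOS="https://$(oc get route thanos-querier -n openshift-monitoring -o jsonpath='{.spec.host}')"
curl -sk -H "Authorization: Bearer $(oc whoami -t)" \
  --data-urlencode 'query=sum(rate(catalog_processing_queue_delay_seconds_sum{job="rhdh-metrics"}[5m]))/sum(rate(catalog_processing_queue_delay_seconds_count{job="rhdh-metrics"}[5m]))' \
  "${THANOS}/api/v1/query"
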
77 changes: 77 additions & 0 deletions config/cluster_read_config.test.yaml
@@ -219,6 +219,83 @@

{{ collect_index_usage('rhdh-performance.*', '(rhdh|backstage)-(postgresql|psql)-(primary|developer-hub)-0') }}


{% macro rhdh_nodejs_lst( query, label, valuelst) -%}
{% for value in valuelst %}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}.{{ label }}.{{ value }}
  monitoring_query: sum({{ query }}{ {{ label }}="{{ value }}", job="rhdh-metrics" })
  monitoring_step: 15
{% endfor %}
{%- endmacro %}

{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_sum', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_count', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_sum', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_count', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_sum', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_count', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('catalog_entities_count', 'kind', ['location','user','group']) }}

{% macro rhdh_nodejs_rate( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}
  monitoring_query: sum(rate({{ query }}{ job="rhdh-metrics" }[5m]))
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_cpu_user_seconds_total',
'process_cpu_system_seconds_total',
'process_cpu_seconds_total',
'catalog_stitched_entities_count'
] %}
{{ rhdh_nodejs_rate(query) }}
{% endfor %}

{% macro rhdh_nodejs( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}
  monitoring_query: sum({{ query }}{ job="rhdh-metrics" })
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_resident_memory_bytes',
'process_virtual_memory_bytes',
'process_heap_bytes',
'process_open_fds',
'nodejs_eventloop_lag_seconds',
'nodejs_eventloop_lag_mean_seconds',
'nodejs_eventloop_lag_stddev_seconds',
'nodejs_eventloop_lag_p90_seconds',
'nodejs_active_resources_total',
'nodejs_active_handles_total',
'nodejs_active_requests_total',
'nodejs_heap_size_total_bytes',
'nodejs_heap_size_used_bytes',
'nodejs_external_memory_bytes',
'catalog_registered_locations_count',
'catalog_relations_count',
'catalog_processing_queue_delay_seconds_sum',
'catalog_processing_queue_delay_seconds_count'
] %}
{{ rhdh_nodejs(query) }}
{% endfor %}

- name: measurements.nodejs.test.catalog_processing_queue_delay_seconds_average
  monitoring_query: sum(rate(catalog_processing_queue_delay_seconds_sum{job="rhdh-metrics"}[5m]))/sum(rate(catalog_processing_queue_delay_seconds_count{job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.test.catalog_processors_duration_seconds_failed_average
  monitoring_query: sum(rate(catalog_processors_duration_seconds_sum{result="failed",job="rhdh-metrics"}[5m]))/sum(rate(catalog_processors_duration_seconds_count{result="failed",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.test.nodejs_gc_duration_seconds_major_average
  monitoring_query: sum(rate(nodejs_gc_duration_seconds_sum{kind="major",job="rhdh-metrics"}[5m]))/sum(rate(nodejs_gc_duration_seconds_count{kind="major",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

# Results
{%macro results_scenario(name) -%}
- name: results.{{name}}.locust_requests_avg_response_time
