feat(RHIDP-872): Enable RHDH nodejs metrics #75

Merged
4 changes: 4 additions & 0 deletions ci-scripts/rhdh-setup/deploy.sh
@@ -57,6 +57,7 @@ export ENABLE_RBAC="${ENABLE_RBAC:-false}"
export ENABLE_PROFILING="${ENABLE_PROFILING:-false}"

export PSQL_LOG="${PSQL_LOG:-true}"
export RHDH_METRIC="${RHDH_METRIC:-true}"
export LOG_MIN_DURATION_STATEMENT="${LOG_MIN_DURATION_STATEMENT:-65}"
export LOG_MIN_DURATION_SAMPLE="${LOG_MIN_DURATION_SAMPLE:-50}"
export LOG_STATEMENT_SAMPLE_RATE="${LOG_STATEMENT_SAMPLE_RATE:-0.7}"
@@ -213,6 +214,8 @@ backstage_install() {
echo "Invalid install method: $INSTALL_METHOD, currently allowed methods are helm or olm"
return 1
fi
if [ "${AUTH_PROVIDER}" == "keycloak" ] && ${RHDH_METRIC}; then $clin create -f template/backstage/rhdh-metrics-service.yaml; fi
if ${RHDH_METRIC}; then envsubst <template/backstage/rhdh-servicemonitor.yaml| $clin create -f -; fi
}

# shellcheck disable=SC2016,SC1004
@@ -253,6 +256,7 @@ install_rhdh_with_helm() {
${RHDH_IMAGE_REPO} \
${RHDH_IMAGE_TAG} \
${RHDH_NAMESPACE} \
${RHDH_METRIC} \
${COOKIE_SECRET} \
' <"$TMP_DIR/chart-values.temp.yaml" >"$TMP_DIR/chart-values.yaml"
if [ -n "${RHDH_RESOURCES_CPU_REQUESTS}" ]; then yq -i '.upstream.backstage.resources.requests.cpu = "'"${RHDH_RESOURCES_CPU_REQUESTS}"'"' "$TMP_DIR/chart-values.yaml"; fi
@@ -110,7 +110,7 @@ upstream:
    enabled: false
  metrics:
    serviceMonitor:
      enabled: false
      enabled: ${RHDH_METRIC}
      path: /metrics
  nameOverride: developer-hub
  networkPolicy:
@@ -104,7 +104,7 @@ upstream:
    enabled: false
  metrics:
    serviceMonitor:
      enabled: false
      enabled: ${RHDH_METRIC}
      path: /metrics
  nameOverride: developer-hub
  networkPolicy:
18 changes: 18 additions & 0 deletions ci-scripts/rhdh-setup/template/backstage/rhdh-metrics-service.yaml
@@ -0,0 +1,18 @@
kind: Service
apiVersion: v1
metadata:
  name: rhdh-metrics
  labels:
    app.kubernetes.io/component: backstage
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
spec:
  ports:
    - name: backend
      protocol: TCP
      port: 7007
      targetPort: backend
  selector:
    app.kubernetes.io/component: backstage
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
18 changes: 18 additions & 0 deletions ci-scripts/rhdh-setup/template/backstage/rhdh-servicemonitor.yaml
@@ -0,0 +1,18 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: rhdh
  namespace: ${RHDH_NAMESPACE}
  labels:
    app.kubernetes.io/instance: rhdh
    app.kubernetes.io/name: developer-hub
spec:
  namespaceSelector:
    matchNames:
      - ${RHDH_NAMESPACE}
  selector:
    matchLabels:
      app.kubernetes.io/name: developer-hub
  endpoints:
    - port: backend
      path: '/metrics'
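
The monitoring queries added in the config files below select series with job="rhdh-metrics". This relies on the Prometheus Operator labelling each scraped target with the name of the Service it was discovered through, here the rhdh-metrics Service above. A minimal sanity-check entry in the same cluster_read_config format (hypothetical, not part of this change) would be:

# Hypothetical sanity check: confirm the rhdh-metrics target is up and being scraped
- name: measurements.nodejs.sanity.up
  monitoring_query: up{job="rhdh-metrics"}
  monitoring_step: 15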
77 changes: 77 additions & 0 deletions config/cluster_read_config.populate.yaml
@@ -12,3 +12,80 @@
{%- endmacro %}

{{ pv_stats('rhdh-postgresql', 'data-(rhdh|backstage)-(postgresql|psql)-(primary|developer-hub)-0') }}

{% macro rhdh_nodejs_rate( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}
  monitoring_query: sum(rate({{ query }}{ job="rhdh-metrics" }[5m]))
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_cpu_user_seconds_total',
'process_cpu_system_seconds_total',
'process_cpu_seconds_total',
'catalog_stitched_entities_count'
] %}
{{ rhdh_nodejs_rate(query) }}
{% endfor %}
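
For reference, the first iteration of the loop above expands to an entry like this (assuming the rhdh_nodejs_rate macro as written):

# Gather nodejs monitoring data about the process_cpu_user_seconds_total
- name: measurements.nodejs.populate.process_cpu_user_seconds_total
  monitoring_query: sum(rate(process_cpu_user_seconds_total{ job="rhdh-metrics" }[5m]))
  monitoring_step: 15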


{% macro rhdh_nodejs( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}
  monitoring_query: sum({{ query }}{ job="rhdh-metrics" })
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_resident_memory_bytes',
'process_virtual_memory_bytes',
'process_heap_bytes',
'process_open_fds',
'nodejs_eventloop_lag_seconds',
'nodejs_eventloop_lag_mean_seconds',
'nodejs_eventloop_lag_stddev_seconds',
'nodejs_eventloop_lag_p90_seconds',
'nodejs_active_resources_total',
'nodejs_active_handles_total',
'nodejs_active_requests_total',
'nodejs_heap_size_total_bytes',
'nodejs_heap_size_used_bytes',
'nodejs_external_memory_bytes',
'catalog_registered_locations_count',
'catalog_relations_count',
'catalog_processing_queue_delay_seconds_sum',
'catalog_processing_queue_delay_seconds_count'
] %}
{{ rhdh_nodejs(query) }}
{% endfor %}

{% macro rhdh_nodejs_lst( query, label, valuelst) -%}
{% for value in valuelst %}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.populate.{{ query }}.{{ label }}.{{ value }}
  monitoring_query: sum({{ query }}{ {{ label }}="{{ value }}", job="rhdh-metrics" })
  monitoring_step: 15
{% endfor %}
{%- endmacro %}

{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_sum', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_count', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_sum', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_count', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_sum', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_count', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('catalog_entities_count', 'kind', ['location','user','group']) }}
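
As an example of the label-value expansion, the catalog_entities_count call above renders one entry per listed kind, e.g.:

# Gather nodejs monitoring data about the catalog_entities_count
- name: measurements.nodejs.populate.catalog_entities_count.kind.user
  monitoring_query: sum(catalog_entities_count{ kind="user", job="rhdh-metrics" })
  monitoring_step: 15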

- name: measurements.nodejs.populate.catalog_processing_queue_delay_seconds_average
  monitoring_query: sum(rate(catalog_processing_queue_delay_seconds_sum{job="rhdh-metrics"}[5m]))/sum(rate(catalog_processing_queue_delay_seconds_count{job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.populate.catalog_processors_duration_seconds_failed_average
  monitoring_query: sum(rate(catalog_processors_duration_seconds_sum{result="failed",job="rhdh-metrics"}[5m]))/sum(rate(catalog_processors_duration_seconds_count{result="failed",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.populate.nodejs_gc_duration_seconds_major_average
  monitoring_query: sum(rate(nodejs_gc_duration_seconds_sum{kind="major",job="rhdh-metrics"}[5m]))/sum(rate(nodejs_gc_duration_seconds_count{kind="major",job="rhdh-metrics"}[5m]))
  monitoring_step: 15
77 changes: 77 additions & 0 deletions config/cluster_read_config.test.yaml
@@ -219,6 +219,83 @@

{{ collect_index_usage('rhdh-performance.*', '(rhdh|backstage)-(postgresql|psql)-(primary|developer-hub)-0') }}


{% macro rhdh_nodejs_lst( query, label, valuelst) -%}
{% for value in valuelst %}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}.{{ label }}.{{ value }}
  monitoring_query: sum({{ query }}{ {{ label }}="{{ value }}", job="rhdh-metrics" })
  monitoring_step: 15
{% endfor %}
{%- endmacro %}

{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_sum', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processors_duration_seconds_count', 'result', ['ok','failed']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_sum', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('catalog_processing_duration_seconds_count', 'result', ['unchanged']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_sum', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('nodejs_gc_duration_seconds_count', 'kind', ['minor','major','incremental']) }}
{{ rhdh_nodejs_lst('catalog_entities_count', 'kind', ['location','user','group']) }}

{% macro rhdh_nodejs_rate( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}
  monitoring_query: sum(rate({{ query }}{ job="rhdh-metrics" }[5m]))
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_cpu_user_seconds_total',
'process_cpu_system_seconds_total',
'process_cpu_seconds_total',
'catalog_stitched_entities_count'
] %}
{{ rhdh_nodejs_rate(query) }}
{% endfor %}

{% macro rhdh_nodejs( query ) -%}
# Gather nodejs monitoring data about the {{ query }}
- name: measurements.nodejs.test.{{ query }}
  monitoring_query: sum({{ query }}{ job="rhdh-metrics" })
  monitoring_step: 15
{%- endmacro %}

{% for query in [
'process_resident_memory_bytes',
'process_virtual_memory_bytes',
'process_heap_bytes',
'process_open_fds',
'nodejs_eventloop_lag_seconds',
'nodejs_eventloop_lag_mean_seconds',
'nodejs_eventloop_lag_stddev_seconds',
'nodejs_eventloop_lag_p90_seconds',
'nodejs_active_resources_total',
'nodejs_active_handles_total',
'nodejs_active_requests_total',
'nodejs_heap_size_total_bytes',
'nodejs_heap_size_used_bytes',
'nodejs_external_memory_bytes',
'catalog_registered_locations_count',
'catalog_relations_count',
'catalog_processing_queue_delay_seconds_sum',
'catalog_processing_queue_delay_seconds_count'
] %}
{{ rhdh_nodejs(query) }}
{% endfor %}

- name: measurements.nodejs.test.catalog_processing_queue_delay_seconds_average
  monitoring_query: sum(rate(catalog_processing_queue_delay_seconds_sum{job="rhdh-metrics"}[5m]))/sum(rate(catalog_processing_queue_delay_seconds_count{job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.test.catalog_processors_duration_seconds_failed_average
  monitoring_query: sum(rate(catalog_processors_duration_seconds_sum{result="failed",job="rhdh-metrics"}[5m]))/sum(rate(catalog_processors_duration_seconds_count{result="failed",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

- name: measurements.nodejs.test.nodejs_gc_duration_seconds_major_average
  monitoring_query: sum(rate(nodejs_gc_duration_seconds_sum{kind="major",job="rhdh-metrics"}[5m]))/sum(rate(nodejs_gc_duration_seconds_count{kind="major",job="rhdh-metrics"}[5m]))
  monitoring_step: 15

# Results
{%macro results_scenario(name) -%}
- name: results.{{name}}.locust_requests_avg_response_time