From e39a9e6feb30dcd7f58742e1bc296856a9b27b04 Mon Sep 17 00:00:00 2001 From: zyy17 Date: Mon, 30 Sep 2024 11:26:50 +0800 Subject: [PATCH] feat: add `StatementStatistics` for slow query logging implementation (#4719) * feat: log slow query * feat: log slow query for sql * refactor: add slow query logging options * ci: fix errors * feat: add StatementStatistics * chore: revert modification of servers crate * docs: update config docs * fix: clippy errors --- Cargo.lock | 1 + config/config.md | 20 ++++++ config/datanode.example.toml | 13 ++++ config/flownode.example.toml | 13 ++++ config/frontend.example.toml | 13 ++++ config/metasrv.example.toml | 13 ++++ config/standalone.example.toml | 13 ++++ src/cmd/src/frontend.rs | 2 + src/cmd/src/standalone.rs | 2 + src/common/telemetry/Cargo.toml | 1 + src/common/telemetry/src/logging.rs | 62 +++++++++++++++++- src/common/telemetry/src/macros.rs | 11 ++++ src/flow/src/server.rs | 2 + src/frontend/src/instance/builder.rs | 5 ++ src/operator/src/statement.rs | 6 ++ src/query/Cargo.toml | 1 + src/query/src/lib.rs | 1 + src/query/src/stats.rs | 98 ++++++++++++++++++++++++++++ tests-integration/src/cluster.rs | 2 + tests-integration/src/standalone.rs | 2 + tests-integration/tests/http.rs | 3 + 21 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 src/query/src/stats.rs diff --git a/Cargo.lock b/Cargo.lock index 4533257660ad..7aebe2e719eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2289,6 +2289,7 @@ dependencies = [ "common-error", "console-subscriber", "greptime-proto", + "humantime-serde", "lazy_static", "once_cell", "opentelemetry 0.21.0", diff --git a/config/config.md b/config/config.md index 641eee4b5837..d413936ff3cd 100644 --- a/config/config.md +++ b/config/config.md @@ -163,6 +163,10 @@ | `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. | | `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | +| `logging.slow_query` | -- | -- | The slow query log options. | +| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. | +| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. | +| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | | `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | | `export_metrics.enable` | Bool | `false` | whether enable export metrics. | | `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | @@ -249,6 +253,10 @@ | `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. | | `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | +| `logging.slow_query` | -- | -- | The slow query log options. | +| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. | +| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. | +| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | | `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | | `export_metrics.enable` | Bool | `false` | whether enable export metrics. | | `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | @@ -314,6 +322,10 @@ | `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. | | `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | +| `logging.slow_query` | -- | -- | The slow query log options. | +| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. | +| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. | +| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | | `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | | `export_metrics.enable` | Bool | `false` | whether enable export metrics. | | `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | @@ -466,6 +478,10 @@ | `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. | | `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | +| `logging.slow_query` | -- | -- | The slow query log options. | +| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. | +| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. | +| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | | `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | | `export_metrics.enable` | Bool | `false` | whether enable export metrics. | | `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | @@ -512,5 +528,9 @@ | `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. | | `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | +| `logging.slow_query` | -- | -- | The slow query log options. | +| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. | +| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. | +| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index e4a3dca6d328..06a59ebd6fcd 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -586,6 +586,19 @@ log_format = "text" [logging.tracing_sample_ratio] default_ratio = 1.0 +## The slow query log options. +[logging.slow_query] +## Whether to enable slow query log. +enable = false + +## The threshold of slow query. +## @toml2docs:none-default +threshold = "10s" + +## The sampling ratio of slow query log. The value should be in the range of (0, 1]. +## @toml2docs:none-default +sample_ratio = 1.0 + ## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API. ## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. [export_metrics] diff --git a/config/flownode.example.toml b/config/flownode.example.toml index a3a414fb987d..9d6d2fe5aa68 100644 --- a/config/flownode.example.toml +++ b/config/flownode.example.toml @@ -84,6 +84,19 @@ log_format = "text" [logging.tracing_sample_ratio] default_ratio = 1.0 +## The slow query log options. +[logging.slow_query] +## Whether to enable slow query log. +enable = false + +## The threshold of slow query. +## @toml2docs:none-default +threshold = "10s" + +## The sampling ratio of slow query log. The value should be in the range of (0, 1]. +## @toml2docs:none-default +sample_ratio = 1.0 + ## The tracing options. Only effect when compiled with `tokio-console` feature. [tracing] ## The tokio console address. diff --git a/config/frontend.example.toml b/config/frontend.example.toml index eae001a2db0d..cc9698f61e88 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -191,6 +191,19 @@ log_format = "text" [logging.tracing_sample_ratio] default_ratio = 1.0 +## The slow query log options. +[logging.slow_query] +## Whether to enable slow query log. +enable = false + +## The threshold of slow query. +## @toml2docs:none-default +threshold = "10s" + +## The sampling ratio of slow query log. The value should be in the range of (0, 1]. +## @toml2docs:none-default +sample_ratio = 1.0 + ## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API. ## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. [export_metrics] diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index e95a9fa7f201..8431940b45aa 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -178,6 +178,19 @@ log_format = "text" [logging.tracing_sample_ratio] default_ratio = 1.0 +## The slow query log options. +[logging.slow_query] +## Whether to enable slow query log. +enable = false + +## The threshold of slow query. +## @toml2docs:none-default +threshold = "10s" + +## The sampling ratio of slow query log. The value should be in the range of (0, 1]. +## @toml2docs:none-default +sample_ratio = 1.0 + ## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API. ## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. [export_metrics] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 1cd75e641421..2d14b6550f3c 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -630,6 +630,19 @@ log_format = "text" [logging.tracing_sample_ratio] default_ratio = 1.0 +## The slow query log options. +[logging.slow_query] +## Whether to enable slow query log. +enable = false + +## The threshold of slow query. +## @toml2docs:none-default +threshold = "10s" + +## The sampling ratio of slow query log. The value should be in the range of (0, 1]. +## @toml2docs:none-default +sample_ratio = 1.0 + ## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API. ## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. [export_metrics] diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 320dc49c1979..7678e90c884a 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -36,6 +36,7 @@ use frontend::instance::builder::FrontendBuilder; use frontend::instance::{FrontendInstance, Instance as FeInstance}; use frontend::server::Services; use meta_client::{MetaClientOptions, MetaClientType}; +use query::stats::StatementStatistics; use servers::tls::{TlsMode, TlsOption}; use servers::Mode; use snafu::{OptionExt, ResultExt}; @@ -352,6 +353,7 @@ impl StartCommand { catalog_manager, Arc::new(client), meta_client, + StatementStatistics::new(opts.logging.slow_query.clone()), ) .with_plugin(plugins.clone()) .with_local_cache_invalidator(layered_cache_registry) diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 80b38ebaa749..4335bd5447e5 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -55,6 +55,7 @@ use frontend::service_config::{ }; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; use mito2::config::MitoConfig; +use query::stats::StatementStatistics; use serde::{Deserialize, Serialize}; use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; @@ -557,6 +558,7 @@ impl StartCommand { catalog_manager.clone(), node_manager.clone(), ddl_task_executor.clone(), + StatementStatistics::new(opts.logging.slow_query.clone()), ) .with_plugin(plugins.clone()) .try_build() diff --git a/src/common/telemetry/Cargo.toml b/src/common/telemetry/Cargo.toml index 20fc52a763f7..2b4023cf7cdd 100644 --- a/src/common/telemetry/Cargo.toml +++ b/src/common/telemetry/Cargo.toml @@ -17,6 +17,7 @@ backtrace = "0.3" common-error.workspace = true console-subscriber = { version = "0.1", optional = true } greptime-proto.workspace = true +humantime-serde.workspace = true lazy_static.workspace = true once_cell.workspace = true opentelemetry = { version = "0.21.0", default-features = false, features = [ diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs index de018aa4b6f3..c21766f04dc9 100644 --- a/src/common/telemetry/src/logging.rs +++ b/src/common/telemetry/src/logging.rs @@ -15,6 +15,7 @@ //! logging stuffs, inspired by databend use std::env; use std::sync::{Arc, Mutex, Once}; +use std::time::Duration; use once_cell::sync::{Lazy, OnceCell}; use opentelemetry::{global, KeyValue}; @@ -26,7 +27,7 @@ use serde::{Deserialize, Serialize}; use tracing_appender::non_blocking::WorkerGuard; use tracing_appender::rolling::{RollingFileAppender, Rotation}; use tracing_log::LogTracer; -use tracing_subscriber::filter::Targets; +use tracing_subscriber::filter::{FilterFn, Targets}; use tracing_subscriber::fmt::Layer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -64,6 +65,24 @@ pub struct LoggingOptions { /// The tracing sample ratio. pub tracing_sample_ratio: Option, + + /// The logging options of slow query. + pub slow_query: SlowQueryOptions, +} + +/// The options of slow query. +#[derive(Clone, Debug, Serialize, Deserialize, Default)] +#[serde(default)] +pub struct SlowQueryOptions { + /// Whether to enable slow query log. + pub enable: bool, + + /// The threshold of slow queries. + #[serde(with = "humantime_serde")] + pub threshold: Option, + + /// The sample ratio of slow queries. + pub sample_ratio: Option, } #[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -96,6 +115,7 @@ impl Default for LoggingOptions { otlp_endpoint: None, tracing_sample_ratio: None, append_stdout: true, + slow_query: SlowQueryOptions::default(), } } } @@ -235,6 +255,42 @@ pub fn init_global_logging( None }; + let slow_query_logging_layer = if !opts.dir.is_empty() && opts.slow_query.enable { + let rolling_appender = + RollingFileAppender::new(Rotation::HOURLY, &opts.dir, "greptimedb-slow-queries"); + let (writer, guard) = tracing_appender::non_blocking(rolling_appender); + guards.push(guard); + + // Only logs if the field contains "slow". + let slow_query_filter = FilterFn::new(|metadata| { + metadata + .fields() + .iter() + .any(|field| field.name().contains("slow")) + }); + + if opts.log_format == LogFormat::Json { + Some( + Layer::new() + .json() + .with_writer(writer) + .with_ansi(false) + .with_filter(slow_query_filter) + .boxed(), + ) + } else { + Some( + Layer::new() + .with_writer(writer) + .with_ansi(false) + .with_filter(slow_query_filter) + .boxed(), + ) + } + } else { + None + }; + // resolve log level settings from: // - options from command line or config files // - environment variable: RUST_LOG @@ -279,6 +335,7 @@ pub fn init_global_logging( .with(stdout_logging_layer) .with(file_logging_layer) .with(err_file_logging_layer) + .with(slow_query_logging_layer) }; // consume the `tracing_opts` to avoid "unused" warnings. @@ -289,7 +346,8 @@ pub fn init_global_logging( .with(dyn_filter) .with(stdout_logging_layer) .with(file_logging_layer) - .with(err_file_logging_layer); + .with(err_file_logging_layer) + .with(slow_query_logging_layer); if opts.enable_otlp_tracing { global::set_text_map_propagator(TraceContextPropagator::new()); diff --git a/src/common/telemetry/src/macros.rs b/src/common/telemetry/src/macros.rs index cb838db6fef6..2846bd5d2051 100644 --- a/src/common/telemetry/src/macros.rs +++ b/src/common/telemetry/src/macros.rs @@ -152,6 +152,17 @@ macro_rules! trace { }; } +#[macro_export] +macro_rules! slow { + (target: $target:expr, $($arg:tt)+) => { + $crate::log!(target: $target, slow = true, $crate::tracing::Level::INFO, $($arg)+) + }; + + ($($arg:tt)+) => { + $crate::log!($crate::tracing::Level::INFO, slow = true, $($arg)+) + }; +} + #[cfg(test)] mod tests { use common_error::mock::MockError; diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs index d78f9219cb0c..4381dd06a03b 100644 --- a/src/flow/src/server.rs +++ b/src/flow/src/server.rs @@ -37,6 +37,7 @@ use operator::delete::Deleter; use operator::insert::Inserter; use operator::statement::StatementExecutor; use partition::manager::PartitionRuleManager; +use query::stats::StatementStatistics; use query::{QueryEngine, QueryEngineFactory}; use servers::error::{AlreadyStartedSnafu, StartGrpcSnafu, TcpBindSnafu, TcpIncomingSnafu}; use servers::server::Server; @@ -475,6 +476,7 @@ impl FrontendInvoker { layered_cache_registry.clone(), inserter.clone(), table_route_cache, + StatementStatistics::default(), )); let invoker = FrontendInvoker::new(inserter, deleter, statement_executor); diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index 5450e55ce25e..a9513121d85a 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -33,6 +33,7 @@ use operator::statement::{StatementExecutor, StatementExecutorRef}; use operator::table::TableMutationOperator; use partition::manager::PartitionRuleManager; use pipeline::pipeline_operator::PipelineOperator; +use query::stats::StatementStatistics; use query::QueryEngineFactory; use servers::server::ServerHandlers; use snafu::OptionExt; @@ -55,6 +56,7 @@ pub struct FrontendBuilder { plugins: Option, procedure_executor: ProcedureExecutorRef, heartbeat_task: Option, + stats: StatementStatistics, } impl FrontendBuilder { @@ -65,6 +67,7 @@ impl FrontendBuilder { catalog_manager: CatalogManagerRef, node_manager: NodeManagerRef, procedure_executor: ProcedureExecutorRef, + stats: StatementStatistics, ) -> Self { Self { options, @@ -76,6 +79,7 @@ impl FrontendBuilder { plugins: None, procedure_executor, heartbeat_task: None, + stats, } } @@ -181,6 +185,7 @@ impl FrontendBuilder { local_cache_invalidator, inserter.clone(), table_route_cache, + self.stats, )); let pipeline_operator = Arc::new(PipelineOperator::new( diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 7c76d0dcfffc..033bd14e9c40 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -45,6 +45,7 @@ use common_time::Timestamp; use datafusion_expr::LogicalPlan; use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef}; use query::parser::QueryStatement; +use query::stats::StatementStatistics; use query::QueryEngineRef; use session::context::{Channel, QueryContextRef}; use session::table_name::table_idents_to_full_name; @@ -80,11 +81,13 @@ pub struct StatementExecutor { partition_manager: PartitionRuleManagerRef, cache_invalidator: CacheInvalidatorRef, inserter: InserterRef, + stats: StatementStatistics, } pub type StatementExecutorRef = Arc; impl StatementExecutor { + #[allow(clippy::too_many_arguments)] pub fn new( catalog_manager: CatalogManagerRef, query_engine: QueryEngineRef, @@ -93,6 +96,7 @@ impl StatementExecutor { cache_invalidator: CacheInvalidatorRef, inserter: InserterRef, table_route_cache: TableRouteCacheRef, + stats: StatementStatistics, ) -> Self { Self { catalog_manager, @@ -104,6 +108,7 @@ impl StatementExecutor { partition_manager: Arc::new(PartitionRuleManager::new(kv_backend, table_route_cache)), cache_invalidator, inserter, + stats, } } @@ -113,6 +118,7 @@ impl StatementExecutor { stmt: QueryStatement, query_ctx: QueryContextRef, ) -> Result { + let _slow_query_timer = self.stats.start_slow_query_timer(stmt.clone()); match stmt { QueryStatement::Sql(stmt) => self.execute_sql(stmt, query_ctx).await, QueryStatement::Promql(_) => self.plan_exec(stmt, query_ctx).await, diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index eed0d5a3398e..4bcda7b0a77f 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -54,6 +54,7 @@ prometheus.workspace = true promql.workspace = true promql-parser.workspace = true prost.workspace = true +rand.workspace = true regex.workspace = true session.workspace = true snafu.workspace = true diff --git a/src/query/src/lib.rs b/src/query/src/lib.rs index d6dfc5e09734..e2843c708e6d 100644 --- a/src/query/src/lib.rs +++ b/src/query/src/lib.rs @@ -35,6 +35,7 @@ pub mod query_engine; mod range_select; pub mod region_query; pub mod sql; +pub mod stats; #[cfg(test)] mod tests; diff --git a/src/query/src/stats.rs b/src/query/src/stats.rs new file mode 100644 index 000000000000..896271897db1 --- /dev/null +++ b/src/query/src/stats.rs @@ -0,0 +1,98 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use common_telemetry::logging::SlowQueryOptions; +use common_telemetry::slow; +use rand::random; + +use crate::parser::QueryStatement; + +/// StatementStatistics is used to collect statistics for a statement. +#[derive(Default, Clone, Debug)] +pub struct StatementStatistics { + /// slow_query is used to configure slow query log. + pub slow_query: SlowQueryOptions, +} + +impl StatementStatistics { + pub fn new(slow_query_options: SlowQueryOptions) -> Self { + Self { + slow_query: slow_query_options, + } + } + + pub fn start_slow_query_timer(&self, stmt: QueryStatement) -> Option { + if self.slow_query.enable { + Some(SlowQueryTimer { + start: std::time::Instant::now(), + stmt, + threshold: self.slow_query.threshold, + sample_ratio: self.slow_query.sample_ratio, + }) + } else { + None + } + } +} + +/// SlowQueryTimer is used to log slow query when it's dropped. +pub struct SlowQueryTimer { + start: std::time::Instant, + stmt: QueryStatement, + threshold: Option, + sample_ratio: Option, +} + +impl SlowQueryTimer { + fn log_slow_query(&self, elapsed: Duration, threshold: Duration) { + match &self.stmt { + QueryStatement::Sql(stmt) => { + slow!( + cost = elapsed.as_millis() as u64, + threshold = threshold.as_millis() as u64, + sql = stmt.to_string() + ); + } + QueryStatement::Promql(stmt) => { + slow!( + cost = elapsed.as_millis() as u64, + threshold = threshold.as_millis() as u64, + // TODO(zyy17): It's better to implement Display for EvalStmt for pretty print. + promql = format!("{:?}", stmt) + ); + } + } + } +} + +impl Drop for SlowQueryTimer { + fn drop(&mut self) { + if let Some(threshold) = self.threshold { + let elapsed = self.start.elapsed(); + if elapsed > threshold { + if let Some(ratio) = self.sample_ratio { + // Generate a random number in [0, 1) and compare it with sample_ratio. + if ratio >= 1.0 || random::() <= ratio { + self.log_slow_query(elapsed, threshold); + } + } else { + // If sample_ratio is not set, log all slow queries. + self.log_slow_query(elapsed, threshold); + } + } + } + } +} diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index ce2803996aa5..ad2c3e369fe2 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -48,6 +48,7 @@ use meta_client::client::MetaClientBuilder; use meta_srv::cluster::MetaPeerClientRef; use meta_srv::metasrv::{Metasrv, MetasrvOptions, SelectorRef}; use meta_srv::mocks::MockInfo; +use query::stats::StatementStatistics; use servers::grpc::flight::FlightCraftWrapper; use servers::grpc::region_server::RegionServerRequestHandler; use servers::heartbeat_options::HeartbeatOptions; @@ -393,6 +394,7 @@ impl GreptimeDbClusterBuilder { catalog_manager, datanode_clients, meta_client, + StatementStatistics::default(), ) .with_local_cache_invalidator(cache_registry) .with_heartbeat_task(heartbeat_task) diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs index fa6e8f2a9e5c..123614e43693 100644 --- a/tests-integration/src/standalone.rs +++ b/tests-integration/src/standalone.rs @@ -40,6 +40,7 @@ use flow::FlownodeBuilder; use frontend::instance::builder::FrontendBuilder; use frontend::instance::{FrontendInstance, Instance, StandaloneDatanodeManager}; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; +use query::stats::StatementStatistics; use servers::Mode; use snafu::ResultExt; @@ -215,6 +216,7 @@ impl GreptimeDbStandaloneBuilder { catalog_manager.clone(), node_manager.clone(), ddl_task_executor.clone(), + StatementStatistics::default(), ) .with_plugin(plugins) .try_build() diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index e11060fbbda5..5f3eb854bbe6 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -836,6 +836,9 @@ retry_delay = "500ms" append_stdout = true enable_otlp_tracing = false +[logging.slow_query] +enable = false + [[region_engine]] [region_engine.mito]