From 01a308fe6bd5de04466983bac410c1a769bf2abc Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 5 Dec 2024 20:12:24 +0800 Subject: [PATCH 01/36] refactor: relocate CLI to a dedicated directory (#5101) * refactor: relocate CLI to a dedicated directory * chore: expose method and const * refactor: use BoxedError * chore: expose DatabaseClient * chore: add clone derive --- Cargo.lock | 58 ++++ Cargo.toml | 2 + src/catalog/src/information_extension.rs | 92 +++++ src/catalog/src/lib.rs | 1 + src/cli/Cargo.toml | 65 ++++ src/{cmd/src/cli => cli/src}/bench.rs | 11 +- .../src/cli => cli/src}/bench/metadata.rs | 2 +- src/{cmd/src/cli => cli/src}/cmd.rs | 0 src/{cmd/src/cli => cli/src}/database.rs | 3 +- src/cli/src/error.rs | 316 ++++++++++++++++++ src/{cmd/src/cli => cli/src}/export.rs | 131 ++------ src/{cmd/src/cli => cli/src}/helper.rs | 2 +- src/{cmd/src/cli => cli/src}/import.rs | 40 ++- src/cli/src/lib.rs | 60 ++++ src/{cmd/src/cli => cli/src}/repl.rs | 8 +- src/cmd/Cargo.toml | 1 + src/cmd/src/cli.rs | 155 +++++---- src/cmd/src/error.rs | 16 + src/cmd/src/flownode.rs | 3 +- src/cmd/src/frontend.rs | 3 +- src/cmd/src/lib.rs | 76 ----- src/common/meta/src/key.rs | 2 +- src/common/meta/src/kv_backend/txn.rs | 7 + src/plugins/Cargo.toml | 3 + src/plugins/src/cli.rs | 36 ++ src/plugins/src/lib.rs | 2 + tests-integration/src/cluster.rs | 2 +- 27 files changed, 817 insertions(+), 280 deletions(-) create mode 100644 src/catalog/src/information_extension.rs create mode 100644 src/cli/Cargo.toml rename src/{cmd/src/cli => cli/src}/bench.rs (94%) rename src/{cmd/src/cli => cli/src}/bench/metadata.rs (99%) rename src/{cmd/src/cli => cli/src}/cmd.rs (100%) rename src/{cmd/src/cli => cli/src}/database.rs (98%) create mode 100644 src/cli/src/error.rs rename src/{cmd/src/cli => cli/src}/export.rs (81%) rename src/{cmd/src/cli => cli/src}/helper.rs (99%) rename src/{cmd/src/cli => cli/src}/import.rs (89%) create mode 100644 src/cli/src/lib.rs rename src/{cmd/src/cli => cli/src}/repl.rs (98%) create mode 100644 src/plugins/src/cli.rs diff --git a/Cargo.lock b/Cargo.lock index 8ec39f71f7c3..0085294272e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1712,6 +1712,60 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "cli" +version = "0.11.0" +dependencies = [ + "async-trait", + "auth", + "base64 0.21.7", + "cache", + "catalog", + "chrono", + "clap 4.5.19", + "client", + "common-base", + "common-catalog", + "common-config", + "common-error", + "common-grpc", + "common-macro", + "common-meta", + "common-options", + "common-procedure", + "common-query", + "common-recordbatch", + "common-runtime", + "common-telemetry", + "common-test-util", + "common-time", + "common-version", + "common-wal", + "datatypes", + "either", + "etcd-client", + "futures", + "humantime", + "meta-client", + "nu-ansi-term", + "query", + "rand", + "reqwest", + "rustyline 10.1.1", + "serde", + "serde_json", + "servers", + "session", + "snafu 0.8.5", + "store-api", + "substrait 0.11.0", + "table", + "temp-env", + "tempfile", + "tokio", + "tracing-appender", +] + [[package]] name = "client" version = "0.11.0" @@ -1793,6 +1847,7 @@ dependencies = [ "catalog", "chrono", "clap 4.5.19", + "cli", "client", "common-base", "common-catalog", @@ -8348,7 +8403,10 @@ name = "plugins" version = "0.11.0" dependencies = [ "auth", + "clap 4.5.19", + "cli", "common-base", + "common-error", "datanode", "frontend", "meta-srv", diff 
--git a/Cargo.toml b/Cargo.toml index 73db80c4c858..4cc07cd89818 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "src/auth", "src/cache", "src/catalog", + "src/cli", "src/client", "src/cmd", "src/common/base", @@ -200,6 +201,7 @@ api = { path = "src/api" } auth = { path = "src/auth" } cache = { path = "src/cache" } catalog = { path = "src/catalog" } +cli = { path = "src/cli" } client = { path = "src/client" } cmd = { path = "src/cmd", default-features = false } common-base = { path = "src/common/base" } diff --git a/src/catalog/src/information_extension.rs b/src/catalog/src/information_extension.rs new file mode 100644 index 000000000000..55764557a326 --- /dev/null +++ b/src/catalog/src/information_extension.rs @@ -0,0 +1,92 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::ProcedureStatus; +use common_error::ext::BoxedError; +use common_meta::cluster::{ClusterInfo, NodeInfo}; +use common_meta::datanode::RegionStat; +use common_meta::ddl::{ExecutorContext, ProcedureExecutor}; +use common_meta::rpc::procedure; +use common_procedure::{ProcedureInfo, ProcedureState}; +use meta_client::MetaClientRef; +use snafu::ResultExt; + +use crate::error; +use crate::information_schema::InformationExtension; + +pub struct DistributedInformationExtension { + meta_client: MetaClientRef, +} + +impl DistributedInformationExtension { + pub fn new(meta_client: MetaClientRef) -> Self { + Self { meta_client } + } +} + +#[async_trait::async_trait] +impl InformationExtension for DistributedInformationExtension { + type Error = crate::error::Error; + + async fn nodes(&self) -> std::result::Result, Self::Error> { + self.meta_client + .list_nodes(None) + .await + .map_err(BoxedError::new) + .context(error::ListNodesSnafu) + } + + async fn procedures(&self) -> std::result::Result, Self::Error> { + let procedures = self + .meta_client + .list_procedures(&ExecutorContext::default()) + .await + .map_err(BoxedError::new) + .context(error::ListProceduresSnafu)? 
+ .procedures; + let mut result = Vec::with_capacity(procedures.len()); + for procedure in procedures { + let pid = match procedure.id { + Some(pid) => pid, + None => return error::ProcedureIdNotFoundSnafu {}.fail(), + }; + let pid = procedure::pb_pid_to_pid(&pid) + .map_err(BoxedError::new) + .context(error::ConvertProtoDataSnafu)?; + let status = ProcedureStatus::try_from(procedure.status) + .map(|v| v.as_str_name()) + .unwrap_or("Unknown") + .to_string(); + let procedure_info = ProcedureInfo { + id: pid, + type_name: procedure.type_name, + start_time_ms: procedure.start_time_ms, + end_time_ms: procedure.end_time_ms, + state: ProcedureState::Running, + lock_keys: procedure.lock_keys, + }; + result.push((status, procedure_info)); + } + + Ok(result) + } + + async fn region_stats(&self) -> std::result::Result, Self::Error> { + self.meta_client + .list_region_stats() + .await + .map_err(BoxedError::new) + .context(error::ListRegionStatsSnafu) + } +} diff --git a/src/catalog/src/lib.rs b/src/catalog/src/lib.rs index 3444c0e089e6..623f2a363e6d 100644 --- a/src/catalog/src/lib.rs +++ b/src/catalog/src/lib.rs @@ -30,6 +30,7 @@ use table::TableRef; use crate::error::Result; pub mod error; +pub mod information_extension; pub mod kvbackend; pub mod memory; mod metrics; diff --git a/src/cli/Cargo.toml b/src/cli/Cargo.toml new file mode 100644 index 000000000000..b49aa00ee2cc --- /dev/null +++ b/src/cli/Cargo.toml @@ -0,0 +1,65 @@ +[package] +name = "cli" +version.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +async-trait.workspace = true +auth.workspace = true +base64.workspace = true +cache.workspace = true +catalog.workspace = true +chrono.workspace = true +clap.workspace = true +client.workspace = true +common-base.workspace = true +common-catalog.workspace = true +common-config.workspace = true +common-error.workspace = true +common-grpc.workspace = true +common-macro.workspace = true +common-meta.workspace = true +common-options.workspace = true +common-procedure.workspace = true +common-query.workspace = true +common-recordbatch.workspace = true +common-runtime.workspace = true +common-telemetry = { workspace = true, features = [ + "deadlock_detection", +] } +common-time.workspace = true +common-version.workspace = true +common-wal.workspace = true +datatypes.workspace = true +either = "1.8" +etcd-client.workspace = true +futures.workspace = true +humantime.workspace = true +meta-client.workspace = true +nu-ansi-term = "0.46" +query.workspace = true +rand.workspace = true +reqwest.workspace = true +rustyline = "10.1" +serde.workspace = true +serde_json.workspace = true +servers.workspace = true +session.workspace = true +snafu.workspace = true +store-api.workspace = true +substrait.workspace = true +table.workspace = true +tokio.workspace = true +tracing-appender.workspace = true + +[dev-dependencies] +client = { workspace = true, features = ["testing"] } +common-test-util.workspace = true +common-version.workspace = true +serde.workspace = true +temp-env = "0.3" +tempfile.workspace = true diff --git a/src/cmd/src/cli/bench.rs b/src/cli/src/bench.rs similarity index 94% rename from src/cmd/src/cli/bench.rs rename to src/cli/src/bench.rs index f3d1d0f8097f..9731bf8e6fa6 100644 --- a/src/cmd/src/cli/bench.rs +++ b/src/cli/src/bench.rs @@ -19,6 +19,7 @@ use std::time::Duration; use async_trait::async_trait; use clap::Parser; +use common_error::ext::BoxedError; use common_meta::key::{TableMetadataManager, 
TableMetadataManagerRef}; use common_meta::kv_backend::etcd::EtcdStore; use common_meta::peer::Peer; @@ -30,11 +31,9 @@ use rand::Rng; use store_api::storage::RegionNumber; use table::metadata::{RawTableInfo, RawTableMeta, TableId, TableIdent, TableType}; use table::table_name::TableName; -use tracing_appender::non_blocking::WorkerGuard; use self::metadata::TableMetadataBencher; -use crate::cli::{Instance, Tool}; -use crate::error::Result; +use crate::Tool; mod metadata; @@ -62,7 +61,7 @@ pub struct BenchTableMetadataCommand { } impl BenchTableMetadataCommand { - pub async fn build(&self, guard: Vec) -> Result { + pub async fn build(&self) -> std::result::Result, BoxedError> { let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr], 128) .await .unwrap(); @@ -73,7 +72,7 @@ impl BenchTableMetadataCommand { table_metadata_manager, count: self.count, }; - Ok(Instance::new(Box::new(tool), guard)) + Ok(Box::new(tool)) } } @@ -84,7 +83,7 @@ struct BenchTableMetadata { #[async_trait] impl Tool for BenchTableMetadata { - async fn do_work(&self) -> Result<()> { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { let bencher = TableMetadataBencher::new(self.table_metadata_manager.clone(), self.count); bencher.bench_create().await; bencher.bench_get().await; diff --git a/src/cmd/src/cli/bench/metadata.rs b/src/cli/src/bench/metadata.rs similarity index 99% rename from src/cmd/src/cli/bench/metadata.rs rename to src/cli/src/bench/metadata.rs index 9229b0342e88..28343232a1db 100644 --- a/src/cmd/src/cli/bench/metadata.rs +++ b/src/cli/src/bench/metadata.rs @@ -18,7 +18,7 @@ use common_meta::key::table_route::TableRouteValue; use common_meta::key::TableMetadataManagerRef; use table::table_name::TableName; -use crate::cli::bench::{ +use crate::bench::{ bench_self_recorded, create_region_routes, create_region_wal_options, create_table_info, }; diff --git a/src/cmd/src/cli/cmd.rs b/src/cli/src/cmd.rs similarity index 100% rename from src/cmd/src/cli/cmd.rs rename to src/cli/src/cmd.rs diff --git a/src/cmd/src/cli/database.rs b/src/cli/src/database.rs similarity index 98% rename from src/cmd/src/cli/database.rs rename to src/cli/src/database.rs index 9e6b752ea51b..7152aac59270 100644 --- a/src/cmd/src/cli/database.rs +++ b/src/cli/src/database.rs @@ -26,7 +26,8 @@ use snafu::ResultExt; use crate::error::{HttpQuerySqlSnafu, Result, SerdeJsonSnafu}; -pub(crate) struct DatabaseClient { +#[derive(Debug, Clone)] +pub struct DatabaseClient { addr: String, catalog: String, auth_header: Option, diff --git a/src/cli/src/error.rs b/src/cli/src/error.rs new file mode 100644 index 000000000000..bf0b6342c1f9 --- /dev/null +++ b/src/cli/src/error.rs @@ -0,0 +1,316 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; + +use common_error::ext::{BoxedError, ErrorExt}; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use rustyline::error::ReadlineError; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Failed to install ring crypto provider: {}", msg))] + InitTlsProvider { + #[snafu(implicit)] + location: Location, + msg: String, + }, + #[snafu(display("Failed to create default catalog and schema"))] + InitMetadata { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + + #[snafu(display("Failed to init DDL manager"))] + InitDdlManager { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + + #[snafu(display("Failed to init default timezone"))] + InitTimezone { + #[snafu(implicit)] + location: Location, + source: common_time::error::Error, + }, + + #[snafu(display("Failed to start procedure manager"))] + StartProcedureManager { + #[snafu(implicit)] + location: Location, + source: common_procedure::error::Error, + }, + + #[snafu(display("Failed to stop procedure manager"))] + StopProcedureManager { + #[snafu(implicit)] + location: Location, + source: common_procedure::error::Error, + }, + + #[snafu(display("Failed to start wal options allocator"))] + StartWalOptionsAllocator { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + + #[snafu(display("Missing config, msg: {}", msg))] + MissingConfig { + msg: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Illegal config: {}", msg))] + IllegalConfig { + msg: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid REPL command: {reason}"))] + InvalidReplCommand { reason: String }, + + #[snafu(display("Cannot create REPL"))] + ReplCreation { + #[snafu(source)] + error: ReadlineError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Error reading command"))] + Readline { + #[snafu(source)] + error: ReadlineError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to request database, sql: {sql}"))] + RequestDatabase { + sql: String, + #[snafu(source)] + source: client::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to collect RecordBatches"))] + CollectRecordBatches { + #[snafu(implicit)] + location: Location, + source: common_recordbatch::error::Error, + }, + + #[snafu(display("Failed to pretty print Recordbatches"))] + PrettyPrintRecordBatches { + #[snafu(implicit)] + location: Location, + source: common_recordbatch::error::Error, + }, + + #[snafu(display("Failed to start Meta client"))] + StartMetaClient { + #[snafu(implicit)] + location: Location, + source: meta_client::error::Error, + }, + + #[snafu(display("Failed to parse SQL: {}", sql))] + ParseSql { + sql: String, + #[snafu(implicit)] + location: Location, + source: query::error::Error, + }, + + #[snafu(display("Failed to plan statement"))] + PlanStatement { + #[snafu(implicit)] + location: Location, + source: query::error::Error, + }, + + #[snafu(display("Failed to encode logical plan in substrait"))] + SubstraitEncodeLogicalPlan { + #[snafu(implicit)] + location: Location, + source: substrait::error::Error, + }, + + #[snafu(display("Failed to load layered config"))] + LoadLayeredConfig { + #[snafu(source(from(common_config::error::Error, Box::new)))] + source: Box, + #[snafu(implicit)] + location: Location, + }, + + 
#[snafu(display("Failed to connect to Etcd at {etcd_addr}"))] + ConnectEtcd { + etcd_addr: String, + #[snafu(source)] + error: etcd_client::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to serde json"))] + SerdeJson { + #[snafu(source)] + error: serde_json::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to run http request: {reason}"))] + HttpQuerySql { + reason: String, + #[snafu(source)] + error: reqwest::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Empty result from output"))] + EmptyResult { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to manipulate file"))] + FileIo { + #[snafu(implicit)] + location: Location, + #[snafu(source)] + error: std::io::Error, + }, + + #[snafu(display("Failed to create directory {}", dir))] + CreateDir { + dir: String, + #[snafu(source)] + error: std::io::Error, + }, + + #[snafu(display("Failed to spawn thread"))] + SpawnThread { + #[snafu(source)] + error: std::io::Error, + }, + + #[snafu(display("Other error"))] + Other { + source: BoxedError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to build runtime"))] + BuildRuntime { + #[snafu(implicit)] + location: Location, + source: common_runtime::error::Error, + }, + + #[snafu(display("Failed to get cache from cache registry: {}", name))] + CacheRequired { + #[snafu(implicit)] + location: Location, + name: String, + }, + + #[snafu(display("Failed to build cache registry"))] + BuildCacheRegistry { + #[snafu(implicit)] + location: Location, + source: cache::error::Error, + }, + + #[snafu(display("Failed to initialize meta client"))] + MetaClientInit { + #[snafu(implicit)] + location: Location, + source: meta_client::error::Error, + }, + + #[snafu(display("Cannot find schema {schema} in catalog {catalog}"))] + SchemaNotFound { + catalog: String, + schema: String, + #[snafu(implicit)] + location: Location, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::InitMetadata { source, .. } | Error::InitDdlManager { source, .. } => { + source.status_code() + } + + Error::MissingConfig { .. } + | Error::LoadLayeredConfig { .. } + | Error::IllegalConfig { .. } + | Error::InvalidReplCommand { .. } + | Error::InitTimezone { .. } + | Error::ConnectEtcd { .. } + | Error::CreateDir { .. } + | Error::EmptyResult { .. } => StatusCode::InvalidArguments, + + Error::StartProcedureManager { source, .. } + | Error::StopProcedureManager { source, .. } => source.status_code(), + Error::StartWalOptionsAllocator { source, .. } => source.status_code(), + Error::ReplCreation { .. } | Error::Readline { .. } | Error::HttpQuerySql { .. } => { + StatusCode::Internal + } + Error::RequestDatabase { source, .. } => source.status_code(), + Error::CollectRecordBatches { source, .. } + | Error::PrettyPrintRecordBatches { source, .. } => source.status_code(), + Error::StartMetaClient { source, .. } => source.status_code(), + Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => { + source.status_code() + } + Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(), + + Error::SerdeJson { .. } + | Error::FileIo { .. } + | Error::SpawnThread { .. } + | Error::InitTlsProvider { .. } => StatusCode::Unexpected, + + Error::Other { source, .. } => source.status_code(), + + Error::BuildRuntime { source, .. } => source.status_code(), + + Error::CacheRequired { .. 
} | Error::BuildCacheRegistry { .. } => StatusCode::Internal, + Error::MetaClientInit { source, .. } => source.status_code(), + Error::SchemaNotFound { .. } => StatusCode::DatabaseNotFound, + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/cmd/src/cli/export.rs b/src/cli/src/export.rs similarity index 81% rename from src/cmd/src/cli/export.rs rename to src/cli/src/export.rs index 6d6cb6756b82..91e4be22bb93 100644 --- a/src/cmd/src/cli/export.rs +++ b/src/cli/src/export.rs @@ -19,6 +19,7 @@ use std::time::Duration; use async_trait::async_trait; use clap::{Parser, ValueEnum}; +use common_error::ext::BoxedError; use common_telemetry::{debug, error, info}; use serde_json::Value; use snafu::{OptionExt, ResultExt}; @@ -26,11 +27,10 @@ use tokio::fs::File; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::sync::Semaphore; use tokio::time::Instant; -use tracing_appender::non_blocking::WorkerGuard; -use crate::cli::database::DatabaseClient; -use crate::cli::{database, Instance, Tool}; +use crate::database::DatabaseClient; use crate::error::{EmptyResultSnafu, Error, FileIoSnafu, Result, SchemaNotFoundSnafu}; +use crate::{database, Tool}; type TableReference = (String, String, String); @@ -94,8 +94,9 @@ pub struct ExportCommand { } impl ExportCommand { - pub async fn build(&self, guard: Vec) -> Result { - let (catalog, schema) = database::split_database(&self.database)?; + pub async fn build(&self) -> std::result::Result, BoxedError> { + let (catalog, schema) = + database::split_database(&self.database).map_err(BoxedError::new)?; let database_client = DatabaseClient::new( self.addr.clone(), @@ -105,19 +106,16 @@ impl ExportCommand { self.timeout.unwrap_or_default(), ); - Ok(Instance::new( - Box::new(Export { - catalog, - schema, - database_client, - output_dir: self.output_dir.clone(), - parallelism: self.export_jobs, - target: self.target.clone(), - start_time: self.start_time.clone(), - end_time: self.end_time.clone(), - }), - guard, - )) + Ok(Box::new(Export { + catalog, + schema, + database_client, + output_dir: self.output_dir.clone(), + parallelism: self.export_jobs, + target: self.target.clone(), + start_time: self.start_time.clone(), + end_time: self.end_time.clone(), + })) } } @@ -465,97 +463,22 @@ impl Export { #[async_trait] impl Tool for Export { - async fn do_work(&self) -> Result<()> { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { match self.target { ExportTarget::Schema => { - self.export_create_database().await?; - self.export_create_table().await + self.export_create_database() + .await + .map_err(BoxedError::new)?; + self.export_create_table().await.map_err(BoxedError::new) } - ExportTarget::Data => self.export_database_data().await, + ExportTarget::Data => self.export_database_data().await.map_err(BoxedError::new), ExportTarget::All => { - self.export_create_database().await?; - self.export_create_table().await?; - self.export_database_data().await + self.export_create_database() + .await + .map_err(BoxedError::new)?; + self.export_create_table().await.map_err(BoxedError::new)?; + self.export_database_data().await.map_err(BoxedError::new) } } } } - -#[cfg(test)] -mod tests { - use clap::Parser; - use client::{Client, Database}; - use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; - use common_telemetry::logging::LoggingOptions; - - use crate::error::Result as CmdResult; - use crate::options::GlobalOptions; - use crate::{cli, standalone, App}; - - #[tokio::test(flavor = "multi_thread")] - async fn 
test_export_create_table_with_quoted_names() -> CmdResult<()> { - let output_dir = tempfile::tempdir().unwrap(); - - let standalone = standalone::Command::parse_from([ - "standalone", - "start", - "--data-home", - &*output_dir.path().to_string_lossy(), - ]); - - let standalone_opts = standalone.load_options(&GlobalOptions::default()).unwrap(); - let mut instance = standalone.build(standalone_opts).await?; - instance.start().await?; - - let client = Client::with_urls(["127.0.0.1:4001"]); - let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client); - database - .sql(r#"CREATE DATABASE "cli.export.create_table";"#) - .await - .unwrap(); - database - .sql( - r#"CREATE TABLE "cli.export.create_table"."a.b.c"( - ts TIMESTAMP, - TIME INDEX (ts) - ) engine=mito; - "#, - ) - .await - .unwrap(); - - let output_dir = tempfile::tempdir().unwrap(); - let cli = cli::Command::parse_from([ - "cli", - "export", - "--addr", - "127.0.0.1:4000", - "--output-dir", - &*output_dir.path().to_string_lossy(), - "--target", - "schema", - ]); - let mut cli_app = cli.build(LoggingOptions::default()).await?; - cli_app.start().await?; - - instance.stop().await?; - - let output_file = output_dir - .path() - .join("greptime") - .join("cli.export.create_table") - .join("create_tables.sql"); - let res = std::fs::read_to_string(output_file).unwrap(); - let expect = r#"CREATE TABLE IF NOT EXISTS "a.b.c" ( - "ts" TIMESTAMP(3) NOT NULL, - TIME INDEX ("ts") -) - -ENGINE=mito -; -"#; - assert_eq!(res.trim(), expect.trim()); - - Ok(()) - } -} diff --git a/src/cmd/src/cli/helper.rs b/src/cli/src/helper.rs similarity index 99% rename from src/cmd/src/cli/helper.rs rename to src/cli/src/helper.rs index 08b12595149e..ee47e0f577b1 100644 --- a/src/cmd/src/cli/helper.rs +++ b/src/cli/src/helper.rs @@ -19,7 +19,7 @@ use rustyline::highlight::{Highlighter, MatchingBracketHighlighter}; use rustyline::hint::{Hinter, HistoryHinter}; use rustyline::validate::{ValidationContext, ValidationResult, Validator}; -use crate::cli::cmd::ReplCommand; +use crate::cmd::ReplCommand; pub(crate) struct RustylineHelper { hinter: HistoryHinter, diff --git a/src/cmd/src/cli/import.rs b/src/cli/src/import.rs similarity index 89% rename from src/cmd/src/cli/import.rs rename to src/cli/src/import.rs index 9cb7b60f59e7..f76560fbcd55 100644 --- a/src/cmd/src/cli/import.rs +++ b/src/cli/src/import.rs @@ -19,15 +19,15 @@ use std::time::Duration; use async_trait::async_trait; use clap::{Parser, ValueEnum}; use common_catalog::consts::DEFAULT_SCHEMA_NAME; +use common_error::ext::BoxedError; use common_telemetry::{error, info, warn}; use snafu::{OptionExt, ResultExt}; use tokio::sync::Semaphore; use tokio::time::Instant; -use tracing_appender::non_blocking::WorkerGuard; -use crate::cli::database::DatabaseClient; -use crate::cli::{database, Instance, Tool}; +use crate::database::DatabaseClient; use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu}; +use crate::{database, Tool}; #[derive(Debug, Default, Clone, ValueEnum)] enum ImportTarget { @@ -79,8 +79,9 @@ pub struct ImportCommand { } impl ImportCommand { - pub async fn build(&self, guard: Vec) -> Result { - let (catalog, schema) = database::split_database(&self.database)?; + pub async fn build(&self) -> std::result::Result, BoxedError> { + let (catalog, schema) = + database::split_database(&self.database).map_err(BoxedError::new)?; let database_client = DatabaseClient::new( self.addr.clone(), catalog.clone(), @@ -89,17 +90,14 @@ impl ImportCommand { 
self.timeout.unwrap_or_default(), ); - Ok(Instance::new( - Box::new(Import { - catalog, - schema, - database_client, - input_dir: self.input_dir.clone(), - parallelism: self.import_jobs, - target: self.target.clone(), - }), - guard, - )) + Ok(Box::new(Import { + catalog, + schema, + database_client, + input_dir: self.input_dir.clone(), + parallelism: self.import_jobs, + target: self.target.clone(), + })) } } @@ -218,13 +216,13 @@ impl Import { #[async_trait] impl Tool for Import { - async fn do_work(&self) -> Result<()> { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { match self.target { - ImportTarget::Schema => self.import_create_table().await, - ImportTarget::Data => self.import_database_data().await, + ImportTarget::Schema => self.import_create_table().await.map_err(BoxedError::new), + ImportTarget::Data => self.import_database_data().await.map_err(BoxedError::new), ImportTarget::All => { - self.import_create_table().await?; - self.import_database_data().await + self.import_create_table().await.map_err(BoxedError::new)?; + self.import_database_data().await.map_err(BoxedError::new) } } } diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs new file mode 100644 index 000000000000..3991f3a666b5 --- /dev/null +++ b/src/cli/src/lib.rs @@ -0,0 +1,60 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod bench; +pub mod error; +// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373 +#[allow(unused)] +mod cmd; +mod export; +mod helper; + +// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373 +mod database; +mod import; +#[allow(unused)] +mod repl; + +use async_trait::async_trait; +use clap::Parser; +use common_error::ext::BoxedError; +pub use database::DatabaseClient; +use error::Result; +pub use repl::Repl; + +pub use crate::bench::BenchTableMetadataCommand; +pub use crate::export::ExportCommand; +pub use crate::import::ImportCommand; + +#[async_trait] +pub trait Tool: Send + Sync { + async fn do_work(&self) -> std::result::Result<(), BoxedError>; +} + +#[derive(Debug, Parser)] +pub(crate) struct AttachCommand { + #[clap(long)] + pub(crate) grpc_addr: String, + #[clap(long)] + pub(crate) meta_addr: Option, + #[clap(long, action)] + pub(crate) disable_helper: bool, +} + +impl AttachCommand { + #[allow(dead_code)] + async fn build(self) -> Result> { + unimplemented!("Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373") + } +} diff --git a/src/cmd/src/cli/repl.rs b/src/cli/src/repl.rs similarity index 98% rename from src/cmd/src/cli/repl.rs rename to src/cli/src/repl.rs index 8c6e154a26d6..4c2ef8ffe396 100644 --- a/src/cmd/src/cli/repl.rs +++ b/src/cli/src/repl.rs @@ -20,6 +20,7 @@ use cache::{ build_fundamental_cache_registry, with_default_composite_cache_registry, TABLE_CACHE_NAME, TABLE_ROUTE_CACHE_NAME, }; +use catalog::information_extension::DistributedInformationExtension; use catalog::kvbackend::{ CachedKvBackend, CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend, }; @@ -44,15 +45,14 @@ use session::context::QueryContext; use snafu::{OptionExt, ResultExt}; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; -use crate::cli::cmd::ReplCommand; -use crate::cli::helper::RustylineHelper; -use crate::cli::AttachCommand; +use crate::cmd::ReplCommand; use crate::error::{ CollectRecordBatchesSnafu, ParseSqlSnafu, PlanStatementSnafu, PrettyPrintRecordBatchesSnafu, ReadlineSnafu, ReplCreationSnafu, RequestDatabaseSnafu, Result, StartMetaClientSnafu, SubstraitEncodeLogicalPlanSnafu, }; -use crate::{error, DistributedInformationExtension}; +use crate::helper::RustylineHelper; +use crate::{error, AttachCommand}; /// Captures the state of the repl, gathers commands and executes them one by one pub struct Repl { diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index c1f20cc9c526..3b498c829215 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -25,6 +25,7 @@ cache.workspace = true catalog.workspace = true chrono.workspace = true clap.workspace = true +cli.workspace = true client.workspace = true common-base.workspace = true common-catalog.workspace = true diff --git a/src/cmd/src/cli.rs b/src/cmd/src/cli.rs index fc43e0997665..55ebe64bc262 100644 --- a/src/cmd/src/cli.rs +++ b/src/cmd/src/cli.rs @@ -12,39 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod bench; - -// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373 -#[allow(unused)] -mod cmd; -mod export; -mod helper; - -// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373 -mod database; -mod import; -#[allow(unused)] -mod repl; - -use async_trait::async_trait; -use bench::BenchTableMetadataCommand; use clap::Parser; +use cli::Tool; use common_telemetry::logging::{LoggingOptions, TracingOptions}; -pub use repl::Repl; +use plugins::SubCommand; +use snafu::ResultExt; use tracing_appender::non_blocking::WorkerGuard; -use self::export::ExportCommand; -use crate::cli::import::ImportCommand; -use crate::error::Result; use crate::options::GlobalOptions; -use crate::App; - +use crate::{error, App, Result}; pub const APP_NAME: &str = "greptime-cli"; - -#[async_trait] -pub trait Tool: Send + Sync { - async fn do_work(&self) -> Result<()>; -} +use async_trait::async_trait; pub struct Instance { tool: Box, @@ -54,12 +32,16 @@ pub struct Instance { } impl Instance { - fn new(tool: Box, guard: Vec) -> Self { + pub fn new(tool: Box, guard: Vec) -> Self { Self { tool, _guard: guard, } } + + pub async fn start(&mut self) -> Result<()> { + self.tool.do_work().await.context(error::StartCliSnafu) + } } #[async_trait] @@ -69,7 +51,8 @@ impl App for Instance { } async fn start(&mut self) -> Result<()> { - self.tool.do_work().await + self.start().await.unwrap(); + Ok(()) } fn wait_signal(&self) -> bool { @@ -96,7 +79,12 @@ impl Command { None, ); - self.cmd.build(guard).await + let tool = self.cmd.build().await.context(error::BuildCliSnafu)?; + let instance = Instance { + tool, + _guard: guard, + }; + Ok(instance) } pub fn load_options(&self, global_options: &GlobalOptions) -> Result { @@ -112,38 +100,81 @@ impl Command { } } -#[derive(Parser)] -enum SubCommand { - // Attach(AttachCommand), - Bench(BenchTableMetadataCommand), - Export(ExportCommand), - Import(ImportCommand), -} +#[cfg(test)] +mod tests { + use clap::Parser; + use client::{Client, Database}; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; + use common_telemetry::logging::LoggingOptions; + + use crate::error::Result as CmdResult; + use crate::options::GlobalOptions; + use crate::{cli, standalone, App}; + + #[tokio::test(flavor = "multi_thread")] + async fn test_export_create_table_with_quoted_names() -> CmdResult<()> { + let output_dir = tempfile::tempdir().unwrap(); + + let standalone = standalone::Command::parse_from([ + "standalone", + "start", + "--data-home", + &*output_dir.path().to_string_lossy(), + ]); + + let standalone_opts = standalone.load_options(&GlobalOptions::default()).unwrap(); + let mut instance = standalone.build(standalone_opts).await?; + instance.start().await?; + + let client = Client::with_urls(["127.0.0.1:4001"]); + let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client); + database + .sql(r#"CREATE DATABASE "cli.export.create_table";"#) + .await + .unwrap(); + database + .sql( + r#"CREATE TABLE "cli.export.create_table"."a.b.c"( + ts TIMESTAMP, + TIME INDEX (ts) + ) engine=mito; + "#, + ) + .await + .unwrap(); + + let output_dir = tempfile::tempdir().unwrap(); + let cli = cli::Command::parse_from([ + "cli", + "export", + "--addr", + "127.0.0.1:4000", + "--output-dir", + &*output_dir.path().to_string_lossy(), + "--target", + "schema", + ]); + let mut cli_app = cli.build(LoggingOptions::default()).await?; + cli_app.start().await?; + + instance.stop().await?; + + let output_file = output_dir + .path() + .join("greptime") + 
.join("cli.export.create_table") + .join("create_tables.sql"); + let res = std::fs::read_to_string(output_file).unwrap(); + let expect = r#"CREATE TABLE IF NOT EXISTS "a.b.c" ( + "ts" TIMESTAMP(3) NOT NULL, + TIME INDEX ("ts") +) + +ENGINE=mito +; +"#; + assert_eq!(res.trim(), expect.trim()); -impl SubCommand { - async fn build(&self, guard: Vec) -> Result { - match self { - // SubCommand::Attach(cmd) => cmd.build().await, - SubCommand::Bench(cmd) => cmd.build(guard).await, - SubCommand::Export(cmd) => cmd.build(guard).await, - SubCommand::Import(cmd) => cmd.build(guard).await, - } - } -} - -#[derive(Debug, Parser)] -pub(crate) struct AttachCommand { - #[clap(long)] - pub(crate) grpc_addr: String, - #[clap(long)] - pub(crate) meta_addr: Option, - #[clap(long, action)] - pub(crate) disable_helper: bool, -} - -impl AttachCommand { - #[allow(dead_code)] - async fn build(self) -> Result { - unimplemented!("Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373") + Ok(()) } } diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs index f042b48478d4..0af9966fc6ac 100644 --- a/src/cmd/src/error.rs +++ b/src/cmd/src/error.rs @@ -114,6 +114,20 @@ pub enum Error { source: frontend::error::Error, }, + #[snafu(display("Failed to build cli"))] + BuildCli { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + + #[snafu(display("Failed to start cli"))] + StartCli { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Failed to build meta server"))] BuildMetaServer { #[snafu(implicit)] @@ -346,6 +360,8 @@ impl ErrorExt for Error { Error::ShutdownMetaServer { source, .. } => source.status_code(), Error::BuildMetaServer { source, .. } => source.status_code(), Error::UnsupportedSelectorType { source, .. } => source.status_code(), + Error::BuildCli { source, .. } => source.status_code(), + Error::StartCli { source, .. } => source.status_code(), Error::InitMetadata { source, .. } | Error::InitDdlManager { source, .. 
} => { source.status_code() diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index a2b6b41c019a..a9ad12bfbc02 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; +use catalog::information_extension::DistributedInformationExtension; use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend}; use clap::Parser; use client::client_manager::NodeClients; @@ -41,7 +42,7 @@ use crate::error::{ MissingConfigSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu, }; use crate::options::{GlobalOptions, GreptimeOptions}; -use crate::{log_versions, App, DistributedInformationExtension}; +use crate::{log_versions, App}; pub const APP_NAME: &str = "greptime-flownode"; diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index d90a286fc451..36bd37a51980 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -17,6 +17,7 @@ use std::time::Duration; use async_trait::async_trait; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; +use catalog::information_extension::DistributedInformationExtension; use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend}; use clap::Parser; use client::client_manager::NodeClients; @@ -46,7 +47,7 @@ use crate::error::{ Result, StartFrontendSnafu, }; use crate::options::{GlobalOptions, GreptimeOptions}; -use crate::{log_versions, App, DistributedInformationExtension}; +use crate::{log_versions, App}; type FrontendOptions = GreptimeOptions; diff --git a/src/cmd/src/lib.rs b/src/cmd/src/lib.rs index 3a719b9589a7..acd27f46d731 100644 --- a/src/cmd/src/lib.rs +++ b/src/cmd/src/lib.rs @@ -15,17 +15,7 @@ #![feature(assert_matches, let_chains)] use async_trait::async_trait; -use catalog::information_schema::InformationExtension; -use client::api::v1::meta::ProcedureStatus; -use common_error::ext::BoxedError; -use common_meta::cluster::{ClusterInfo, NodeInfo}; -use common_meta::datanode::RegionStat; -use common_meta::ddl::{ExecutorContext, ProcedureExecutor}; -use common_meta::rpc::procedure; -use common_procedure::{ProcedureInfo, ProcedureState}; use common_telemetry::{error, info}; -use meta_client::MetaClientRef; -use snafu::ResultExt; use crate::error::Result; @@ -130,69 +120,3 @@ fn log_env_flags() { info!("argument: {}", argument); } } - -pub struct DistributedInformationExtension { - meta_client: MetaClientRef, -} - -impl DistributedInformationExtension { - pub fn new(meta_client: MetaClientRef) -> Self { - Self { meta_client } - } -} - -#[async_trait::async_trait] -impl InformationExtension for DistributedInformationExtension { - type Error = catalog::error::Error; - - async fn nodes(&self) -> std::result::Result, Self::Error> { - self.meta_client - .list_nodes(None) - .await - .map_err(BoxedError::new) - .context(catalog::error::ListNodesSnafu) - } - - async fn procedures(&self) -> std::result::Result, Self::Error> { - let procedures = self - .meta_client - .list_procedures(&ExecutorContext::default()) - .await - .map_err(BoxedError::new) - .context(catalog::error::ListProceduresSnafu)? 
- .procedures; - let mut result = Vec::with_capacity(procedures.len()); - for procedure in procedures { - let pid = match procedure.id { - Some(pid) => pid, - None => return catalog::error::ProcedureIdNotFoundSnafu {}.fail(), - }; - let pid = procedure::pb_pid_to_pid(&pid) - .map_err(BoxedError::new) - .context(catalog::error::ConvertProtoDataSnafu)?; - let status = ProcedureStatus::try_from(procedure.status) - .map(|v| v.as_str_name()) - .unwrap_or("Unknown") - .to_string(); - let procedure_info = ProcedureInfo { - id: pid, - type_name: procedure.type_name, - start_time_ms: procedure.start_time_ms, - end_time_ms: procedure.end_time_ms, - state: ProcedureState::Running, - lock_keys: procedure.lock_keys, - }; - result.push((status, procedure_info)); - } - - Ok(result) - } - - async fn region_stats(&self) -> std::result::Result, Self::Error> { - self.meta_client - .list_region_stats() - .await - .map_err(BoxedError::new) - .context(catalog::error::ListRegionStatsSnafu) - } -} diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index b6bdf6189c79..90b96f32dc9e 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -149,7 +149,7 @@ use crate::DatanodeId; pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; pub const MAINTENANCE_KEY: &str = "__maintenance"; -const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table"; +pub const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table"; pub const TABLE_INFO_KEY_PREFIX: &str = "__table_info"; pub const VIEW_INFO_KEY_PREFIX: &str = "__view_info"; pub const TABLE_NAME_KEY_PREFIX: &str = "__table_name"; diff --git a/src/common/meta/src/kv_backend/txn.rs b/src/common/meta/src/kv_backend/txn.rs index 77cd0f921e21..ea3e95aa3ca6 100644 --- a/src/common/meta/src/kv_backend/txn.rs +++ b/src/common/meta/src/kv_backend/txn.rs @@ -136,6 +136,13 @@ pub struct Txn { c_else: bool, } +#[cfg(any(test, feature = "testing"))] +impl Txn { + pub fn req(&self) -> &TxnRequest { + &self.req + } +} + impl Txn { pub fn merge_all>(values: T) -> Self { values diff --git a/src/plugins/Cargo.toml b/src/plugins/Cargo.toml index 977b7d9b7c7e..e352e647edd1 100644 --- a/src/plugins/Cargo.toml +++ b/src/plugins/Cargo.toml @@ -9,7 +9,10 @@ workspace = true [dependencies] auth.workspace = true +clap.workspace = true +cli.workspace = true common-base.workspace = true +common-error.workspace = true datanode.workspace = true frontend.workspace = true meta-srv.workspace = true diff --git a/src/plugins/src/cli.rs b/src/plugins/src/cli.rs new file mode 100644 index 000000000000..79f5c64aa0ef --- /dev/null +++ b/src/plugins/src/cli.rs @@ -0,0 +1,36 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use clap::Parser; +use cli::{BenchTableMetadataCommand, ExportCommand, ImportCommand, Tool}; +use common_error::ext::BoxedError; + +#[derive(Parser)] +pub enum SubCommand { + // Attach(AttachCommand), + Bench(BenchTableMetadataCommand), + Export(ExportCommand), + Import(ImportCommand), +} + +impl SubCommand { + pub async fn build(&self) -> std::result::Result, BoxedError> { + match self { + // SubCommand::Attach(cmd) => cmd.build().await, + SubCommand::Bench(cmd) => cmd.build().await, + SubCommand::Export(cmd) => cmd.build().await, + SubCommand::Import(cmd) => cmd.build().await, + } + } +} diff --git a/src/plugins/src/lib.rs b/src/plugins/src/lib.rs index a29ed0e4a8e7..fdb7abc01710 100644 --- a/src/plugins/src/lib.rs +++ b/src/plugins/src/lib.rs @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod cli; mod datanode; mod frontend; mod meta_srv; mod options; +pub use cli::SubCommand; pub use datanode::{setup_datanode_plugins, start_datanode_plugins}; pub use frontend::{setup_frontend_plugins, start_frontend_plugins}; pub use meta_srv::{setup_metasrv_plugins, start_metasrv_plugins}; diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index 8bdb8299f7c4..83778da5bbc4 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -23,10 +23,10 @@ use cache::{ build_datanode_cache_registry, build_fundamental_cache_registry, with_default_composite_cache_registry, }; +use catalog::information_extension::DistributedInformationExtension; use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend}; use client::client_manager::NodeClients; use client::Client; -use cmd::DistributedInformationExtension; use common_base::Plugins; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; From c732016fa0f6ebee536e00fd945a728ae79ce283 Mon Sep 17 00:00:00 2001 From: ZonaHe Date: Thu, 5 Dec 2024 21:42:36 +0800 Subject: [PATCH 02/36] feat: update dashboard to v0.7.0 (#5100) Co-authored-by: sunchanglong --- src/servers/dashboard/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 14909610e644..8b20e48523e5 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.6.1 +v0.7.0 From f9ebb58a122dcc9c8ed8aa6336633ec1d63978f1 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 6 Dec 2024 10:10:17 +0800 Subject: [PATCH 03/36] fix: put PipelineChecker at the end (#5092) fix: put PipelineChecker in the end Signed-off-by: Ruihang Xia --- src/query/src/query_engine/state.rs | 15 +++++++++++++++ .../common/tql-explain-analyze/explain.result | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index edc10e076a73..74db773031d0 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -32,6 +32,8 @@ use datafusion::error::Result as DfResult; use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState}; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::physical_optimizer::optimizer::PhysicalOptimizer; +use datafusion::physical_optimizer::pipeline_checker::PipelineChecker; +use datafusion::physical_optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::ExecutionPlan; use 
datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; use datafusion_expr::LogicalPlan as DfLogicalPlan; @@ -127,6 +129,12 @@ impl QueryEngineState { .push(Arc::new(WindowedSortPhysicalRule)); // Add rule to remove duplicate nodes generated by other rules. Run this in the last. physical_optimizer.rules.push(Arc::new(RemoveDuplicate)); + // Place PipelineChecker at the end of the list to ensure that it runs after all other rules. + Self::remove_physical_optimizer_rule( + &mut physical_optimizer.rules, + PipelineChecker {}.name(), + ); + physical_optimizer.rules.push(Arc::new(PipelineChecker {})); let session_state = SessionState::new_with_config_rt(session_config, runtime_env) .with_analyzer_rules(analyzer.rules) @@ -159,6 +167,13 @@ impl QueryEngineState { rules.retain(|rule| rule.name() != name); } + fn remove_physical_optimizer_rule( + rules: &mut Vec>, + name: &str, + ) { + rules.retain(|rule| rule.name() != name); + } + /// Optimize the logical plan by the extension anayzer rules. pub fn optimize_by_extension_rules( &self, diff --git a/tests/cases/standalone/common/tql-explain-analyze/explain.result b/tests/cases/standalone/common/tql-explain-analyze/explain.result index 4cdb5a879007..ddfcc97ed1a4 100644 --- a/tests/cases/standalone/common/tql-explain-analyze/explain.result +++ b/tests/cases/standalone/common/tql-explain-analyze/explain.result @@ -182,9 +182,9 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test; |_|_| | physical_plan after LimitAggregation_| SAME TEXT AS ABOVE_| | physical_plan after ProjectionPushdown_| SAME TEXT AS ABOVE_| -| physical_plan after PipelineChecker_| SAME TEXT AS ABOVE_| | physical_plan after WindowedSortRule_| SAME TEXT AS ABOVE_| | physical_plan after RemoveDuplicateRule_| SAME TEXT AS ABOVE_| +| physical_plan after PipelineChecker_| SAME TEXT AS ABOVE_| | physical_plan_| PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_| |_|_PromSeriesNormalizeExec: offset=[0], time index=[j], filter NaN: [false]_| |_|_PromSeriesDivideExec: tags=["k"]_| From 7a3d6f2bd572197d57c503cdb96d8ae16072fb50 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 6 Dec 2024 10:59:16 +0800 Subject: [PATCH 04/36] docs: remove lg_prof_interval from env (#5103) --- docs/how-to/how-to-profile-memory.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how-to/how-to-profile-memory.md b/docs/how-to/how-to-profile-memory.md index 257e834e0b00..3284df9e328f 100644 --- a/docs/how-to/how-to-profile-memory.md +++ b/docs/how-to/how-to-profile-memory.md @@ -23,7 +23,7 @@ curl https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph Start GreptimeDB instance with environment variables: ```bash -MALLOC_CONF=prof:true,lg_prof_interval:28 ./target/debug/greptime standalone start +MALLOC_CONF=prof:true ./target/debug/greptime standalone start ``` Dump memory profiling data through HTTP API: From 2b699e735c08a4b528a02236bb97951203a294a3 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Fri, 6 Dec 2024 11:14:08 +0800 Subject: [PATCH 05/36] chore: correct example config file (#5105) * chore: correct example config file * fix: fix unit test --- config/config.md | 2 +- config/metasrv.example.toml | 2 +- src/meta-srv/src/metasrv.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/config.md b/config/config.md index 15025b871125..ec00eb98b730 100644 --- a/config/config.md +++ b/config/config.md @@ -286,7 +286,7 @@ | `data_home` | String | `/tmp/metasrv/` | The working home 
 directory. |
 | `bind_addr` | String | `127.0.0.1:3002` | The bind address of metasrv. |
 | `server_addr` | String | `127.0.0.1:3002` | The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost. |
-| `store_addr` | String | `127.0.0.1:2379` | Store server address default to etcd store. |
+| `store_addrs` | Array | -- | Store server address default to etcd store. |
 | `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>
For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". | | `use_memory_store` | Bool | `false` | Store data in memory. | | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. | diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index b80d1c164e0e..bcd7ee41412b 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -8,7 +8,7 @@ bind_addr = "127.0.0.1:3002" server_addr = "127.0.0.1:3002" ## Store server address default to etcd store. -store_addr = "127.0.0.1:2379" +store_addrs = ["127.0.0.1:2379"] ## Datanode selector type. ## - `round_robin` (default value) diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 9de0487d01cc..716b85f83485 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -179,7 +179,7 @@ impl Default for MetasrvOptions { impl Configurable for MetasrvOptions { fn env_list_keys() -> Option<&'static [&'static str]> { - Some(&["wal.broker_endpoints"]) + Some(&["wal.broker_endpoints", "store_addrs"]) } } From dc83b0aa152cc2d2a9f00b3ca5616c4ca78da2e5 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 6 Dec 2024 16:22:25 +0800 Subject: [PATCH 06/36] feat: add more transaction related statement for postgres interface (#5081) * fix: add match for start and abort transactions * feat: add commit transaction statement * feat: add warning on transaction start * chore: update message --- src/servers/src/postgres/fixtures.rs | 45 +++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/servers/src/postgres/fixtures.rs b/src/servers/src/postgres/fixtures.rs index 18c3661b9334..895f5c03e4a9 100644 --- a/src/servers/src/postgres/fixtures.rs +++ b/src/servers/src/postgres/fixtures.rs @@ -51,22 +51,40 @@ static VAR_VALUES: Lazy> = Lazy::new(|| { static SHOW_PATTERN: Lazy = Lazy::new(|| Regex::new("(?i)^SHOW (.*?);?$").unwrap()); static SET_TRANSACTION_PATTERN: Lazy = Lazy::new(|| Regex::new("(?i)^SET TRANSACTION (.*?);?$").unwrap()); -static TRANSACTION_PATTERN: Lazy = - Lazy::new(|| Regex::new("(?i)^(BEGIN|ROLLBACK|COMMIT);?").unwrap()); +static START_TRANSACTION_PATTERN: Lazy = + Lazy::new(|| Regex::new("(?i)^(START TRANSACTION.*|BEGIN);?").unwrap()); +static COMMIT_TRANSACTION_PATTERN: Lazy = + Lazy::new(|| Regex::new("(?i)^(COMMIT TRANSACTION|COMMIT);?").unwrap()); +static ABORT_TRANSACTION_PATTERN: Lazy = + Lazy::new(|| Regex::new("(?i)^(ABORT TRANSACTION|ROLLBACK);?").unwrap()); /// Test if given query statement matches the patterns pub(crate) fn matches(query: &str) -> bool { - TRANSACTION_PATTERN.captures(query).is_some() + START_TRANSACTION_PATTERN.is_match(query) + || COMMIT_TRANSACTION_PATTERN.is_match(query) + || ABORT_TRANSACTION_PATTERN.is_match(query) || SHOW_PATTERN.captures(query).is_some() || SET_TRANSACTION_PATTERN.is_match(query) } +fn set_transaction_warning(query_ctx: QueryContextRef) { + query_ctx.set_warning("Please note transaction is not supported in GreptimeDB.".to_string()); +} + /// Process unsupported SQL and return fixed result as a compatibility solution -pub(crate) fn process<'a>(query: &str, _query_ctx: QueryContextRef) -> Option>> { +pub(crate) fn process<'a>(query: &str, query_ctx: QueryContextRef) -> Option>> { // Transaction directives: - if let Some(tx) = TRANSACTION_PATTERN.captures(query) { - let tx_tag = &tx[1]; - Some(vec![Response::Execution(Tag::new(&tx_tag.to_uppercase()))]) + if START_TRANSACTION_PATTERN.is_match(query) { + set_transaction_warning(query_ctx); + if 
query.to_lowercase().starts_with("begin") { + Some(vec![Response::Execution(Tag::new("BEGIN"))]) + } else { + Some(vec![Response::Execution(Tag::new("START TRANSACTION"))]) + } + } else if ABORT_TRANSACTION_PATTERN.is_match(query) { + Some(vec![Response::Execution(Tag::new("ROLLBACK"))]) + } else if COMMIT_TRANSACTION_PATTERN.is_match(query) { + Some(vec![Response::Execution(Tag::new("COMMIT"))]) } else if let Some(show_var) = SHOW_PATTERN.captures(query) { let show_var = show_var[1].to_lowercase(); if let Some(value) = VAR_VALUES.get(&show_var.as_ref()) { @@ -150,6 +168,19 @@ mod test { "SET", query_context.clone(), ); + assert_tag( + "START TRANSACTION isolation level READ COMMITTED;", + "START TRANSACTION", + query_context.clone(), + ); + assert_tag( + "start transaction isolation level READ COMMITTED;", + "START TRANSACTION", + query_context.clone(), + ); + assert_tag("abort transaction;", "ROLLBACK", query_context.clone()); + assert_tag("commit transaction;", "COMMIT", query_context.clone()); + assert_tag("COMMIT transaction;", "COMMIT", query_context.clone()); let resp = get_data("SHOW transaction isolation level", query_context.clone()); assert_eq!(1, resp.row_schema().len()); From 8b944268dabff0219f9eb5955e7958cf355f81d3 Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:20:42 +0800 Subject: [PATCH 07/36] feat: ttl=0/instant/forever/humantime&ttl refactor (#5089) * feat: ttl zero filter * refactor: use TimeToLive enum * fix: unit test * tests: sqlness * refactor: Option None means UNSET * tests: sqlness * fix: 10000 years --> forever * chore: minor refactor from reviews * chore: rename back TimeToLive * refactor: split imme request from normal requests * fix: use correct lifetime * refactor: rename immediate to instant * tests: flow sink table default ttl * refactor: per review * tests: sqlness * fix: ttl alter to instant * tests: sqlness * refactor: per review * chore: per review * feat: add db ttl type&forbid instant for db * tests: more unit test --- Cargo.lock | 4 + src/common/meta/src/ddl/alter_database.rs | 10 +- src/common/meta/src/key/schema_name.rs | 64 ++- src/common/meta/src/rpc/ddl.rs | 13 +- src/common/time/Cargo.toml | 2 + src/common/time/src/error.rs | 16 + src/common/time/src/lib.rs | 2 + src/common/time/src/ttl.rs | 266 +++++++++++++ src/metric-engine/Cargo.toml | 1 + src/metric-engine/src/engine/alter.rs | 2 +- src/metric-engine/src/engine/create.rs | 6 +- src/mito2/src/compaction.rs | 31 +- src/mito2/src/compaction/compactor.rs | 7 +- src/mito2/src/compaction/window.rs | 2 +- src/mito2/src/engine/alter_test.rs | 12 +- src/mito2/src/engine/create_test.rs | 4 +- src/mito2/src/engine/open_test.rs | 4 +- src/mito2/src/region/options.rs | 15 +- src/mito2/src/sst/version.rs | 13 +- src/mito2/src/worker/handle_alter.rs | 8 +- src/operator/Cargo.toml | 1 + src/operator/src/insert.rs | 90 ++++- .../src/req_convert/insert/row_to_region.rs | 29 +- .../src/req_convert/insert/stmt_to_region.rs | 16 +- .../src/req_convert/insert/table_to_region.rs | 21 +- src/query/src/sql/show_create_table.rs | 15 +- src/store-api/src/region_request.rs | 17 +- src/table/src/metadata.rs | 17 +- src/table/src/requests.rs | 43 +- .../common/alter/alter_database.result | 3 + .../common/alter/alter_table_options.result | 4 +- .../common/flow/flow_advance_ttl.result | 101 +++++ .../common/flow/flow_advance_ttl.sql | 39 ++ .../standalone/common/flow/flow_basic.result | 52 +++ .../standalone/common/flow/flow_basic.sql | 6 + 
.../standalone/common/ttl/show_ttl.result | 374 ++++++++++++++++++ .../cases/standalone/common/ttl/show_ttl.sql | 82 ++++ .../standalone/common/ttl/ttl_instant.result | 340 ++++++++++++++++ .../standalone/common/ttl/ttl_instant.sql | 166 ++++++++ 39 files changed, 1729 insertions(+), 169 deletions(-) create mode 100644 src/common/time/src/ttl.rs create mode 100644 tests/cases/standalone/common/flow/flow_advance_ttl.result create mode 100644 tests/cases/standalone/common/flow/flow_advance_ttl.sql create mode 100644 tests/cases/standalone/common/ttl/show_ttl.result create mode 100644 tests/cases/standalone/common/ttl/show_ttl.sql create mode 100644 tests/cases/standalone/common/ttl/ttl_instant.result create mode 100644 tests/cases/standalone/common/ttl/ttl_instant.sql diff --git a/Cargo.lock b/Cargo.lock index 0085294272e2..f677ee269d4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2446,6 +2446,8 @@ dependencies = [ "chrono-tz 0.8.6", "common-error", "common-macro", + "humantime", + "humantime-serde", "once_cell", "rand", "serde", @@ -6592,6 +6594,7 @@ dependencies = [ "aquamarine", "async-trait", "base64 0.21.7", + "common-base", "common-error", "common-macro", "common-query", @@ -7656,6 +7659,7 @@ dependencies = [ name = "operator" version = "0.11.0" dependencies = [ + "ahash 0.8.11", "api", "async-stream", "async-trait", diff --git a/src/common/meta/src/ddl/alter_database.rs b/src/common/meta/src/ddl/alter_database.rs index 5e992a7d4e81..68f0f5428e08 100644 --- a/src/common/meta/src/ddl/alter_database.rs +++ b/src/common/meta/src/ddl/alter_database.rs @@ -46,11 +46,7 @@ fn build_new_schema_value( for option in options.0.iter() { match option { SetDatabaseOption::Ttl(ttl) => { - if ttl.is_zero() { - value.ttl = None; - } else { - value.ttl = Some(*ttl); - } + value.ttl = Some(*ttl); } } } @@ -230,12 +226,12 @@ mod tests { #[test] fn test_build_new_schema_value() { let set_ttl = AlterDatabaseKind::SetDatabaseOptions(SetDatabaseOptions(vec![ - SetDatabaseOption::Ttl(Duration::from_secs(10)), + SetDatabaseOption::Ttl(Duration::from_secs(10).into()), ])); let current_schema_value = SchemaNameValue::default(); let new_schema_value = build_new_schema_value(current_schema_value.clone(), &set_ttl).unwrap(); - assert_eq!(new_schema_value.ttl, Some(Duration::from_secs(10))); + assert_eq!(new_schema_value.ttl, Some(Duration::from_secs(10).into())); let unset_ttl_alter_kind = AlterDatabaseKind::UnsetDatabaseOptions(UnsetDatabaseOptions(vec![ diff --git a/src/common/meta/src/key/schema_name.rs b/src/common/meta/src/key/schema_name.rs index 1ec8c17eb5a1..35413433a445 100644 --- a/src/common/meta/src/key/schema_name.rs +++ b/src/common/meta/src/key/schema_name.rs @@ -15,9 +15,9 @@ use std::collections::HashMap; use std::fmt::Display; use std::sync::Arc; -use std::time::Duration; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; +use common_time::DatabaseTimeToLive; use futures::stream::BoxStream; use humantime_serde::re::humantime; use serde::{Deserialize, Serialize}; @@ -57,15 +57,13 @@ impl Default for SchemaNameKey<'_> { #[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)] pub struct SchemaNameValue { #[serde(default)] - #[serde(with = "humantime_serde")] - pub ttl: Option, + pub ttl: Option, } impl Display for SchemaNameValue { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if let Some(ttl) = self.ttl { - let ttl = humantime::format_duration(ttl); - write!(f, "ttl='{ttl}'")?; + if let Some(ttl) = self.ttl.map(|i| i.to_string()) { + 
write!(f, "ttl='{}'", ttl)?; } Ok(()) @@ -96,11 +94,8 @@ impl TryFrom<&HashMap> for SchemaNameValue { impl From for HashMap { fn from(value: SchemaNameValue) -> Self { let mut opts = HashMap::new(); - if let Some(ttl) = value.ttl { - opts.insert( - OPT_KEY_TTL.to_string(), - format!("{}", humantime::format_duration(ttl)), - ); + if let Some(ttl) = value.ttl.map(|ttl| ttl.to_string()) { + opts.insert(OPT_KEY_TTL.to_string(), ttl); } opts } @@ -313,6 +308,7 @@ impl<'a> From<&'a SchemaName> for SchemaNameKey<'a> { #[cfg(test)] mod tests { + use std::time::Duration; use super::*; use crate::kv_backend::memory::MemoryKvBackend; @@ -323,9 +319,14 @@ mod tests { assert_eq!("", schema_value.to_string()); let schema_value = SchemaNameValue { - ttl: Some(Duration::from_secs(9)), + ttl: Some(Duration::from_secs(9).into()), }; assert_eq!("ttl='9s'", schema_value.to_string()); + + let schema_value = SchemaNameValue { + ttl: Some(Duration::from_secs(0).into()), + }; + assert_eq!("ttl='forever'", schema_value.to_string()); } #[test] @@ -338,17 +339,36 @@ mod tests { assert_eq!(key, parsed); let value = SchemaNameValue { - ttl: Some(Duration::from_secs(10)), + ttl: Some(Duration::from_secs(10).into()), }; let mut opts: HashMap = HashMap::new(); opts.insert("ttl".to_string(), "10s".to_string()); let from_value = SchemaNameValue::try_from(&opts).unwrap(); assert_eq!(value, from_value); - let parsed = SchemaNameValue::try_from_raw_value("{\"ttl\":\"10s\"}".as_bytes()).unwrap(); + let parsed = SchemaNameValue::try_from_raw_value( + serde_json::json!({"ttl": "10s"}).to_string().as_bytes(), + ) + .unwrap(); assert_eq!(Some(value), parsed); + + let forever = SchemaNameValue { + ttl: Some(Default::default()), + }; + let parsed = SchemaNameValue::try_from_raw_value( + serde_json::json!({"ttl": "forever"}).to_string().as_bytes(), + ) + .unwrap(); + assert_eq!(Some(forever), parsed); + + let instant_err = SchemaNameValue::try_from_raw_value( + serde_json::json!({"ttl": "instant"}).to_string().as_bytes(), + ); + assert!(instant_err.is_err()); + let none = SchemaNameValue::try_from_raw_value("null".as_bytes()).unwrap(); assert!(none.is_none()); + let err_empty = SchemaNameValue::try_from_raw_value("".as_bytes()); assert!(err_empty.is_err()); } @@ -374,7 +394,7 @@ mod tests { let current_schema_value = manager.get(schema_key).await.unwrap().unwrap(); let new_schema_value = SchemaNameValue { - ttl: Some(Duration::from_secs(10)), + ttl: Some(Duration::from_secs(10).into()), }; manager .update(schema_key, ¤t_schema_value, &new_schema_value) @@ -388,10 +408,10 @@ mod tests { .unwrap(); let new_schema_value = SchemaNameValue { - ttl: Some(Duration::from_secs(40)), + ttl: Some(Duration::from_secs(40).into()), }; let incorrect_schema_value = SchemaNameValue { - ttl: Some(Duration::from_secs(20)), + ttl: Some(Duration::from_secs(20).into()), } .try_as_raw_value() .unwrap(); @@ -402,5 +422,15 @@ mod tests { .update(schema_key, &incorrect_schema_value, &new_schema_value) .await .unwrap_err(); + + let current_schema_value = manager.get(schema_key).await.unwrap().unwrap(); + let new_schema_value = SchemaNameValue { ttl: None }; + manager + .update(schema_key, ¤t_schema_value, &new_schema_value) + .await + .unwrap(); + + let current_schema_value = manager.get(schema_key).await.unwrap().unwrap(); + assert_eq!(new_schema_value, *current_schema_value); } } diff --git a/src/common/meta/src/rpc/ddl.rs b/src/common/meta/src/rpc/ddl.rs index 562ecb8ee660..bec12796e791 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ 
b/src/common/meta/src/rpc/ddl.rs @@ -14,7 +14,6 @@ use std::collections::{HashMap, HashSet}; use std::result; -use std::time::Duration; use api::v1::alter_database_expr::Kind as PbAlterDatabaseKind; use api::v1::meta::ddl_task_request::Task; @@ -36,7 +35,7 @@ use api::v1::{ }; use base64::engine::general_purpose; use base64::Engine as _; -use humantime_serde::re::humantime; +use common_time::DatabaseTimeToLive; use prost::Message; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DefaultOnNull}; @@ -1009,12 +1008,8 @@ impl TryFrom for SetDatabaseOption { fn try_from(PbOption { key, value }: PbOption) -> Result { match key.to_ascii_lowercase().as_str() { TTL_KEY => { - let ttl = if value.is_empty() { - Duration::from_secs(0) - } else { - humantime::parse_duration(&value) - .map_err(|_| InvalidSetDatabaseOptionSnafu { key, value }.build())? - }; + let ttl = DatabaseTimeToLive::from_humantime_or_str(&value) + .map_err(|_| InvalidSetDatabaseOptionSnafu { key, value }.build())?; Ok(SetDatabaseOption::Ttl(ttl)) } @@ -1025,7 +1020,7 @@ impl TryFrom for SetDatabaseOption { #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub enum SetDatabaseOption { - Ttl(Duration), + Ttl(DatabaseTimeToLive), } #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] diff --git a/src/common/time/Cargo.toml b/src/common/time/Cargo.toml index fdd06140f187..28ff40ff90c8 100644 --- a/src/common/time/Cargo.toml +++ b/src/common/time/Cargo.toml @@ -13,6 +13,8 @@ chrono.workspace = true chrono-tz = "0.8" common-error.workspace = true common-macro.workspace = true +humantime.workspace = true +humantime-serde.workspace = true once_cell.workspace = true serde = { version = "1.0", features = ["derive"] } serde_json.workspace = true diff --git a/src/common/time/src/error.rs b/src/common/time/src/error.rs index 45d94a782885..0f6b5bdeb999 100644 --- a/src/common/time/src/error.rs +++ b/src/common/time/src/error.rs @@ -93,12 +93,28 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to parse duration"))] + ParseDuration { + #[snafu(source)] + error: humantime::DurationError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Database's TTL can't be `instant`"))] + InvalidDatabaseTtl { + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { Error::ParseDateStr { .. } + | Error::ParseDuration { .. } + | Error::InvalidDatabaseTtl { .. } | Error::ParseTimestamp { .. } | Error::InvalidTimezoneOffset { .. } | Error::Format { .. } diff --git a/src/common/time/src/lib.rs b/src/common/time/src/lib.rs index fa025bf661c2..feb19cf9a191 100644 --- a/src/common/time/src/lib.rs +++ b/src/common/time/src/lib.rs @@ -22,6 +22,7 @@ pub mod time; pub mod timestamp; pub mod timestamp_millis; pub mod timezone; +pub mod ttl; pub mod util; pub use date::Date; @@ -32,3 +33,4 @@ pub use range::RangeMillis; pub use timestamp::Timestamp; pub use timestamp_millis::TimestampMillis; pub use timezone::Timezone; +pub use ttl::{DatabaseTimeToLive, TimeToLive, FOREVER, INSTANT}; diff --git a/src/common/time/src/ttl.rs b/src/common/time/src/ttl.rs new file mode 100644 index 000000000000..0544cfb0d198 --- /dev/null +++ b/src/common/time/src/ttl.rs @@ -0,0 +1,266 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Display; +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; + +use crate::error::{Error, InvalidDatabaseTtlSnafu, ParseDurationSnafu}; +use crate::Timestamp; + +pub const INSTANT: &str = "instant"; +pub const FOREVER: &str = "forever"; + +/// Time To Live for database, which can be `Forever`, or a `Duration`, but can't be `Instant`. +/// +/// unlike `TimeToLive` which can be `Instant`, `Forever`, or a `Duration` +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DatabaseTimeToLive { + /// Keep the data forever + #[default] + Forever, + /// Duration to keep the data, this duration should be non-zero + #[serde(untagged, with = "humantime_serde")] + Duration(Duration), +} + +impl Display for DatabaseTimeToLive { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DatabaseTimeToLive::Forever => write!(f, "{}", FOREVER), + DatabaseTimeToLive::Duration(d) => write!(f, "{}", humantime::Duration::from(*d)), + } + } +} + +impl DatabaseTimeToLive { + /// Parse a string that is either `forever`, or a duration to `TimeToLive` + /// + /// note that an empty string or a zero duration(a duration that spans no time) is treat as `forever` too + pub fn from_humantime_or_str(s: &str) -> Result { + let ttl = match s.to_lowercase().as_ref() { + INSTANT => InvalidDatabaseTtlSnafu.fail()?, + FOREVER | "" => Self::Forever, + _ => { + let d = humantime::parse_duration(s).context(ParseDurationSnafu)?; + Self::from(d) + } + }; + Ok(ttl) + } +} + +impl TryFrom for DatabaseTimeToLive { + type Error = Error; + fn try_from(value: TimeToLive) -> Result { + match value { + TimeToLive::Instant => InvalidDatabaseTtlSnafu.fail()?, + TimeToLive::Forever => Ok(Self::Forever), + TimeToLive::Duration(d) => Ok(Self::from(d)), + } + } +} + +impl From for TimeToLive { + fn from(value: DatabaseTimeToLive) -> Self { + match value { + DatabaseTimeToLive::Forever => TimeToLive::Forever, + DatabaseTimeToLive::Duration(d) => TimeToLive::from(d), + } + } +} + +impl From for DatabaseTimeToLive { + fn from(duration: Duration) -> Self { + if duration.is_zero() { + Self::Forever + } else { + Self::Duration(duration) + } + } +} + +impl From for DatabaseTimeToLive { + fn from(duration: humantime::Duration) -> Self { + Self::from(*duration) + } +} + +/// Time To Live +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TimeToLive { + /// Instantly discard upon insert + Instant, + /// Keep the data forever + #[default] + Forever, + /// Duration to keep the data, this duration should be non-zero + #[serde(untagged, with = "humantime_serde")] + Duration(Duration), +} + +impl Display for TimeToLive { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TimeToLive::Instant => write!(f, "{}", INSTANT), + TimeToLive::Duration(d) => write!(f, "{}", humantime::Duration::from(*d)), + TimeToLive::Forever => write!(f, "{}", FOREVER), + } 
+ } +} + +impl TimeToLive { + /// Parse a string that is either `instant`, `forever`, or a duration to `TimeToLive` + /// + /// note that an empty string or a zero duration(a duration that spans no time) is treat as `forever` too + pub fn from_humantime_or_str(s: &str) -> Result { + match s.to_lowercase().as_ref() { + INSTANT => Ok(TimeToLive::Instant), + FOREVER | "" => Ok(TimeToLive::Forever), + _ => { + let d = humantime::parse_duration(s).context(ParseDurationSnafu)?; + Ok(TimeToLive::from(d)) + } + } + } + + /// Check if the TimeToLive is expired + /// with the given `created_at` and `now` timestamp + pub fn is_expired( + &self, + created_at: &Timestamp, + now: &Timestamp, + ) -> crate::error::Result { + Ok(match self { + TimeToLive::Instant => true, + TimeToLive::Forever => false, + TimeToLive::Duration(d) => now.sub_duration(*d)? > *created_at, + }) + } + + /// is instant variant + pub fn is_instant(&self) -> bool { + matches!(self, TimeToLive::Instant) + } + + /// Is the default value, which is `Forever` + pub fn is_forever(&self) -> bool { + matches!(self, TimeToLive::Forever) + } +} + +impl From for TimeToLive { + fn from(duration: Duration) -> Self { + if duration.is_zero() { + // compatibility with old code, and inline with cassandra's behavior when ttl set to 0 + TimeToLive::Forever + } else { + TimeToLive::Duration(duration) + } + } +} + +impl From for TimeToLive { + fn from(duration: humantime::Duration) -> Self { + Self::from(*duration) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_db_ttl_table_ttl() { + // test from ttl to db ttl + let ttl = TimeToLive::from(Duration::from_secs(10)); + let db_ttl: DatabaseTimeToLive = ttl.try_into().unwrap(); + assert_eq!(db_ttl, DatabaseTimeToLive::from(Duration::from_secs(10))); + assert_eq!(TimeToLive::from(db_ttl), ttl); + + let ttl = TimeToLive::from(Duration::from_secs(0)); + let db_ttl: DatabaseTimeToLive = ttl.try_into().unwrap(); + assert_eq!(db_ttl, DatabaseTimeToLive::Forever); + assert_eq!(TimeToLive::from(db_ttl), ttl); + + let ttl = TimeToLive::Instant; + let err_instant = DatabaseTimeToLive::try_from(ttl); + assert!(err_instant.is_err()); + + // test 0 duration + let ttl = Duration::from_secs(0); + let db_ttl: DatabaseTimeToLive = ttl.into(); + assert_eq!(db_ttl, DatabaseTimeToLive::Forever); + + let ttl = Duration::from_secs(10); + let db_ttl: DatabaseTimeToLive = ttl.into(); + assert_eq!( + db_ttl, + DatabaseTimeToLive::Duration(Duration::from_secs(10)) + ); + + let ttl = DatabaseTimeToLive::from_humantime_or_str("10s").unwrap(); + let ttl: TimeToLive = ttl.into(); + assert_eq!(ttl, TimeToLive::from(Duration::from_secs(10))); + + let ttl = DatabaseTimeToLive::from_humantime_or_str("forever").unwrap(); + let ttl: TimeToLive = ttl.into(); + assert_eq!(ttl, TimeToLive::Forever); + + assert!(DatabaseTimeToLive::from_humantime_or_str("instant").is_err()); + + // test 0s + let ttl = DatabaseTimeToLive::from_humantime_or_str("0s").unwrap(); + let ttl: TimeToLive = ttl.into(); + assert_eq!(ttl, TimeToLive::Forever); + } + + #[test] + fn test_serde() { + let cases = vec![ + ("\"instant\"", TimeToLive::Instant), + ("\"forever\"", TimeToLive::Forever), + ("\"10d\"", Duration::from_secs(86400 * 10).into()), + ( + "\"10000 years\"", + humantime::parse_duration("10000 years").unwrap().into(), + ), + ]; + + for (s, expected) in cases { + let serialized = serde_json::to_string(&expected).unwrap(); + let deserialized: TimeToLive = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized, expected); + 
+ let deserialized: TimeToLive = serde_json::from_str(s).unwrap_or_else(|err| { + panic!("Actual serialized: {}, s=`{s}`, err: {:?}", serialized, err) + }); + assert_eq!(deserialized, expected); + + // test db ttl too + if s == "\"instant\"" { + assert!(serde_json::from_str::(s).is_err()); + continue; + } + + let db_ttl: DatabaseTimeToLive = serde_json::from_str(s).unwrap(); + let re_serialized = serde_json::to_string(&db_ttl).unwrap(); + assert_eq!(re_serialized, serialized); + } + } +} diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index 13aa59fe8b30..85aa371594e8 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -12,6 +12,7 @@ api.workspace = true aquamarine.workspace = true async-trait.workspace = true base64.workspace = true +common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-query.workspace = true diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs index b6108f133a38..0a961d498833 100644 --- a/src/metric-engine/src/engine/alter.rs +++ b/src/metric-engine/src/engine/alter.rs @@ -207,7 +207,7 @@ mod test { let alter_region_option_request = RegionAlterRequest { schema_version: 0, kind: AlterKind::SetRegionOptions { - options: vec![SetRegionOption::TTL(Duration::from_secs(500))], + options: vec![SetRegionOption::Ttl(Some(Duration::from_secs(500).into()))], }, }; let result = engine_inner diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs index d897640cc529..b87682d4599e 100644 --- a/src/metric-engine/src/engine/create.rs +++ b/src/metric-engine/src/engine/create.rs @@ -17,7 +17,7 @@ use std::collections::{HashMap, HashSet}; use api::v1::SemanticType; use common_error::ext::BoxedError; use common_telemetry::{info, warn}; -use common_time::Timestamp; +use common_time::{Timestamp, FOREVER}; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; use datatypes::value::Value; @@ -540,7 +540,7 @@ pub(crate) fn region_options_for_metadata_region( mut original: HashMap, ) -> HashMap { original.remove(APPEND_MODE_KEY); - original.insert(TTL_KEY.to_string(), "10000 years".to_string()); + original.insert(TTL_KEY.to_string(), FOREVER.to_string()); original } @@ -731,7 +731,7 @@ mod test { ); assert_eq!( metadata_region_request.options.get("ttl").unwrap(), - "10000 years" + "forever" ); } } diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 44aa03a67df8..31e1b0674f72 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -24,7 +24,7 @@ mod window; use std::collections::HashMap; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::Instant; use api::v1::region::compact_request; use common_base::Plugins; @@ -32,7 +32,7 @@ use common_meta::key::SchemaMetadataManagerRef; use common_telemetry::{debug, error, info, warn}; use common_time::range::TimestampRange; use common_time::timestamp::TimeUnit; -use common_time::Timestamp; +use common_time::{TimeToLive, Timestamp}; use datafusion_common::ScalarValue; use datafusion_expr::Expr; use serde::{Deserialize, Serialize}; @@ -273,7 +273,7 @@ impl CompactionScheduler { .await .unwrap_or_else(|e| { warn!(e; "Failed to get ttl for region: {}", region_id); - None + TimeToLive::default() }); debug!( @@ -292,7 +292,7 @@ impl CompactionScheduler { access_layer: access_layer.clone(), manifest_ctx: manifest_ctx.clone(), file_purger: None, - ttl, + ttl: Some(ttl), }; let picker_output = { @@ -437,18 
+437,21 @@ impl PendingCompaction { /// Finds TTL of table by first examine table options then database options. async fn find_ttl( table_id: TableId, - table_ttl: Option, + table_ttl: Option, schema_metadata_manager: &SchemaMetadataManagerRef, -) -> Result> { +) -> Result { + // If table TTL is set, we use it. if let Some(table_ttl) = table_ttl { - return Ok(Some(table_ttl)); + return Ok(table_ttl); } let ttl = schema_metadata_manager .get_schema_options_by_table_id(table_id) .await .context(GetSchemaMetadataSnafu)? - .and_then(|options| options.ttl); + .and_then(|options| options.ttl) + .unwrap_or_default() + .into(); Ok(ttl) } @@ -656,24 +659,16 @@ fn ts_to_lit(ts: Timestamp, ts_col_unit: TimeUnit) -> Result { /// Finds all expired SSTs across levels. fn get_expired_ssts( levels: &[LevelMeta], - ttl: Option, + ttl: Option, now: Timestamp, ) -> Vec { let Some(ttl) = ttl else { return vec![]; }; - let expire_time = match now.sub_duration(ttl) { - Ok(expire_time) => expire_time, - Err(e) => { - error!(e; "Failed to calculate region TTL expire time"); - return vec![]; - } - }; - levels .iter() - .flat_map(|l| l.get_expired_files(&expire_time).into_iter()) + .flat_map(|l| l.get_expired_files(&now, &ttl).into_iter()) .collect() } diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 3e0228a4b2a4..792634b2e4a2 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -18,6 +18,7 @@ use std::time::Duration; use api::v1::region::compact_request; use common_meta::key::SchemaMetadataManagerRef; use common_telemetry::{info, warn}; +use common_time::TimeToLive; use object_store::manager::ObjectStoreManagerRef; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; @@ -63,7 +64,7 @@ pub struct CompactionRegion { pub(crate) manifest_ctx: Arc, pub(crate) current_version: VersionRef, pub(crate) file_purger: Option>, - pub(crate) ttl: Option, + pub(crate) ttl: Option, } /// OpenCompactionRegionRequest represents the request to open a compaction region. 
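As a side note on the semantics introduced by the new `src/common/time/src/ttl.rs` above: the sketch below is illustrative only and not part of the patch; it assumes the `common_time` crate APIs added here (`TimeToLive`, `DatabaseTimeToLive`, and `from_humantime_or_str`) and shows how the three TTL forms are parsed and how database TTLs differ from table TTLs.

```rust
use std::time::Duration;

// Assumes the `common_time` crate as modified by this patch.
use common_time::{DatabaseTimeToLive, TimeToLive};

fn main() {
    // Tables accept all three forms: `instant`, `forever`, or a humantime duration.
    assert!(TimeToLive::from_humantime_or_str("instant").unwrap().is_instant());
    assert!(TimeToLive::from_humantime_or_str("forever").unwrap().is_forever());
    assert_eq!(
        TimeToLive::from_humantime_or_str("7d").unwrap(),
        TimeToLive::from(Duration::from_secs(7 * 24 * 3600))
    );

    // A zero duration (or an empty string) normalizes to `Forever`,
    // preserving the old "ttl = 0 means keep everything" behavior.
    assert!(TimeToLive::from(Duration::ZERO).is_forever());
    assert!(TimeToLive::from_humantime_or_str("").unwrap().is_forever());

    // Databases only accept `forever` or a non-zero duration; `instant` is
    // rejected, matching the `DatabaseTimeToLive` type above.
    assert!(DatabaseTimeToLive::from_humantime_or_str("instant").is_err());
    assert!(DatabaseTimeToLive::from_humantime_or_str("1d").is_ok());
}
```

Mapping a zero duration to `Forever` keeps tables created with `ttl = '0s'` behaving as before, while `instant` becomes the explicit way to discard rows on insert (used, for example, by the flow source tables in the sqlness cases later in this patch).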
@@ -180,7 +181,7 @@ pub async fn open_compaction_region( .await .unwrap_or_else(|e| { warn!(e; "Failed to get ttl for region: {}", region_metadata.region_id); - None + TimeToLive::default() }); Ok(CompactionRegion { region_id: req.region_id, @@ -193,7 +194,7 @@ pub async fn open_compaction_region( manifest_ctx, current_version, file_purger: Some(file_purger), - ttl, + ttl: Some(ttl), }) } diff --git a/src/mito2/src/compaction/window.rs b/src/mito2/src/compaction/window.rs index 9d6207bba298..f16b8e4c95d3 100644 --- a/src/mito2/src/compaction/window.rs +++ b/src/mito2/src/compaction/window.rs @@ -253,7 +253,7 @@ mod tests { truncated_entry_id: None, compaction_time_window: None, options: RegionOptions { - ttl, + ttl: ttl.map(|t| t.into()), compaction: Default::default(), storage: None, append_mode: false, diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index 069d64fb5a87..b774dd8a05cf 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -604,7 +604,7 @@ async fn test_alter_region_ttl_options() { let alter_ttl_request = RegionAlterRequest { schema_version: 0, kind: AlterKind::SetRegionOptions { - options: vec![SetRegionOption::TTL(Duration::from_secs(500))], + options: vec![SetRegionOption::Ttl(Some(Duration::from_secs(500).into()))], }, }; let alter_job = tokio::spawn(async move { @@ -617,14 +617,8 @@ async fn test_alter_region_ttl_options() { alter_job.await.unwrap(); let check_ttl = |engine: &MitoEngine, expected: &Duration| { - let current_ttl = engine - .get_region(region_id) - .unwrap() - .version() - .options - .ttl - .unwrap(); - assert_eq!(*expected, current_ttl); + let current_ttl = engine.get_region(region_id).unwrap().version().options.ttl; + assert_eq!(current_ttl, Some((*expected).into())); }; // Verify the ttl. check_ttl(&engine, &Duration::from_secs(500)); diff --git a/src/mito2/src/engine/create_test.rs b/src/mito2/src/engine/create_test.rs index 9ce3c53b7661..48b04dc86d91 100644 --- a/src/mito2/src/engine/create_test.rs +++ b/src/mito2/src/engine/create_test.rs @@ -165,8 +165,8 @@ async fn test_engine_create_with_options() { assert!(engine.is_region_exists(region_id)); let region = engine.get_region(region_id).unwrap(); assert_eq!( - Duration::from_secs(3600 * 24 * 10), - region.version().options.ttl.unwrap() + region.version().options.ttl, + Some(Duration::from_secs(3600 * 24 * 10).into()) ); } diff --git a/src/mito2/src/engine/open_test.rs b/src/mito2/src/engine/open_test.rs index 8fd084a24ffa..6752bbd04b12 100644 --- a/src/mito2/src/engine/open_test.rs +++ b/src/mito2/src/engine/open_test.rs @@ -180,8 +180,8 @@ async fn test_engine_region_open_with_options() { let region = engine.get_region(region_id).unwrap(); assert_eq!( - Duration::from_secs(3600 * 24 * 4), - region.version().options.ttl.unwrap() + region.version().options.ttl, + Some(Duration::from_secs(3600 * 24 * 4).into()) ); } diff --git a/src/mito2/src/region/options.rs b/src/mito2/src/region/options.rs index 4abc5925b705..4514137cc335 100644 --- a/src/mito2/src/region/options.rs +++ b/src/mito2/src/region/options.rs @@ -20,6 +20,7 @@ use std::collections::HashMap; use std::time::Duration; use common_base::readable_size::ReadableSize; +use common_time::TimeToLive; use common_wal::options::{WalOptions, WAL_OPTIONS_KEY}; use serde::de::Error as _; use serde::{Deserialize, Deserializer, Serialize}; @@ -55,8 +56,7 @@ pub enum MergeMode { #[serde(default)] pub struct RegionOptions { /// Region SST files TTL. 
- #[serde(with = "humantime_serde")] - pub ttl: Option, + pub ttl: Option, /// Compaction options. pub compaction: CompactionOptions, /// Custom storage. Uses default storage if it is `None`. @@ -252,8 +252,7 @@ impl Default for TwcsOptions { #[serde(default)] struct RegionOptionsWithoutEnum { /// Region SST files TTL. - #[serde(with = "humantime_serde")] - ttl: Option, + ttl: Option, storage: Option, #[serde_as(as = "DisplayFromStr")] append_mode: bool, @@ -458,7 +457,7 @@ mod tests { let map = make_map(&[("ttl", "7d")]); let options = RegionOptions::try_from(&map).unwrap(); let expect = RegionOptions { - ttl: Some(Duration::from_secs(3600 * 24 * 7)), + ttl: Some(Duration::from_secs(3600 * 24 * 7).into()), ..Default::default() }; assert_eq!(expect, options); @@ -621,7 +620,7 @@ mod tests { ]); let options = RegionOptions::try_from(&map).unwrap(); let expect = RegionOptions { - ttl: Some(Duration::from_secs(3600 * 24 * 7)), + ttl: Some(Duration::from_secs(3600 * 24 * 7).into()), compaction: CompactionOptions::Twcs(TwcsOptions { max_active_window_runs: 8, max_active_window_files: 11, @@ -654,7 +653,7 @@ mod tests { #[test] fn test_region_options_serde() { let options = RegionOptions { - ttl: Some(Duration::from_secs(3600 * 24 * 7)), + ttl: Some(Duration::from_secs(3600 * 24 * 7).into()), compaction: CompactionOptions::Twcs(TwcsOptions { max_active_window_runs: 8, max_active_window_files: usize::MAX, @@ -722,7 +721,7 @@ mod tests { }"#; let got: RegionOptions = serde_json::from_str(region_options_json_str).unwrap(); let options = RegionOptions { - ttl: Some(Duration::from_secs(3600 * 24 * 7)), + ttl: Some(Duration::from_secs(3600 * 24 * 7).into()), compaction: CompactionOptions::Twcs(TwcsOptions { max_active_window_runs: 8, max_active_window_files: 11, diff --git a/src/mito2/src/sst/version.rs b/src/mito2/src/sst/version.rs index c677a9541344..891ded08f6bd 100644 --- a/src/mito2/src/sst/version.rs +++ b/src/mito2/src/sst/version.rs @@ -17,7 +17,7 @@ use std::collections::HashMap; use std::fmt; use std::sync::Arc; -use common_time::Timestamp; +use common_time::{TimeToLive, Timestamp}; use crate::sst::file::{FileHandle, FileId, FileMeta, Level, MAX_LEVEL}; use crate::sst::file_purger::FilePurgerRef; @@ -160,12 +160,19 @@ impl LevelMeta { } /// Returns expired SSTs from current level. 
- pub fn get_expired_files(&self, expire_time: &Timestamp) -> Vec { + pub fn get_expired_files(&self, now: &Timestamp, ttl: &TimeToLive) -> Vec { self.files .values() .filter(|v| { let (_, end) = v.time_range(); - &end < expire_time + + match ttl.is_expired(&end, now) { + Ok(expired) => expired, + Err(e) => { + common_telemetry::error!(e; "Failed to calculate region TTL expire time"); + false + } + } }) .cloned() .collect() diff --git a/src/mito2/src/worker/handle_alter.rs b/src/mito2/src/worker/handle_alter.rs index 3908cee98be0..10d87e2940c2 100644 --- a/src/mito2/src/worker/handle_alter.rs +++ b/src/mito2/src/worker/handle_alter.rs @@ -184,16 +184,12 @@ impl RegionWorkerLoop { let mut current_options = version.options.clone(); for option in options { match option { - SetRegionOption::TTL(new_ttl) => { + SetRegionOption::Ttl(new_ttl) => { info!( "Update region ttl: {}, previous: {:?} new: {:?}", region.region_id, current_options.ttl, new_ttl ); - if new_ttl.is_zero() { - current_options.ttl = None; - } else { - current_options.ttl = Some(new_ttl); - } + current_options.ttl = new_ttl; } SetRegionOption::Twsc(key, value) => { let Twcs(options) = &mut current_options.compaction; diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index d20034155f1b..cd26458fad68 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -11,6 +11,7 @@ testing = [] workspace = true [dependencies] +ahash.workspace = true api.workspace = true async-stream.workspace = true async-trait = "0.1" diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index 4637f7fd10bb..ec01b329457f 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; use std::sync::Arc; +use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use api::v1::alter_table_expr::Kind; -use api::v1::region::{InsertRequests as RegionInsertRequests, RegionRequestHeader}; +use api::v1::region::{ + InsertRequest as RegionInsertRequest, InsertRequests as RegionInsertRequests, + RegionRequestHeader, +}; use api::v1::{ AlterTableExpr, ColumnDataType, ColumnSchema, CreateTableExpr, InsertRequests, RowInsertRequest, RowInsertRequests, SemanticType, @@ -91,6 +94,20 @@ impl AutoCreateTableType { } } +/// Split insert requests into normal and instant requests. +/// +/// Where instant requests are requests with ttl=instant, +/// and normal requests are requests with ttl set to other values. +/// +/// This is used to split requests for different processing. +#[derive(Clone)] +pub struct InstantAndNormalInsertRequests { + /// Requests with normal ttl. + pub normal_requests: RegionInsertRequests, + /// Requests with ttl=instant. 
+ pub instant_requests: RegionInsertRequests, +} + impl Inserter { pub fn new( catalog_manager: CatalogManagerRef, @@ -183,12 +200,16 @@ impl Inserter { }); validate_column_count_match(&requests)?; - let table_name_to_ids = self + let (table_name_to_ids, instant_table_ids) = self .create_or_alter_tables_on_demand(&requests, &ctx, create_type, statement_executor) .await?; - let inserts = RowToRegion::new(table_name_to_ids, self.partition_manager.as_ref()) - .convert(requests) - .await?; + let inserts = RowToRegion::new( + table_name_to_ids, + instant_table_ids, + self.partition_manager.as_ref(), + ) + .convert(requests) + .await?; self.do_request(inserts, &ctx).await } @@ -215,7 +236,7 @@ impl Inserter { .await?; // check and create logical tables - let table_name_to_ids = self + let (table_name_to_ids, instant_table_ids) = self .create_or_alter_tables_on_demand( &requests, &ctx, @@ -223,9 +244,13 @@ impl Inserter { statement_executor, ) .await?; - let inserts = RowToRegion::new(table_name_to_ids, &self.partition_manager) - .convert(requests) - .await?; + let inserts = RowToRegion::new( + table_name_to_ids, + instant_table_ids, + &self.partition_manager, + ) + .convert(requests) + .await?; self.do_request(inserts, &ctx).await } @@ -268,7 +293,7 @@ impl Inserter { impl Inserter { async fn do_request( &self, - requests: RegionInsertRequests, + requests: InstantAndNormalInsertRequests, ctx: &QueryContextRef, ) -> Result { let write_cost = write_meter!( @@ -283,8 +308,21 @@ impl Inserter { ..Default::default() }); + let InstantAndNormalInsertRequests { + normal_requests, + instant_requests, + } = requests; + // Mirror requests for source table to flownode - match self.mirror_flow_node_requests(&requests).await { + match self + .mirror_flow_node_requests( + normal_requests + .requests + .iter() + .chain(instant_requests.requests.iter()), + ) + .await + { Ok(flow_requests) => { let node_manager = self.node_manager.clone(); let flow_tasks = flow_requests.into_iter().map(|(peer, inserts)| { @@ -320,7 +358,7 @@ impl Inserter { } let write_tasks = self - .group_requests_by_peer(requests) + .group_requests_by_peer(normal_requests) .await? 
.into_iter() .map(|(peer, inserts)| { @@ -350,14 +388,14 @@ impl Inserter { } /// Mirror requests for source table to flownode - async fn mirror_flow_node_requests( - &self, - requests: &RegionInsertRequests, + async fn mirror_flow_node_requests<'it, 'zelf: 'it>( + &'zelf self, + requests: impl Iterator, ) -> Result> { // store partial source table requests used by flow node(only store what's used) let mut src_table_reqs: HashMap, RegionInsertRequests)>> = HashMap::new(); - for req in &requests.requests { + for req in requests { let table_id = RegionId::from_u64(req.region_id).table_id(); match src_table_reqs.get_mut(&table_id) { Some(Some((_peers, reqs))) => reqs.requests.push(req.clone()), @@ -422,7 +460,6 @@ impl Inserter { // group by region ids first to reduce repeatedly call `find_region_leader` // TODO(discord9): determine if a addition clone is worth it let mut requests_per_region: HashMap = HashMap::new(); - for req in requests.requests { let region_id = RegionId::from_u64(req.region_id); requests_per_region @@ -462,7 +499,7 @@ impl Inserter { ctx: &QueryContextRef, auto_create_table_type: AutoCreateTableType, statement_executor: &StatementExecutor, - ) -> Result> { + ) -> Result<(HashMap, HashSet)> { let _timer = crate::metrics::CREATE_ALTER_ON_DEMAND .with_label_values(&[auto_create_table_type.as_str()]) .start_timer(); @@ -483,6 +520,7 @@ impl Inserter { })? .unwrap_or(true); if !auto_create_table_hint { + let mut instant_table_ids = HashSet::new(); for req in &requests.inserts { let table = self .get_table(catalog, &schema, &req.table_name) @@ -494,17 +532,25 @@ impl Inserter { ), })?; let table_info = table.table_info(); + if table_info.is_ttl_instant_table() { + instant_table_ids.insert(table_info.table_id()); + } table_name_to_ids.insert(table_info.name.clone(), table_info.table_id()); } - return Ok(table_name_to_ids); + return Ok((table_name_to_ids, instant_table_ids)); } let mut create_tables = vec![]; let mut alter_tables = vec![]; + let mut instant_table_ids = HashSet::new(); + for req in &requests.inserts { match self.get_table(catalog, &schema, &req.table_name).await? { Some(table) => { let table_info = table.table_info(); + if table_info.is_ttl_instant_table() { + instant_table_ids.insert(table_info.table_id()); + } table_name_to_ids.insert(table_info.name.clone(), table_info.table_id()); if let Some(alter_expr) = self.get_alter_table_expr_on_demand(req, &table, ctx)? @@ -543,6 +589,8 @@ impl Inserter { AutoCreateTableType::Physical | AutoCreateTableType::Log | AutoCreateTableType::LastNonNull => { + // note that auto create table shouldn't be ttl instant table + // for it's a very unexpected behavior and should be set by user explicitly for create_table in create_tables { let table = self .create_physical_table(create_table, ctx, statement_executor) @@ -558,7 +606,7 @@ impl Inserter { } } - Ok(table_name_to_ids) + Ok((table_name_to_ids, instant_table_ids)) } async fn create_physical_table_on_demand( diff --git a/src/operator/src/req_convert/insert/row_to_region.rs b/src/operator/src/req_convert/insert/row_to_region.rs index a33a1329026d..125910ba455f 100644 --- a/src/operator/src/req_convert/insert/row_to_region.rs +++ b/src/operator/src/req_convert/insert/row_to_region.rs @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; - +use ahash::{HashMap, HashSet}; use api::v1::region::InsertRequests as RegionInsertRequests; use api::v1::RowInsertRequests; use partition::manager::PartitionRuleManager; @@ -21,37 +20,53 @@ use snafu::OptionExt; use table::metadata::TableId; use crate::error::{Result, TableNotFoundSnafu}; +use crate::insert::InstantAndNormalInsertRequests; use crate::req_convert::common::partitioner::Partitioner; pub struct RowToRegion<'a> { table_name_to_ids: HashMap, + instant_table_ids: HashSet, partition_manager: &'a PartitionRuleManager, } impl<'a> RowToRegion<'a> { pub fn new( table_name_to_ids: HashMap, + instant_table_ids: HashSet, partition_manager: &'a PartitionRuleManager, ) -> Self { Self { table_name_to_ids, + instant_table_ids, partition_manager, } } - pub async fn convert(&self, requests: RowInsertRequests) -> Result { + pub async fn convert( + &self, + requests: RowInsertRequests, + ) -> Result { let mut region_request = Vec::with_capacity(requests.inserts.len()); + let mut instant_request = Vec::with_capacity(requests.inserts.len()); for request in requests.inserts { let table_id = self.get_table_id(&request.table_name)?; let requests = Partitioner::new(self.partition_manager) .partition_insert_requests(table_id, request.rows.unwrap_or_default()) .await?; - - region_request.extend(requests); + if self.instant_table_ids.contains(&table_id) { + instant_request.extend(requests); + } else { + region_request.extend(requests); + } } - Ok(RegionInsertRequests { - requests: region_request, + Ok(InstantAndNormalInsertRequests { + normal_requests: RegionInsertRequests { + requests: region_request, + }, + instant_requests: RegionInsertRequests { + requests: instant_request, + }, }) } diff --git a/src/operator/src/req_convert/insert/stmt_to_region.rs b/src/operator/src/req_convert/insert/stmt_to_region.rs index 8124edc19514..cd48b4fca54e 100644 --- a/src/operator/src/req_convert/insert/stmt_to_region.rs +++ b/src/operator/src/req_convert/insert/stmt_to_region.rs @@ -32,6 +32,7 @@ use crate::error::{ ColumnNotFoundSnafu, InvalidSqlSnafu, MissingInsertBodySnafu, ParseSqlSnafu, Result, SchemaReadOnlySnafu, TableNotFoundSnafu, }; +use crate::insert::InstantAndNormalInsertRequests; use crate::req_convert::common::partitioner::Partitioner; use crate::req_convert::insert::semantic_type; @@ -60,7 +61,7 @@ impl<'a> StatementToRegion<'a> { &self, stmt: &Insert, query_ctx: &QueryContextRef, - ) -> Result { + ) -> Result { let (catalog, schema, table_name) = self.get_full_name(stmt.table_name())?; let table = self.get_table(&catalog, &schema, &table_name).await?; let table_schema = table.schema(); @@ -134,7 +135,18 @@ impl<'a> StatementToRegion<'a> { let requests = Partitioner::new(self.partition_manager) .partition_insert_requests(table_info.table_id(), Rows { schema, rows }) .await?; - Ok(RegionInsertRequests { requests }) + let requests = RegionInsertRequests { requests }; + if table_info.is_ttl_instant_table() { + Ok(InstantAndNormalInsertRequests { + normal_requests: Default::default(), + instant_requests: requests, + }) + } else { + Ok(InstantAndNormalInsertRequests { + normal_requests: requests, + instant_requests: Default::default(), + }) + } } async fn get_table(&self, catalog: &str, schema: &str, table: &str) -> Result { diff --git a/src/operator/src/req_convert/insert/table_to_region.rs b/src/operator/src/req_convert/insert/table_to_region.rs index 729355cf0159..ac79cce503ed 100644 --- a/src/operator/src/req_convert/insert/table_to_region.rs +++ 
b/src/operator/src/req_convert/insert/table_to_region.rs @@ -19,6 +19,7 @@ use table::metadata::TableInfo; use table::requests::InsertRequest as TableInsertRequest; use crate::error::Result; +use crate::insert::InstantAndNormalInsertRequests; use crate::req_convert::common::partitioner::Partitioner; use crate::req_convert::common::{column_schema, row_count}; @@ -35,7 +36,10 @@ impl<'a> TableToRegion<'a> { } } - pub async fn convert(&self, request: TableInsertRequest) -> Result { + pub async fn convert( + &self, + request: TableInsertRequest, + ) -> Result { let row_count = row_count(&request.columns_values)?; let schema = column_schema(self.table_info, &request.columns_values)?; let rows = api::helper::vectors_to_rows(request.columns_values.values(), row_count); @@ -44,7 +48,19 @@ impl<'a> TableToRegion<'a> { let requests = Partitioner::new(self.partition_manager) .partition_insert_requests(self.table_info.table_id(), rows) .await?; - Ok(RegionInsertRequests { requests }) + + let requests = RegionInsertRequests { requests }; + if self.table_info.is_ttl_instant_table() { + Ok(InstantAndNormalInsertRequests { + normal_requests: Default::default(), + instant_requests: requests, + }) + } else { + Ok(InstantAndNormalInsertRequests { + normal_requests: requests, + instant_requests: Default::default(), + }) + } } } @@ -112,6 +128,7 @@ mod tests { let region_requests = converter.convert(table_request).await.unwrap(); let mut region_id_to_region_requests = region_requests + .normal_requests .requests .into_iter() .map(|r| (r.region_id, r)) diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs index 68be43785703..17f1dcdd394b 100644 --- a/src/query/src/sql/show_create_table.rs +++ b/src/query/src/sql/show_create_table.rs @@ -21,7 +21,6 @@ use datatypes::schema::{ ColumnDefaultConstraint, ColumnSchema, SchemaRef, COLUMN_FULLTEXT_OPT_KEY_ANALYZER, COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COMMENT_KEY, }; -use humantime::format_duration; use snafu::ResultExt; use sql::ast::{ColumnDef, ColumnOption, ColumnOptionDef, Expr, Ident, ObjectName}; use sql::dialect::GreptimeDbDialect; @@ -46,13 +45,13 @@ fn create_sql_options(table_meta: &TableMeta, schema_options: Option for ModifyColumnType { #[derive(Debug, Eq, PartialEq, Clone, Serialize, Deserialize)] pub enum SetRegionOption { - TTL(Duration), + Ttl(Option), // Modifying TwscOptions with values as (option name, new value). Twsc(String, String), } @@ -758,13 +758,10 @@ impl TryFrom<&PbOption> for SetRegionOption { let PbOption { key, value } = value; match key.as_str() { TTL_KEY => { - let ttl = if value.is_empty() { - Duration::from_secs(0) - } else { - humantime::parse_duration(value) - .map_err(|_| InvalidSetRegionOptionRequestSnafu { key, value }.build())? 
- }; - Ok(Self::TTL(ttl)) + let ttl = TimeToLive::from_humantime_or_str(value) + .map_err(|_| InvalidSetRegionOptionRequestSnafu { key, value }.build())?; + + Ok(Self::Ttl(Some(ttl))) } TWCS_MAX_ACTIVE_WINDOW_RUNS | TWCS_MAX_ACTIVE_WINDOW_FILES @@ -798,7 +795,7 @@ impl From<&UnsetRegionOption> for SetRegionOption { UnsetRegionOption::TwcsTimeWindow => { SetRegionOption::Twsc(unset_option.to_string(), String::new()) } - UnsetRegionOption::Ttl => SetRegionOption::TTL(Duration::default()), + UnsetRegionOption::Ttl => SetRegionOption::Ttl(Default::default()), } } } diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs index 7f4ddb409acd..6dfc47314a36 100644 --- a/src/table/src/metadata.rs +++ b/src/table/src/metadata.rs @@ -224,12 +224,8 @@ impl TableMeta { for request in requests { match request { - SetRegionOption::TTL(new_ttl) => { - if new_ttl.is_zero() { - new_options.ttl = None; - } else { - new_options.ttl = Some(*new_ttl); - } + SetRegionOption::Ttl(new_ttl) => { + new_options.ttl = *new_ttl; } SetRegionOption::Twsc(key, value) => { if !value.is_empty() { @@ -826,6 +822,15 @@ impl TableInfo { .extra_options .contains_key(PHYSICAL_TABLE_METADATA_KEY) } + + /// Return true if the table's TTL is `instant`. + pub fn is_ttl_instant_table(&self) -> bool { + self.meta + .options + .ttl + .map(|t| t.is_instant()) + .unwrap_or(false) + } } impl TableInfoBuilder { diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs index 4c273e03b785..74554631c62d 100644 --- a/src/table/src/requests.rs +++ b/src/table/src/requests.rs @@ -17,12 +17,12 @@ use std::collections::HashMap; use std::fmt; use std::str::FromStr; -use std::time::Duration; use common_base::readable_size::ReadableSize; use common_datasource::object_store::s3::is_supported_in_s3; use common_query::AddColumnLocation; use common_time::range::TimestampRange; +use common_time::TimeToLive; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::VectorRef; use datatypes::schema::{ColumnSchema, FulltextOptions}; @@ -74,8 +74,7 @@ pub struct TableOptions { /// Memtable size of memtable. pub write_buffer_size: Option, /// Time-to-live of table. Expired data will be automatically purged. - #[serde(with = "humantime_serde")] - pub ttl: Option, + pub ttl: Option, /// Extra options that may not applicable to all table engines. pub extra_options: HashMap, } @@ -109,16 +108,13 @@ impl TableOptions { } if let Some(ttl) = kvs.get(TTL_KEY) { - let ttl_value = ttl - .parse::() - .map_err(|_| { - ParseTableOptionSnafu { - key: TTL_KEY, - value: ttl, - } - .build() - })? 
- .into(); + let ttl_value = TimeToLive::from_humantime_or_str(ttl).map_err(|_| { + ParseTableOptionSnafu { + key: TTL_KEY, + value: ttl, + } + .build() + })?; options.ttl = Some(ttl_value); } @@ -138,8 +134,8 @@ impl fmt::Display for TableOptions { key_vals.push(format!("{}={}", WRITE_BUFFER_SIZE_KEY, size)); } - if let Some(ttl) = self.ttl { - key_vals.push(format!("{}={}", TTL_KEY, humantime::Duration::from(ttl))); + if let Some(ttl) = self.ttl.map(|ttl| ttl.to_string()) { + key_vals.push(format!("{}={}", TTL_KEY, ttl)); } for (k, v) in &self.extra_options { @@ -159,8 +155,7 @@ impl From<&TableOptions> for HashMap { write_buffer_size.to_string(), ); } - if let Some(ttl) = opts.ttl { - let ttl_str = humantime::format_duration(ttl).to_string(); + if let Some(ttl_str) = opts.ttl.map(|ttl| ttl.to_string()) { let _ = res.insert(TTL_KEY.to_string(), ttl_str); } res.extend( @@ -326,6 +321,8 @@ pub struct CopyDatabaseRequest { #[cfg(test)] mod tests { + use std::time::Duration; + use super::*; #[test] @@ -343,7 +340,7 @@ mod tests { fn test_serialize_table_options() { let options = TableOptions { write_buffer_size: None, - ttl: Some(Duration::from_secs(1000)), + ttl: Some(Duration::from_secs(1000).into()), extra_options: HashMap::new(), }; let serialized = serde_json::to_string(&options).unwrap(); @@ -355,7 +352,7 @@ mod tests { fn test_convert_hashmap_between_table_options() { let options = TableOptions { write_buffer_size: Some(ReadableSize::mb(128)), - ttl: Some(Duration::from_secs(1000)), + ttl: Some(Duration::from_secs(1000).into()), extra_options: HashMap::new(), }; let serialized_map = HashMap::from(&options); @@ -364,7 +361,7 @@ mod tests { let options = TableOptions { write_buffer_size: None, - ttl: None, + ttl: Default::default(), extra_options: HashMap::new(), }; let serialized_map = HashMap::from(&options); @@ -373,7 +370,7 @@ mod tests { let options = TableOptions { write_buffer_size: Some(ReadableSize::mb(128)), - ttl: Some(Duration::from_secs(1000)), + ttl: Some(Duration::from_secs(1000).into()), extra_options: HashMap::from([("a".to_string(), "A".to_string())]), }; let serialized_map = HashMap::from(&options); @@ -385,7 +382,7 @@ mod tests { fn test_table_options_to_string() { let options = TableOptions { write_buffer_size: Some(ReadableSize::mb(128)), - ttl: Some(Duration::from_secs(1000)), + ttl: Some(Duration::from_secs(1000).into()), extra_options: HashMap::new(), }; @@ -396,7 +393,7 @@ mod tests { let options = TableOptions { write_buffer_size: Some(ReadableSize::mb(128)), - ttl: Some(Duration::from_secs(1000)), + ttl: Some(Duration::from_secs(1000).into()), extra_options: HashMap::from([("a".to_string(), "A".to_string())]), }; diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result index a98d48323659..8ff458989e4c 100644 --- a/tests/cases/standalone/common/alter/alter_database.result +++ b/tests/cases/standalone/common/alter/alter_database.result @@ -62,6 +62,9 @@ SHOW CREATE DATABASE alter_database; | Database | Create Database | +----------------+----------------------------------------------+ | alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | ttl = 'forever' | +| | ) | +----------------+----------------------------------------------+ ALTER DATABASE alter_database SET 'ttl'='😁'; diff --git a/tests/cases/standalone/common/alter/alter_table_options.result b/tests/cases/standalone/common/alter/alter_table_options.result index 8fa08eefea6b..b38a99d8465e 100644 --- 
a/tests/cases/standalone/common/alter/alter_table_options.result +++ b/tests/cases/standalone/common/alter/alter_table_options.result @@ -103,7 +103,9 @@ SHOW CREATE TABLE ato; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | ttl = 'forever' | +| | ) | +-------+------------------------------------+ ALTER TABLE ato SET 'ttl'='1s'; diff --git a/tests/cases/standalone/common/flow/flow_advance_ttl.result b/tests/cases/standalone/common/flow/flow_advance_ttl.result new file mode 100644 index 000000000000..38d14d6b3155 --- /dev/null +++ b/tests/cases/standalone/common/flow/flow_advance_ttl.result @@ -0,0 +1,101 @@ +-- test ttl = instant +CREATE TABLE distinct_basic ( + number INT, + ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(number), + TIME INDEX(ts) +)WITH ('ttl' = 'instant'); + +Affected Rows: 0 + +CREATE FLOW test_distinct_basic SINK TO out_distinct_basic AS +SELECT + DISTINCT number as dis +FROM + distinct_basic; + +Affected Rows: 0 + +-- SQLNESS ARG restart=true +INSERT INTO + distinct_basic +VALUES + (20, "2021-07-01 00:00:00.200"), + (20, "2021-07-01 00:00:00.200"), + (22, "2021-07-01 00:00:00.600"); + +Affected Rows: 0 + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('test_distinct_basic'); + ++-----------------------------------------+ +| ADMIN FLUSH_FLOW('test_distinct_basic') | ++-----------------------------------------+ +| FLOW_FLUSHED | ++-----------------------------------------+ + +SHOW CREATE TABLE distinct_basic; + ++----------------+-----------------------------------------------------------+ +| Table | Create Table | ++----------------+-----------------------------------------------------------+ +| distinct_basic | CREATE TABLE IF NOT EXISTS "distinct_basic" ( | +| | "number" INT NULL, | +| | "ts" TIMESTAMP(3) NOT NULL DEFAULT current_timestamp(), | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("number") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'instant' | +| | ) | ++----------------+-----------------------------------------------------------+ + +SHOW CREATE TABLE out_distinct_basic; + ++--------------------+---------------------------------------------------+ +| Table | Create Table | ++--------------------+---------------------------------------------------+ +| out_distinct_basic | CREATE TABLE IF NOT EXISTS "out_distinct_basic" ( | +| | "dis" INT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("dis") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++--------------------+---------------------------------------------------+ + +SELECT + dis +FROM + out_distinct_basic; + ++-----+ +| dis | ++-----+ +| 20 | +| 22 | ++-----+ + +SELECT number FROM distinct_basic; + +++ +++ + +DROP FLOW test_distinct_basic; + +Affected Rows: 0 + +DROP TABLE distinct_basic; + +Affected Rows: 0 + +DROP TABLE out_distinct_basic; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/flow/flow_advance_ttl.sql b/tests/cases/standalone/common/flow/flow_advance_ttl.sql new file mode 100644 index 000000000000..18dfea25db04 --- /dev/null +++ b/tests/cases/standalone/common/flow/flow_advance_ttl.sql @@ -0,0 +1,39 @@ +-- test ttl = instant +CREATE TABLE distinct_basic ( + number INT, + ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(number), + TIME INDEX(ts) +)WITH ('ttl' = 'instant'); + +CREATE FLOW test_distinct_basic SINK TO out_distinct_basic AS +SELECT + DISTINCT number as dis +FROM + 
distinct_basic; + +-- SQLNESS ARG restart=true +INSERT INTO + distinct_basic +VALUES + (20, "2021-07-01 00:00:00.200"), + (20, "2021-07-01 00:00:00.200"), + (22, "2021-07-01 00:00:00.600"); + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('test_distinct_basic'); + +SHOW CREATE TABLE distinct_basic; + +SHOW CREATE TABLE out_distinct_basic; + +SELECT + dis +FROM + out_distinct_basic; + +SELECT number FROM distinct_basic; + +DROP FLOW test_distinct_basic; +DROP TABLE distinct_basic; +DROP TABLE out_distinct_basic; \ No newline at end of file diff --git a/tests/cases/standalone/common/flow/flow_basic.result b/tests/cases/standalone/common/flow/flow_basic.result index cc9b4e038b0f..8ee6a90c83bf 100644 --- a/tests/cases/standalone/common/flow/flow_basic.result +++ b/tests/cases/standalone/common/flow/flow_basic.result @@ -227,6 +227,23 @@ ADMIN FLUSH_FLOW('test_distinct_basic'); | FLOW_FLUSHED | +-----------------------------------------+ +SHOW CREATE TABLE out_distinct_basic; + ++--------------------+---------------------------------------------------+ +| Table | Create Table | ++--------------------+---------------------------------------------------+ +| out_distinct_basic | CREATE TABLE IF NOT EXISTS "out_distinct_basic" ( | +| | "dis" INT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("dis") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++--------------------+---------------------------------------------------+ + SELECT dis FROM @@ -478,6 +495,23 @@ ADMIN FLUSH_FLOW('calc_ngx_country'); | FLOW_FLUSHED | +--------------------------------------+ +SHOW CREATE TABLE ngx_country; + ++-------------+---------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "ngx_access_log.country" STRING NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("ngx_access_log.country") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++-------------+---------------------------------------------+ + SELECT "ngx_access_log.country" FROM @@ -594,6 +628,24 @@ ADMIN FLUSH_FLOW('calc_ngx_country'); | FLOW_FLUSHED | +--------------------------------------+ +SHOW CREATE TABLE ngx_country; + ++-------------+---------------------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "ngx_access_log.country" STRING NULL, | +| | "time_window" TIMESTAMP(3) NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("ngx_access_log.country", "time_window") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++-------------+---------------------------------------------------------+ + SELECT "ngx_access_log.country", time_window diff --git a/tests/cases/standalone/common/flow/flow_basic.sql b/tests/cases/standalone/common/flow/flow_basic.sql index 70d7b14157c2..43a42de4dd5f 100644 --- a/tests/cases/standalone/common/flow/flow_basic.sql +++ b/tests/cases/standalone/common/flow/flow_basic.sql @@ -128,6 +128,8 @@ VALUES -- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 
FLOW_FLUSHED | ADMIN FLUSH_FLOW('test_distinct_basic'); +SHOW CREATE TABLE out_distinct_basic; + SELECT dis FROM @@ -270,6 +272,8 @@ VALUES -- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | ADMIN FLUSH_FLOW('calc_ngx_country'); +SHOW CREATE TABLE ngx_country; + SELECT "ngx_access_log.country" FROM @@ -333,6 +337,8 @@ VALUES -- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | ADMIN FLUSH_FLOW('calc_ngx_country'); +SHOW CREATE TABLE ngx_country; + SELECT "ngx_access_log.country", time_window diff --git a/tests/cases/standalone/common/ttl/show_ttl.result b/tests/cases/standalone/common/ttl/show_ttl.result new file mode 100644 index 000000000000..d98c1b612bca --- /dev/null +++ b/tests/cases/standalone/common/ttl/show_ttl.result @@ -0,0 +1,374 @@ +CREATE DATABASE test_ttl_db WITH (ttl = '1 second'); + +Affected Rows: 1 + +USE test_ttl_db; + +Affected Rows: 0 + +CREATE TABLE test_ttl(ts TIMESTAMP TIME INDEX, val INT); + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = '1s' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = '1s' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER DATABASE test_ttl_db SET ttl = '1 day'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = '1day' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = '1day' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER TABLE test_ttl SET 'ttl' = '6 hours'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = '6h' | +| | ) | ++----------+-----------------------------------------+ + +ALTER TABLE test_ttl SET 'ttl' = 'instant'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT 
NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'instant' | +| | ) | ++----------+-----------------------------------------+ + +ALTER TABLE test_ttl SET 'ttl' = '0s'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++----------+-----------------------------------------+ + +ALTER TABLE test_ttl SET 'ttl' = 'forever'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = '1day' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER TABLE test_ttl UNSET 'ttl'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = '1day' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = '1day' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER DATABASE test_ttl_db SET 'ttl' = 'forever'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER DATABASE test_ttl_db SET 'ttl' = '0s'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF 
NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER DATABASE test_ttl_db SET 'ttl' = 'instant'; + +Error: 1004(InvalidArguments), Invalid set database option, key: ttl, value: instant + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | +| | WITH( | +| | ttl = 'forever' | +| | ) | ++-------------+-------------------------------------------+ + +ALTER DATABASE test_ttl_db UNSET 'ttl'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | ++-------------+-------------------------------------------+ + +ALTER TABLE test_ttl UNSET 'ttl'; + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++----------+-----------------------------------------+ + +SHOW CREATE DATABASE test_ttl_db; + ++-------------+-------------------------------------------+ +| Database | Create Database | ++-------------+-------------------------------------------+ +| test_ttl_db | CREATE DATABASE IF NOT EXISTS test_ttl_db | ++-------------+-------------------------------------------+ + +DROP TABLE test_ttl; + +Affected Rows: 0 + +USE public; + +Affected Rows: 0 + +DROP DATABASE test_ttl_db; + +Affected Rows: 0 + +-- test both set database to instant and alter ttl to instant for a database is forbidden +CREATE DATABASE test_ttl_db WITH (ttl = 'instant'); + +Error: 1002(Unexpected), Failed to parse value instant into key ttl + +CREATE DATABASE test_ttl_db_2 WITH (ttl = '1s'); + +Affected Rows: 1 + +ALTER DATABASE 
test_ttl_db_2 SET 'ttl' = 'instant'; + +Error: 1004(InvalidArguments), Invalid set database option, key: ttl, value: instant + diff --git a/tests/cases/standalone/common/ttl/show_ttl.sql b/tests/cases/standalone/common/ttl/show_ttl.sql new file mode 100644 index 000000000000..d226b96211d5 --- /dev/null +++ b/tests/cases/standalone/common/ttl/show_ttl.sql @@ -0,0 +1,82 @@ +CREATE DATABASE test_ttl_db WITH (ttl = '1 second'); + +USE test_ttl_db; + +CREATE TABLE test_ttl(ts TIMESTAMP TIME INDEX, val INT); + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER DATABASE test_ttl_db SET ttl = '1 day'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER TABLE test_ttl SET 'ttl' = '6 hours'; + +SHOW CREATE TABLE test_ttl; + +ALTER TABLE test_ttl SET 'ttl' = 'instant'; + +SHOW CREATE TABLE test_ttl; + +ALTER TABLE test_ttl SET 'ttl' = '0s'; + +SHOW CREATE TABLE test_ttl; + +ALTER TABLE test_ttl SET 'ttl' = 'forever'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER TABLE test_ttl UNSET 'ttl'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER DATABASE test_ttl_db SET 'ttl' = 'forever'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER DATABASE test_ttl_db SET 'ttl' = '0s'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER DATABASE test_ttl_db SET 'ttl' = 'instant'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER DATABASE test_ttl_db UNSET 'ttl'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +ALTER TABLE test_ttl UNSET 'ttl'; + +SHOW CREATE TABLE test_ttl; + +SHOW CREATE DATABASE test_ttl_db; + +DROP TABLE test_ttl; + +USE public; + +DROP DATABASE test_ttl_db; + +-- test both set database to instant and alter ttl to instant for a database is forbidden +CREATE DATABASE test_ttl_db WITH (ttl = 'instant'); + +CREATE DATABASE test_ttl_db_2 WITH (ttl = '1s'); + +ALTER DATABASE test_ttl_db_2 SET 'ttl' = 'instant'; diff --git a/tests/cases/standalone/common/ttl/ttl_instant.result b/tests/cases/standalone/common/ttl/ttl_instant.result new file mode 100644 index 000000000000..57f12e01ddc7 --- /dev/null +++ b/tests/cases/standalone/common/ttl/ttl_instant.result @@ -0,0 +1,340 @@ +CREATE TABLE test_ttl( + ts TIMESTAMP TIME INDEX, + val INT, + PRIMARY KEY (`val`) +) WITH (ttl = 'instant'); + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("val") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = 'instant' | +| | ) | ++----------+-----------------------------------------+ + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +Affected Rows: 0 + +SELECT + val +from + test_ttl +ORDER BY + val; + +++ +++ + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + ++-------------------------------+ +| ADMIN flush_table('test_ttl') | ++-------------------------------+ +| 0 | ++-------------------------------+ + +ADMIN compact_table('test_ttl'); + ++---------------------------------+ +| ADMIN compact_table('test_ttl') | ++---------------------------------+ +| 0 | ++---------------------------------+ + +SELECT + val +from + test_ttl +ORDER BY + val; + +++ +++ + +ALTER TABLE + test_ttl UNSET 
'ttl'; + +Affected Rows: 0 + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +Affected Rows: 3 + +SELECT + val +from + test_ttl +ORDER BY + val; + ++-----+ +| val | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ + +DROP TABLE test_ttl; + +Affected Rows: 0 + +CREATE TABLE test_ttl( + ts TIMESTAMP TIME INDEX, + val INT, + PRIMARY KEY (`val`) +) WITH (ttl = '1s'); + +Affected Rows: 0 + +SHOW CREATE TABLE test_ttl; + ++----------+-----------------------------------------+ +| Table | Create Table | ++----------+-----------------------------------------+ +| test_ttl | CREATE TABLE IF NOT EXISTS "test_ttl" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" INT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("val") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | ttl = '1s' | +| | ) | ++----------+-----------------------------------------+ + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +Affected Rows: 3 + +SELECT + val +from + test_ttl +ORDER BY + val; + ++-----+ +| val | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ + +ADMIN flush_table('test_ttl'); + ++-------------------------------+ +| ADMIN flush_table('test_ttl') | ++-------------------------------+ +| 0 | ++-------------------------------+ + +ADMIN compact_table('test_ttl'); + ++---------------------------------+ +| ADMIN compact_table('test_ttl') | ++---------------------------------+ +| 0 | ++---------------------------------+ + +SELECT + val +from + test_ttl +ORDER BY + val; + ++-----+ +| val | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + ++-------------------------------+ +| ADMIN flush_table('test_ttl') | ++-------------------------------+ +| 0 | ++-------------------------------+ + +ADMIN compact_table('test_ttl'); + ++---------------------------------+ +| ADMIN compact_table('test_ttl') | ++---------------------------------+ +| 0 | ++---------------------------------+ + +SELECT + val +from + test_ttl +ORDER BY + val; + +++ +++ + +ALTER TABLE + test_ttl +SET + ttl = '1d'; + +Affected Rows: 0 + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +Affected Rows: 3 + +SELECT + val +from + test_ttl +ORDER BY + val; + ++-----+ +| val | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ + +ALTER TABLE + test_ttl +SET + ttl = 'instant'; + +Affected Rows: 0 + +ADMIN flush_table('test_ttl'); + ++-------------------------------+ +| ADMIN flush_table('test_ttl') | ++-------------------------------+ +| 0 | ++-------------------------------+ + +ADMIN compact_table('test_ttl'); + ++---------------------------------+ +| ADMIN compact_table('test_ttl') | ++---------------------------------+ +| 0 | ++---------------------------------+ + +SELECT + val +from + test_ttl +ORDER BY + val; + +++ +++ + +-- to make sure alter back and forth from duration to/from instant wouldn't break anything +ALTER TABLE + test_ttl +SET + ttl = '1s'; + +Affected Rows: 0 + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +Affected Rows: 3 + +SELECT + val +from + test_ttl +ORDER BY + val; + ++-----+ +| val | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + ++-------------------------------+ +| ADMIN flush_table('test_ttl') | ++-------------------------------+ +| 0 | ++-------------------------------+ + +ADMIN compact_table('test_ttl'); + ++---------------------------------+ +| ADMIN compact_table('test_ttl') | ++---------------------------------+ +| 0 | ++---------------------------------+ 
+ +SELECT + val +from + test_ttl +ORDER BY + val; + +++ +++ + +DROP TABLE test_ttl; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/ttl/ttl_instant.sql b/tests/cases/standalone/common/ttl/ttl_instant.sql new file mode 100644 index 000000000000..b76128ccdf0b --- /dev/null +++ b/tests/cases/standalone/common/ttl/ttl_instant.sql @@ -0,0 +1,166 @@ +CREATE TABLE test_ttl( + ts TIMESTAMP TIME INDEX, + val INT, + PRIMARY KEY (`val`) +) WITH (ttl = 'instant'); + +SHOW CREATE TABLE test_ttl; + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +SELECT + val +from + test_ttl +ORDER BY + val; + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + +ADMIN compact_table('test_ttl'); + +SELECT + val +from + test_ttl +ORDER BY + val; + +ALTER TABLE + test_ttl UNSET 'ttl'; + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +SELECT + val +from + test_ttl +ORDER BY + val; + +DROP TABLE test_ttl; + +CREATE TABLE test_ttl( + ts TIMESTAMP TIME INDEX, + val INT, + PRIMARY KEY (`val`) +) WITH (ttl = '1s'); + +SHOW CREATE TABLE test_ttl; + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +SELECT + val +from + test_ttl +ORDER BY + val; + +ADMIN flush_table('test_ttl'); + +ADMIN compact_table('test_ttl'); + +SELECT + val +from + test_ttl +ORDER BY + val; + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + +ADMIN compact_table('test_ttl'); + +SELECT + val +from + test_ttl +ORDER BY + val; + +ALTER TABLE + test_ttl +SET + ttl = '1d'; + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +SELECT + val +from + test_ttl +ORDER BY + val; + +ALTER TABLE + test_ttl +SET + ttl = 'instant'; + +ADMIN flush_table('test_ttl'); + +ADMIN compact_table('test_ttl'); + +SELECT + val +from + test_ttl +ORDER BY + val; + +-- to make sure alter back and forth from duration to/from instant wouldn't break anything +ALTER TABLE + test_ttl +SET + ttl = '1s'; + +INSERT INTO + test_ttl +VALUES + (now(), 1), + (now(), 2), + (now(), 3); + +SELECT + val +from + test_ttl +ORDER BY + val; + +-- SQLNESS SLEEP 2s +ADMIN flush_table('test_ttl'); + +ADMIN compact_table('test_ttl'); + +SELECT + val +from + test_ttl +ORDER BY + val; + +DROP TABLE test_ttl; From 3133f3fb4e39edd94b58d2a350cc9380b3ff259d Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 6 Dec 2024 17:32:22 +0800 Subject: [PATCH 08/36] feat: add cursor statements (#5094) * feat: add sql parsers for cursor operations * feat: cursor operator * feat: implement RecordBatchStreamCursor * feat: implement cursor storage and execution * test: add tests * chore: update docstring * feat: add a temporary sql rewrite for cast in limit this issue is described in #5097 * test: add more sql for cursor integration test * feat: reject non-select query for cursor statement * refactor: address review issues * test: add empty result case * feat: address review comments --- Cargo.lock | 2 + src/common/recordbatch/src/cursor.rs | 173 ++++++++++++++++++++++ src/common/recordbatch/src/error.rs | 10 +- src/common/recordbatch/src/lib.rs | 1 + src/common/recordbatch/src/recordbatch.rs | 121 ++++++++++++++- src/frontend/src/instance.rs | 8 +- src/operator/src/error.rs | 10 +- src/operator/src/statement.rs | 11 ++ src/operator/src/statement/cursor.rs | 98 ++++++++++++ src/servers/src/postgres/fixtures.rs | 17 +++ src/servers/src/postgres/handler.rs | 6 + src/session/Cargo.toml | 2 + src/session/src/context.rs | 25 ++++ src/session/src/lib.rs | 6 + src/sql/src/parser.rs | 6 + 
src/sql/src/parsers.rs | 1 + src/sql/src/parsers/cursor_parser.rs | 157 ++++++++++++++++++++ src/sql/src/statements.rs | 3 +- src/sql/src/statements/cursor.rs | 60 ++++++++ src/sql/src/statements/statement.rs | 10 ++ tests-integration/tests/sql.rs | 64 ++++++++ 21 files changed, 786 insertions(+), 5 deletions(-) create mode 100644 src/common/recordbatch/src/cursor.rs create mode 100644 src/operator/src/statement/cursor.rs create mode 100644 src/sql/src/parsers/cursor_parser.rs create mode 100644 src/sql/src/statements/cursor.rs diff --git a/Cargo.lock b/Cargo.lock index f677ee269d4e..16a234728983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10987,9 +10987,11 @@ dependencies = [ "common-catalog", "common-error", "common-macro", + "common-recordbatch", "common-telemetry", "common-time", "derive_builder 0.12.0", + "derive_more", "meter-core", "snafu 0.8.5", "sql", diff --git a/src/common/recordbatch/src/cursor.rs b/src/common/recordbatch/src/cursor.rs new file mode 100644 index 000000000000..a741953ccc25 --- /dev/null +++ b/src/common/recordbatch/src/cursor.rs @@ -0,0 +1,173 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use futures::StreamExt; +use tokio::sync::Mutex; + +use crate::error::Result; +use crate::recordbatch::merge_record_batches; +use crate::{RecordBatch, SendableRecordBatchStream}; + +struct Inner { + stream: SendableRecordBatchStream, + current_row_index: usize, + current_batch: Option, + total_rows_in_current_batch: usize, +} + +/// A cursor on RecordBatchStream that fetches data batch by batch +pub struct RecordBatchStreamCursor { + inner: Mutex, +} + +impl RecordBatchStreamCursor { + pub fn new(stream: SendableRecordBatchStream) -> RecordBatchStreamCursor { + Self { + inner: Mutex::new(Inner { + stream, + current_row_index: 0, + current_batch: None, + total_rows_in_current_batch: 0, + }), + } + } + + /// Take `size` of row from the `RecordBatchStream` and create a new + /// `RecordBatch` for these rows. 
+ pub async fn take(&self, size: usize) -> Result { + let mut remaining_rows_to_take = size; + let mut accumulated_rows = Vec::new(); + + let mut inner = self.inner.lock().await; + + while remaining_rows_to_take > 0 { + // Ensure we have a current batch or fetch the next one + if inner.current_batch.is_none() + || inner.current_row_index >= inner.total_rows_in_current_batch + { + match inner.stream.next().await { + Some(Ok(batch)) => { + inner.total_rows_in_current_batch = batch.num_rows(); + inner.current_batch = Some(batch); + inner.current_row_index = 0; + } + Some(Err(e)) => return Err(e), + None => { + // Stream is exhausted + break; + } + } + } + + // If we still have no batch after attempting to fetch + let current_batch = match &inner.current_batch { + Some(batch) => batch, + None => break, + }; + + // Calculate how many rows we can take from this batch + let rows_to_take_from_batch = remaining_rows_to_take + .min(inner.total_rows_in_current_batch - inner.current_row_index); + + // Slice the current batch to get the desired rows + let taken_batch = + current_batch.slice(inner.current_row_index, rows_to_take_from_batch)?; + + // Add the taken batch to accumulated rows + accumulated_rows.push(taken_batch); + + // Update cursor and remaining rows + inner.current_row_index += rows_to_take_from_batch; + remaining_rows_to_take -= rows_to_take_from_batch; + } + + // If no rows were accumulated, return empty + if accumulated_rows.is_empty() { + return Ok(RecordBatch::new_empty(inner.stream.schema())); + } + + // If only one batch was accumulated, return it directly + if accumulated_rows.len() == 1 { + return Ok(accumulated_rows.remove(0)); + } + + // Merge multiple batches + merge_record_batches(inner.stream.schema(), &accumulated_rows) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; + use datatypes::vectors::StringVector; + + use super::*; + use crate::RecordBatches; + + #[tokio::test] + async fn test_cursor() { + let schema = Arc::new(Schema::new(vec![ColumnSchema::new( + "a", + ConcreteDataType::string_datatype(), + false, + )])); + + let rbs = RecordBatches::try_from_columns( + schema.clone(), + vec![Arc::new(StringVector::from(vec!["hello", "world"])) as _], + ) + .unwrap(); + + let cursor = RecordBatchStreamCursor::new(rbs.as_stream()); + let result_rb = cursor.take(1).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 1); + + let result_rb = cursor.take(1).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 1); + + let result_rb = cursor.take(1).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 0); + + let rb = RecordBatch::new( + schema.clone(), + vec![Arc::new(StringVector::from(vec!["hello", "world"])) as _], + ) + .unwrap(); + let rbs2 = + RecordBatches::try_new(schema.clone(), vec![rb.clone(), rb.clone(), rb]).unwrap(); + let cursor = RecordBatchStreamCursor::new(rbs2.as_stream()); + let result_rb = cursor.take(3).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 3); + let result_rb = cursor.take(2).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 2); + let result_rb = cursor.take(2).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 1); + let result_rb = cursor.take(2).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 0); + + let rb = RecordBatch::new( + schema.clone(), + 
vec![Arc::new(StringVector::from(vec!["hello", "world"])) as _], + ) + .unwrap(); + let rbs3 = + RecordBatches::try_new(schema.clone(), vec![rb.clone(), rb.clone(), rb]).unwrap(); + let cursor = RecordBatchStreamCursor::new(rbs3.as_stream()); + let result_rb = cursor.take(10).await.expect("take from cursor failed"); + assert_eq!(result_rb.num_rows(), 6); + } +} diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs index 6e038d1b7e70..6a1c61c0a0f0 100644 --- a/src/common/recordbatch/src/error.rs +++ b/src/common/recordbatch/src/error.rs @@ -168,6 +168,13 @@ pub enum Error { #[snafu(source)] error: tokio::time::error::Elapsed, }, + #[snafu(display("RecordBatch slice index overflow: {visit_index} > {size}"))] + RecordBatchSliceIndexOverflow { + #[snafu(implicit)] + location: Location, + size: usize, + visit_index: usize, + }, } impl ErrorExt for Error { @@ -182,7 +189,8 @@ impl ErrorExt for Error { | Error::Format { .. } | Error::ToArrowScalar { .. } | Error::ProjectArrowRecordBatch { .. } - | Error::PhysicalExpr { .. } => StatusCode::Internal, + | Error::PhysicalExpr { .. } + | Error::RecordBatchSliceIndexOverflow { .. } => StatusCode::Internal, Error::PollStream { .. } => StatusCode::EngineExecuteQuery, diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 0016e02e94ed..257b6f09732a 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -15,6 +15,7 @@ #![feature(never_type)] pub mod adapter; +pub mod cursor; pub mod error; pub mod filter; mod recordbatch; diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index 71f7f60685e5..4641cc0d9a60 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -23,7 +23,7 @@ use datatypes::value::Value; use datatypes::vectors::{Helper, VectorRef}; use serde::ser::{Error, SerializeStruct}; use serde::{Serialize, Serializer}; -use snafu::{OptionExt, ResultExt}; +use snafu::{ensure, OptionExt, ResultExt}; use crate::error::{ self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu, @@ -194,6 +194,19 @@ impl RecordBatch { .map(|t| t.to_string()) .unwrap_or("failed to pretty display a record batch".to_string()) } + + /// Return a slice record batch starts from offset, with len rows + pub fn slice(&self, offset: usize, len: usize) -> Result { + ensure!( + offset + len <= self.num_rows(), + error::RecordBatchSliceIndexOverflowSnafu { + size: self.num_rows(), + visit_index: offset + len + } + ); + let columns = self.columns.iter().map(|vector| vector.slice(offset, len)); + RecordBatch::new(self.schema.clone(), columns) + } } impl Serialize for RecordBatch { @@ -256,6 +269,36 @@ impl Iterator for RecordBatchRowIterator<'_> { } } +/// merge multiple recordbatch into a single +pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result { + let batches_len = batches.len(); + if batches_len == 0 { + return Ok(RecordBatch::new_empty(schema)); + } + + let n_rows = batches.iter().map(|b| b.num_rows()).sum(); + let n_columns = schema.num_columns(); + // Collect arrays from each batch + let mut merged_columns = Vec::with_capacity(n_columns); + + for col_idx in 0..n_columns { + let mut acc = schema.column_schemas()[col_idx] + .data_type + .create_mutable_vector(n_rows); + + for batch in batches { + let column = batch.column(col_idx); + acc.extend_slice_of(column.as_ref(), 0, column.len()) + .context(error::DataTypesSnafu)?; + 
} + + merged_columns.push(acc.to_vector()); + } + + // Create a new RecordBatch with merged columns + RecordBatch::new(schema, merged_columns) +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -375,4 +418,80 @@ mod tests { assert!(record_batch_iter.next().is_none()); } + + #[test] + fn test_record_batch_slice() { + let column_schemas = vec![ + ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + let columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])), + Arc::new(StringVector::from(vec![ + None, + Some("hello"), + Some("greptime"), + None, + ])), + ]; + let recordbatch = RecordBatch::new(schema, columns).unwrap(); + let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice"); + let mut record_batch_iter = recordbatch.rows(); + assert_eq!( + vec![Value::UInt32(2), Value::String("hello".into())], + record_batch_iter + .next() + .unwrap() + .into_iter() + .collect::>() + ); + + assert_eq!( + vec![Value::UInt32(3), Value::String("greptime".into())], + record_batch_iter + .next() + .unwrap() + .into_iter() + .collect::>() + ); + + assert!(record_batch_iter.next().is_none()); + + assert!(recordbatch.slice(1, 5).is_err()); + } + + #[test] + fn test_merge_record_batch() { + let column_schemas = vec![ + ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + let columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])), + Arc::new(StringVector::from(vec![ + None, + Some("hello"), + Some("greptime"), + None, + ])), + ]; + let recordbatch = RecordBatch::new(schema.clone(), columns).unwrap(); + + let columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])), + Arc::new(StringVector::from(vec![ + None, + Some("hello"), + Some("greptime"), + None, + ])), + ]; + let recordbatch2 = RecordBatch::new(schema.clone(), columns).unwrap(); + + let merged = merge_record_batches(schema.clone(), &[recordbatch, recordbatch2]) + .expect("merge recordbatch"); + assert_eq!(merged.num_rows(), 8); + } } diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index ad387cc5dd96..b22bde96e0ff 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -487,7 +487,11 @@ pub fn check_permission( // TODO(dennis): add a hook for admin commands. Statement::Admin(_) => {} // These are executed by query engine, and will be checked there. 
- Statement::Query(_) | Statement::Explain(_) | Statement::Tql(_) | Statement::Delete(_) => {} + Statement::Query(_) + | Statement::Explain(_) + | Statement::Tql(_) + | Statement::Delete(_) + | Statement::DeclareCursor(_) => {} // database ops won't be checked Statement::CreateDatabase(_) | Statement::ShowDatabases(_) @@ -580,6 +584,8 @@ pub fn check_permission( Statement::TruncateTable(stmt) => { validate_param(stmt.table_name(), query_ctx)?; } + // cursor operations are always allowed once it's created + Statement::FetchCursor(_) | Statement::CloseCursor(_) => {} } Ok(()) } diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index 48bc7a81c221..3a5aae897399 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -786,6 +786,12 @@ pub enum Error { #[snafu(source)] error: Elapsed, }, + + #[snafu(display("Cursor {name} is not found"))] + CursorNotFound { name: String }, + + #[snafu(display("A cursor named {name} already exists"))] + CursorExists { name: String }, } pub type Result = std::result::Result; @@ -825,7 +831,9 @@ impl ErrorExt for Error { | Error::FunctionArityMismatch { .. } | Error::InvalidPartition { .. } | Error::PhysicalExpr { .. } - | Error::InvalidJsonFormat { .. } => StatusCode::InvalidArguments, + | Error::InvalidJsonFormat { .. } + | Error::CursorNotFound { .. } + | Error::CursorExists { .. } => StatusCode::InvalidArguments, Error::TableAlreadyExists { .. } | Error::ViewAlreadyExists { .. } => { StatusCode::TableAlreadyExists diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 64417dbd6b0d..b3251ca6bf2c 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -16,6 +16,7 @@ mod admin; mod copy_database; mod copy_table_from; mod copy_table_to; +mod cursor; mod ddl; mod describe; mod dml; @@ -133,6 +134,16 @@ impl StatementExecutor { self.plan_exec(QueryStatement::Sql(stmt), query_ctx).await } + Statement::DeclareCursor(declare_cursor) => { + self.declare_cursor(declare_cursor, query_ctx).await + } + Statement::FetchCursor(fetch_cursor) => { + self.fetch_cursor(fetch_cursor, query_ctx).await + } + Statement::CloseCursor(close_cursor) => { + self.close_cursor(close_cursor, query_ctx).await + } + Statement::Insert(insert) => self.insert(insert, query_ctx).await, Statement::Tql(tql) => self.execute_tql(tql, query_ctx).await, diff --git a/src/operator/src/statement/cursor.rs b/src/operator/src/statement/cursor.rs new file mode 100644 index 000000000000..85de4ef36697 --- /dev/null +++ b/src/operator/src/statement/cursor.rs @@ -0,0 +1,98 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_query::{Output, OutputData}; +use common_recordbatch::cursor::RecordBatchStreamCursor; +use common_recordbatch::RecordBatches; +use common_telemetry::tracing; +use query::parser::QueryStatement; +use session::context::QueryContextRef; +use snafu::ResultExt; +use sql::statements::cursor::{CloseCursor, DeclareCursor, FetchCursor}; +use sql::statements::statement::Statement; + +use crate::error::{self, Result}; +use crate::statement::StatementExecutor; + +impl StatementExecutor { + #[tracing::instrument(skip_all)] + pub(super) async fn declare_cursor( + &self, + declare_cursor: DeclareCursor, + query_ctx: QueryContextRef, + ) -> Result { + let cursor_name = declare_cursor.cursor_name.to_string(); + + if query_ctx.get_cursor(&cursor_name).is_some() { + error::CursorExistsSnafu { + name: cursor_name.to_string(), + } + .fail()?; + } + + let query_stmt = Statement::Query(declare_cursor.query); + + let output = self + .plan_exec(QueryStatement::Sql(query_stmt), query_ctx.clone()) + .await?; + match output.data { + OutputData::RecordBatches(rb) => { + let rbs = rb.as_stream(); + query_ctx.insert_cursor(cursor_name, RecordBatchStreamCursor::new(rbs)); + } + OutputData::Stream(rbs) => { + query_ctx.insert_cursor(cursor_name, RecordBatchStreamCursor::new(rbs)); + } + // Should not happen because we have query type ensured from parser. + OutputData::AffectedRows(_) => error::NotSupportedSnafu { + feat: "Non-query statement on cursor", + } + .fail()?, + } + + Ok(Output::new_with_affected_rows(0)) + } + + #[tracing::instrument(skip_all)] + pub(super) async fn fetch_cursor( + &self, + fetch_cursor: FetchCursor, + query_ctx: QueryContextRef, + ) -> Result { + let cursor_name = fetch_cursor.cursor_name.to_string(); + let fetch_size = fetch_cursor.fetch_size; + if let Some(rb) = query_ctx.get_cursor(&cursor_name) { + let record_batch = rb + .take(fetch_size as usize) + .await + .context(error::BuildRecordBatchSnafu)?; + let record_batches = + RecordBatches::try_new(record_batch.schema.clone(), vec![record_batch]) + .context(error::BuildRecordBatchSnafu)?; + Ok(Output::new_with_record_batches(record_batches)) + } else { + error::CursorNotFoundSnafu { name: cursor_name }.fail() + } + } + + #[tracing::instrument(skip_all)] + pub(super) async fn close_cursor( + &self, + close_cursor: CloseCursor, + query_ctx: QueryContextRef, + ) -> Result { + query_ctx.remove_cursor(&close_cursor.cursor_name.to_string()); + Ok(Output::new_with_affected_rows(0)) + } +} diff --git a/src/servers/src/postgres/fixtures.rs b/src/servers/src/postgres/fixtures.rs index 895f5c03e4a9..2ca3ad02eaa7 100644 --- a/src/servers/src/postgres/fixtures.rs +++ b/src/servers/src/postgres/fixtures.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; @@ -112,6 +113,13 @@ pub(crate) fn process<'a>(query: &str, query_ctx: QueryContextRef) -> Option = + Lazy::new(|| Regex::new("(?i)(LIMIT\\s+\\d+)::bigint").unwrap()); +pub(crate) fn rewrite_sql(query: &str) -> Cow<'_, str> { + //TODO(sunng87): remove this when we upgraded datafusion to 43 or newer + LIMIT_CAST_PATTERN.replace_all(query, "$1") +} + #[cfg(test)] mod test { use session::context::{QueryContext, QueryContextRef}; @@ -195,4 +203,13 @@ mod test { assert!(process("SHOW TABLES ", query_context.clone()).is_none()); assert!(process("SET TIME_ZONE=utc ", query_context.clone()).is_none()); } + + #[test] + fn test_rewrite() { + let sql = "SELECT * FROM number LIMIT 1::bigint"; + let sql2 = "SELECT * FROM number limit 1::BIGINT"; + + assert_eq!("SELECT * FROM number LIMIT 1", rewrite_sql(sql)); + assert_eq!("SELECT * FROM number limit 1", rewrite_sql(sql2)); + } } diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 522c558cdc71..e2e46534b5a1 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -70,6 +70,9 @@ impl SimpleQueryHandler for PostgresServerHandlerInner { return Ok(vec![Response::EmptyQuery]); } + let query = fixtures::rewrite_sql(query); + let query = query.as_ref(); + if let Some(resps) = fixtures::process(query, query_ctx.clone()) { send_warning_opt(client, query_ctx).await?; Ok(resps) @@ -229,6 +232,9 @@ impl QueryParser for DefaultQueryParser { }); } + let sql = fixtures::rewrite_sql(sql); + let sql = sql.as_ref(); + let mut stmts = ParserContext::create_with_dialect(sql, &PostgreSqlDialect {}, ParseOptions::default()) .map_err(|e| PgWireError::ApiError(Box::new(e)))?; diff --git a/src/session/Cargo.toml b/src/session/Cargo.toml index b6dbb0095546..f15d3b2609b3 100644 --- a/src/session/Cargo.toml +++ b/src/session/Cargo.toml @@ -17,9 +17,11 @@ auth.workspace = true common-catalog.workspace = true common-error.workspace = true common-macro.workspace = true +common-recordbatch.workspace = true common-telemetry.workspace = true common-time.workspace = true derive_builder.workspace = true +derive_more = { version = "1", default-features = false, features = ["debug"] } meter-core.workspace = true snafu.workspace = true sql.workspace = true diff --git a/src/session/src/context.rs b/src/session/src/context.rs index 4e681253c100..1c621b3ab711 100644 --- a/src/session/src/context.rs +++ b/src/session/src/context.rs @@ -23,6 +23,8 @@ use arc_swap::ArcSwap; use auth::UserInfoRef; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_catalog::{build_db_string, parse_catalog_and_schema_from_db_string}; +use common_recordbatch::cursor::RecordBatchStreamCursor; +use common_telemetry::warn; use common_time::timezone::parse_timezone; use common_time::Timezone; use derive_builder::Builder; @@ -34,6 +36,8 @@ use crate::MutableInner; pub type QueryContextRef = Arc; pub type ConnInfoRef = Arc; +const CURSOR_COUNT_WARNING_LIMIT: usize = 10; + #[derive(Debug, Builder, Clone)] #[builder(pattern = "owned")] #[builder(build_fn(skip))] @@ -299,6 +303,27 @@ impl QueryContext { pub fn set_query_timeout(&self, timeout: Duration) { self.mutable_session_data.write().unwrap().query_timeout = Some(timeout); } + + pub fn insert_cursor(&self, name: String, rb: RecordBatchStreamCursor) { + let mut guard = self.mutable_session_data.write().unwrap(); + guard.cursors.insert(name, Arc::new(rb)); + + let cursor_count = 
guard.cursors.len(); + if cursor_count > CURSOR_COUNT_WARNING_LIMIT { + warn!("Current connection has {} open cursors", cursor_count); + } + } + + pub fn remove_cursor(&self, name: &str) { + let mut guard = self.mutable_session_data.write().unwrap(); + guard.cursors.remove(name); + } + + pub fn get_cursor(&self, name: &str) -> Option> { + let guard = self.mutable_session_data.read().unwrap(); + let rb = guard.cursors.get(name); + rb.cloned() + } } impl QueryContextBuilder { diff --git a/src/session/src/lib.rs b/src/session/src/lib.rs index 5ddaae7eb579..f553fef58c42 100644 --- a/src/session/src/lib.rs +++ b/src/session/src/lib.rs @@ -16,6 +16,7 @@ pub mod context; pub mod session_config; pub mod table_name; +use std::collections::HashMap; use std::net::SocketAddr; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -23,9 +24,11 @@ use std::time::Duration; use auth::UserInfoRef; use common_catalog::build_db_string; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; +use common_recordbatch::cursor::RecordBatchStreamCursor; use common_time::timezone::get_timezone; use common_time::Timezone; use context::{ConfigurationVariables, QueryContextBuilder}; +use derive_more::Debug; use crate::context::{Channel, ConnInfo, QueryContextRef}; @@ -47,6 +50,8 @@ pub(crate) struct MutableInner { user_info: UserInfoRef, timezone: Timezone, query_timeout: Option, + #[debug(skip)] + pub(crate) cursors: HashMap>, } impl Default for MutableInner { @@ -56,6 +61,7 @@ impl Default for MutableInner { user_info: auth::userinfo_by_name(None), timezone: get_timezone(None).clone(), query_timeout: None, + cursors: HashMap::with_capacity(0), } } } diff --git a/src/sql/src/parser.rs b/src/sql/src/parser.rs index bf62a1ad9b67..da03031bc44e 100644 --- a/src/sql/src/parser.rs +++ b/src/sql/src/parser.rs @@ -167,6 +167,12 @@ impl ParserContext<'_> { self.parse_tql() } + Keyword::DECLARE => self.parse_declare_cursor(), + + Keyword::FETCH => self.parse_fetch_cursor(), + + Keyword::CLOSE => self.parse_close_cursor(), + Keyword::USE => { let _ = self.parser.next_token(); diff --git a/src/sql/src/parsers.rs b/src/sql/src/parsers.rs index 2ae0697231c5..26f3ae9903d7 100644 --- a/src/sql/src/parsers.rs +++ b/src/sql/src/parsers.rs @@ -16,6 +16,7 @@ pub(crate) mod admin_parser; mod alter_parser; pub(crate) mod copy_parser; pub(crate) mod create_parser; +pub(crate) mod cursor_parser; pub(crate) mod deallocate_parser; pub(crate) mod delete_parser; pub(crate) mod describe_parser; diff --git a/src/sql/src/parsers/cursor_parser.rs b/src/sql/src/parsers/cursor_parser.rs new file mode 100644 index 000000000000..706f820c189e --- /dev/null +++ b/src/sql/src/parsers/cursor_parser.rs @@ -0,0 +1,157 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use snafu::{ensure, ResultExt}; +use sqlparser::keywords::Keyword; +use sqlparser::tokenizer::Token; + +use crate::error::{self, Result}; +use crate::parser::ParserContext; +use crate::statements::cursor::{CloseCursor, DeclareCursor, FetchCursor}; +use crate::statements::statement::Statement; + +impl ParserContext<'_> { + pub(crate) fn parse_declare_cursor(&mut self) -> Result { + let _ = self.parser.expect_keyword(Keyword::DECLARE); + let cursor_name = self + .parser + .parse_object_name(false) + .context(error::SyntaxSnafu)?; + let _ = self + .parser + .expect_keywords(&[Keyword::CURSOR, Keyword::FOR]); + + let mut is_select = false; + if let Token::Word(w) = self.parser.peek_token().token { + match w.keyword { + Keyword::SELECT | Keyword::WITH => { + is_select = true; + } + _ => {} + } + }; + ensure!( + is_select, + error::InvalidSqlSnafu { + msg: "Expect select query in cursor statement".to_string(), + } + ); + + let query_stmt = self.parse_query()?; + match query_stmt { + Statement::Query(query) => Ok(Statement::DeclareCursor(DeclareCursor { + cursor_name: ParserContext::canonicalize_object_name(cursor_name), + query, + })), + _ => error::InvalidSqlSnafu { + msg: format!("Expect query, found {}", query_stmt), + } + .fail(), + } + } + + pub(crate) fn parse_fetch_cursor(&mut self) -> Result { + let _ = self.parser.expect_keyword(Keyword::FETCH); + + let fetch_size = self + .parser + .parse_literal_uint() + .context(error::SyntaxSnafu)?; + let _ = self.parser.parse_keyword(Keyword::FROM); + + let cursor_name = self + .parser + .parse_object_name(false) + .context(error::SyntaxSnafu)?; + + Ok(Statement::FetchCursor(FetchCursor { + cursor_name: ParserContext::canonicalize_object_name(cursor_name), + fetch_size, + })) + } + + pub(crate) fn parse_close_cursor(&mut self) -> Result { + let _ = self.parser.expect_keyword(Keyword::CLOSE); + let cursor_name = self + .parser + .parse_object_name(false) + .context(error::SyntaxSnafu)?; + + Ok(Statement::CloseCursor(CloseCursor { + cursor_name: ParserContext::canonicalize_object_name(cursor_name), + })) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::dialect::GreptimeDbDialect; + use crate::parser::ParseOptions; + + #[test] + fn test_parse_declare_cursor() { + let sql = "DECLARE c1 CURSOR FOR\nSELECT * FROM numbers"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::DeclareCursor(dc) = &result[0] { + assert_eq!("c1", dc.cursor_name.to_string()); + assert_eq!( + "DECLARE c1 CURSOR FOR SELECT * FROM numbers", + dc.to_string() + ); + } else { + panic!("Unexpected statement"); + } + + let sql = "DECLARE c1 CURSOR FOR\nINSERT INTO numbers VALUES (1);"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + } + + #[test] + fn test_parese_fetch_cursor() { + let sql = "FETCH 1000 FROM c1"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::FetchCursor(fc) = &result[0] { + assert_eq!("c1", fc.cursor_name.to_string()); + assert_eq!("1000", fc.fetch_size.to_string()); + assert_eq!(sql, fc.to_string()); + } else { + panic!("Unexpected statement") + } + } + + #[test] + fn test_close_fetch_cursor() { + let sql = "CLOSE c1"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::CloseCursor(cc) = 
&result[0] { + assert_eq!("c1", cc.cursor_name.to_string()); + assert_eq!(sql, cc.to_string()); + } else { + panic!("Unexpected statement") + } + } +} diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index 3e1e505a9b1b..25cc3bf7e5be 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -16,6 +16,7 @@ pub mod admin; pub mod alter; pub mod copy; pub mod create; +pub mod cursor; pub mod delete; pub mod describe; pub mod drop; @@ -224,7 +225,7 @@ pub fn sql_number_to_value(data_type: &ConcreteDataType, n: &str) -> Result(n: &str) -> Result +pub(crate) fn parse_sql_number(n: &str) -> Result where ::Err: std::fmt::Debug, { diff --git a/src/sql/src/statements/cursor.rs b/src/sql/src/statements/cursor.rs new file mode 100644 index 000000000000..72ef4cdcae98 --- /dev/null +++ b/src/sql/src/statements/cursor.rs @@ -0,0 +1,60 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Display; + +use sqlparser::ast::ObjectName; +use sqlparser_derive::{Visit, VisitMut}; + +use super::query::Query; + +/// Represents a DECLARE CURSOR statement +/// +/// This statement will carry a SQL query +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +pub struct DeclareCursor { + pub cursor_name: ObjectName, + pub query: Box, +} + +impl Display for DeclareCursor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DECLARE {} CURSOR FOR {}", self.cursor_name, self.query) + } +} + +/// Represents a FETCH FROM cursor statement +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +pub struct FetchCursor { + pub cursor_name: ObjectName, + pub fetch_size: u64, +} + +impl Display for FetchCursor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "FETCH {} FROM {}", self.fetch_size, self.cursor_name) + } +} + +/// Represents a CLOSE cursor statement +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +pub struct CloseCursor { + pub cursor_name: ObjectName, +} + +impl Display for CloseCursor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "CLOSE {}", self.cursor_name) + } +} diff --git a/src/sql/src/statements/statement.rs b/src/sql/src/statements/statement.rs index 0c4b324cd63f..8ad391a00dd2 100644 --- a/src/sql/src/statements/statement.rs +++ b/src/sql/src/statements/statement.rs @@ -24,6 +24,7 @@ use crate::statements::alter::{AlterDatabase, AlterTable}; use crate::statements::create::{ CreateDatabase, CreateExternalTable, CreateFlow, CreateTable, CreateTableLike, CreateView, }; +use crate::statements::cursor::{CloseCursor, DeclareCursor, FetchCursor}; use crate::statements::delete::Delete; use crate::statements::describe::DescribeTable; use crate::statements::drop::{DropDatabase, DropFlow, DropTable, DropView}; @@ -118,6 +119,12 @@ pub enum Statement { Use(String), // Admin statement(extension) Admin(Admin), + // DECLARE ... CURSOR FOR ... + DeclareCursor(DeclareCursor), + // FETCH ... FROM ... 
+ FetchCursor(FetchCursor), + // CLOSE + CloseCursor(CloseCursor), } impl Display for Statement { @@ -165,6 +172,9 @@ impl Display for Statement { Statement::CreateView(s) => s.fmt(f), Statement::Use(s) => s.fmt(f), Statement::Admin(admin) => admin.fmt(f), + Statement::DeclareCursor(s) => s.fmt(f), + Statement::FetchCursor(s) => s.fmt(f), + Statement::CloseCursor(s) => s.fmt(f), } } } diff --git a/tests-integration/tests/sql.rs b/tests-integration/tests/sql.rs index f15e3743256d..303a49ac9b01 100644 --- a/tests-integration/tests/sql.rs +++ b/tests-integration/tests/sql.rs @@ -72,6 +72,7 @@ macro_rules! sql_tests { test_postgres_parameter_inference, test_postgres_array_types, test_mysql_prepare_stmt_insert_timestamp, + test_declare_fetch_close_cursor, ); )* }; @@ -1198,3 +1199,66 @@ pub async fn test_postgres_array_types(store_type: StorageType) { let _ = fe_pg_server.shutdown().await; guard.remove_all().await; } + +pub async fn test_declare_fetch_close_cursor(store_type: StorageType) { + let (addr, mut guard, fe_pg_server) = setup_pg_server(store_type, "sql_inference").await; + + let (client, connection) = tokio_postgres::connect(&format!("postgres://{addr}/public"), NoTls) + .await + .unwrap(); + + let (tx, rx) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + connection.await.unwrap(); + tx.send(()).unwrap(); + }); + + client + .execute( + "DECLARE c1 CURSOR FOR SELECT * FROM numbers WHERE number > 2 LIMIT 50::bigint", + &[], + ) + .await + .expect("declare cursor"); + + // duplicated cursor + assert!(client + .execute("DECLARE c1 CURSOR FOR SELECT 1", &[],) + .await + .is_err()); + + let rows = client.query("FETCH 5 FROM c1", &[]).await.unwrap(); + assert_eq!(5, rows.len()); + + let rows = client.query("FETCH 100 FROM c1", &[]).await.unwrap(); + assert_eq!(45, rows.len()); + + let rows = client.query("FETCH 100 FROM c1", &[]).await.unwrap(); + assert_eq!(0, rows.len()); + + client.execute("CLOSE c1", &[]).await.expect("close cursor"); + + // cursor not found + let result = client.query("FETCH 100 FROM c1", &[]).await; + assert!(result.is_err()); + + client + .execute( + "DECLARE c2 CURSOR FOR SELECT * FROM numbers WHERE number < 0", + &[], + ) + .await + .expect("declare cursor"); + + let rows = client.query("FETCH 5 FROM c2", &[]).await.unwrap(); + assert_eq!(0, rows.len()); + + client.execute("CLOSE c2", &[]).await.expect("close cursor"); + + // Shutdown the client. + drop(client); + rx.await.unwrap(); + + let _ = fe_pg_server.shutdown().await; + guard.remove_all().await; +} From 19373d806d7bedbb8f701d169706d23a3d1c2d8f Mon Sep 17 00:00:00 2001 From: Lin Yihai <1161813899@qq.com> Date: Fri, 6 Dec 2024 23:02:15 +0800 Subject: [PATCH 09/36] chore: Add timeout setting for `find_ttl`. 
(#5088) --- src/mito2/src/compaction.rs | 19 ++++++++++++------- src/mito2/src/config.rs | 3 +++ src/mito2/src/error.rs | 10 ++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 31e1b0674f72..a4094af74121 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -51,6 +51,7 @@ use crate::config::MitoConfig; use crate::error::{ CompactRegionSnafu, Error, GetSchemaMetadataSnafu, RegionClosedSnafu, RegionDroppedSnafu, RegionTruncatedSnafu, RemoteCompactionSnafu, Result, TimeRangePredicateOverflowSnafu, + TimeoutSnafu, }; use crate::metrics::COMPACTION_STAGE_ELAPSED; use crate::read::projection::ProjectionMapper; @@ -445,13 +446,17 @@ async fn find_ttl( return Ok(table_ttl); } - let ttl = schema_metadata_manager - .get_schema_options_by_table_id(table_id) - .await - .context(GetSchemaMetadataSnafu)? - .and_then(|options| options.ttl) - .unwrap_or_default() - .into(); + let ttl = tokio::time::timeout( + crate::config::FETCH_OPTION_TIMEOUT, + schema_metadata_manager.get_schema_options_by_table_id(table_id), + ) + .await + .context(TimeoutSnafu)? + .context(GetSchemaMetadataSnafu)? + .and_then(|options| options.ttl) + .unwrap_or_default() + .into(); + Ok(ttl) } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 8cd2b08f2e59..797c42f8084c 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -45,6 +45,9 @@ const PAGE_CACHE_SIZE_FACTOR: u64 = 8; /// Use `1/INDEX_CREATE_MEM_THRESHOLD_FACTOR` of OS memory size as mem threshold for creating index const INDEX_CREATE_MEM_THRESHOLD_FACTOR: u64 = 16; +/// Fetch option timeout +pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(10); + /// Configuration for [MitoEngine](crate::engine::MitoEngine). #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] #[serde(default)] diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 6cb4f8abdd7a..407c8c29e258 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -30,6 +30,7 @@ use snafu::{Location, Snafu}; use store_api::logstore::provider::Provider; use store_api::manifest::ManifestVersion; use store_api::storage::RegionId; +use tokio::time::error::Elapsed; use crate::cache::file_cache::FileType; use crate::region::{RegionLeaderState, RegionRoleState}; @@ -877,6 +878,14 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Timeout"))] + Timeout { + #[snafu(source)] + error: Elapsed, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -1010,6 +1019,7 @@ impl ErrorExt for Error { DecodeStats { .. } | StatsNotPresent { .. } => StatusCode::Internal, RegionBusy { .. } => StatusCode::RegionBusy, GetSchemaMetadata { source, .. } => source.status_code(), + Timeout { .. 
} => StatusCode::Cancelled, } } From c0f498b00c263998a521d6c4f36f63aacf4f875e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 9 Dec 2024 11:12:11 +0800 Subject: [PATCH 10/36] feat: update pgwire to 0.28 (#5113) * feat: update pgwire to 0.28 * test: update tests --- Cargo.lock | 270 ++++++++++++----------- src/servers/Cargo.toml | 2 +- src/servers/src/postgres.rs | 9 +- src/servers/src/postgres/auth_handler.rs | 4 +- src/servers/src/postgres/fixtures.rs | 14 +- src/servers/src/postgres/handler.rs | 11 +- src/servers/src/postgres/types/bytea.rs | 1 - 7 files changed, 174 insertions(+), 137 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 16a234728983..920393daa030 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -637,7 +637,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -659,7 +659,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -676,7 +676,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -687,7 +687,7 @@ checksum = "20235b6899dd1cb74a9afac0abf5b4a20c0e500dd6537280f4096e1b9f14da20" dependencies = [ "async-fs", "futures-lite", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -774,7 +774,7 @@ checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -873,7 +873,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -1012,7 +1012,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.79", + "syn 2.0.90", "which", ] @@ -1031,7 +1031,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -1155,7 +1155,7 @@ dependencies = [ "proc-macro-crate 3.2.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "syn_derive", ] @@ -1694,7 +1694,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -2189,7 +2189,7 @@ dependencies = [ "quote", "snafu 0.8.5", "static_assertions", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -2927,7 +2927,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -2949,7 +2949,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core 0.20.10", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3443,7 +3443,7 @@ checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3454,7 +3454,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3517,7 +3517,7 @@ dependencies = [ "darling 0.20.10", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3547,7 +3547,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core 0.20.1", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3567,7 +3567,7 @@ checksum = 
"cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "unicode-xid", ] @@ -3579,7 +3579,7 @@ checksum = "65f152f4b8559c4da5d574bafc7af85454d706b4c5fe8b530d508cacbb6807ea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3734,7 +3734,7 @@ dependencies = [ "chrono", "rust_decimal", "serde", - "thiserror", + "thiserror 1.0.64", "time", "winnow 0.6.20", ] @@ -3800,7 +3800,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -3812,7 +3812,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -4267,7 +4267,7 @@ checksum = "e99b8b3c28ae0e84b604c75f721c21dc77afb3706076af5e8216d15fd1deaae3" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -4279,7 +4279,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -4291,7 +4291,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -4421,7 +4421,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -5028,7 +5028,7 @@ dependencies = [ "proc-macro-crate 1.3.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -5043,7 +5043,7 @@ dependencies = [ "rust-sitter", "rust-sitter-tool", "slotmap", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -5062,7 +5062,7 @@ dependencies = [ "serde", "serde_json", "slotmap", - "syn 2.0.79", + "syn 2.0.90", "webbrowser", ] @@ -5076,7 +5076,7 @@ dependencies = [ "proc-macro-crate 1.3.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -5601,7 +5601,7 @@ dependencies = [ "combine", "jni-sys", "log", - "thiserror", + "thiserror 1.0.64", "walkdir", "windows-sys 0.45.0", ] @@ -5639,7 +5639,7 @@ dependencies = [ "jsonptr", "serde", "serde_json", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -5680,7 +5680,7 @@ dependencies = [ "pest_derive", "regex", "serde_json", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -5693,7 +5693,7 @@ dependencies = [ "pest_derive", "regex", "serde_json", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -5816,7 +5816,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "thiserror", + "thiserror 1.0.64", "tokio", "tokio-util", "tower", @@ -5838,7 +5838,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -5851,7 +5851,7 @@ dependencies = [ "proc-macro2", "quote", "serde_json", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -5876,7 +5876,7 @@ dependencies = [ "pin-project", "serde", "serde_json", - "thiserror", + "thiserror 1.0.64", "tokio", "tokio-util", "tracing", @@ -5943,7 +5943,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -6327,7 +6327,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "792ba667add2798c6c3e988e630f4eb921b5cbc735044825b7111ef1582c8730" dependencies = [ "byteorder", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -6432,7 +6432,7 @@ checksum = "376101dbd964fc502d5902216e180f92b3d003b5cc3d2e40e044eb5470fca677" dependencies = [ "bytes", "serde", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -6807,7 +6807,7 @@ dependencies = [ 
"rustc_version", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.64", "triomphe", "uuid", ] @@ -6898,9 +6898,9 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "termcolor", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -6916,9 +6916,9 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "termcolor", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -6948,7 +6948,7 @@ dependencies = [ "serde", "serde_json", "socket2 0.5.7", - "thiserror", + "thiserror 1.0.64", "tokio", "tokio-rustls 0.24.1", "tokio-util", @@ -6991,7 +6991,7 @@ dependencies = [ "sha2", "smallvec", "subprocess", - "thiserror", + "thiserror 1.0.64", "time", "uuid", "zstd 0.12.4", @@ -7031,7 +7031,7 @@ dependencies = [ "sha2", "smallvec", "subprocess", - "thiserror", + "thiserror 1.0.64", "time", "uuid", "zstd 0.13.2", @@ -7090,7 +7090,7 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -7281,7 +7281,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -7539,7 +7539,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.64", "urlencoding", ] @@ -7554,7 +7554,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.64", "urlencoding", ] @@ -7572,7 +7572,7 @@ dependencies = [ "opentelemetry-semantic-conventions", "opentelemetry_sdk 0.21.2", "prost 0.11.9", - "thiserror", + "thiserror 1.0.64", "tokio", "tonic 0.9.2", ] @@ -7629,7 +7629,7 @@ dependencies = [ "ordered-float 4.3.0", "percent-encoding", "rand", - "thiserror", + "thiserror 1.0.64", "tokio", "tokio-stream", ] @@ -7652,7 +7652,7 @@ dependencies = [ "percent-encoding", "rand", "serde_json", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -8088,7 +8088,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdbef9d1d47087a895abd220ed25eb4ad973a5e26f6a4367b038c25e28dfc2d9" dependencies = [ "memchr", - "thiserror", + "thiserror 1.0.64", "ucd-trie", ] @@ -8112,7 +8112,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -8138,9 +8138,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.25.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e63bc3945a17010ff93677589c656c5e8fb4183b00bc86360de8e187d2a86cb" +checksum = "c84e671791f3a354f265e55e400be8bb4b6262c1ec04fac4289e710ccf22ab43" dependencies = [ "async-trait", "bytes", @@ -8154,7 +8154,7 @@ dependencies = [ "rand", "ring 0.17.8", "rust_decimal", - "thiserror", + "thiserror 2.0.4", "tokio", "tokio-rustls 0.26.0", "tokio-util", @@ -8224,7 +8224,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -8497,7 +8497,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -8572,7 +8572,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" dependencies = [ "proc-macro2", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -8620,9 +8620,9 @@ dependencies = [ [[package]] name = "proc-macro2" 
-version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -8664,7 +8664,7 @@ dependencies = [ "parking_lot 0.12.3", "procfs", "protobuf", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -8768,7 +8768,7 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.79", + "syn 2.0.90", "tempfile", ] @@ -8814,7 +8814,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -8827,7 +8827,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -9010,7 +9010,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -9023,7 +9023,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -9159,7 +9159,7 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.13", "socket2 0.5.7", - "thiserror", + "thiserror 1.0.64", "tokio", "tracing", ] @@ -9176,7 +9176,7 @@ dependencies = [ "rustc-hash 2.0.0", "rustls 0.23.13", "slab", - "thiserror", + "thiserror 1.0.64", "tinyvec", "tracing", ] @@ -9249,7 +9249,7 @@ dependencies = [ "serde", "serde_repr", "strum 0.25.0", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -9302,7 +9302,7 @@ checksum = "6c1bb13e2dcfa2232ac6887157aad8d9b3fe4ca57f7c8d4938ff5ea9be742300" dependencies = [ "clocksource", "parking_lot 0.12.3", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -9372,7 +9372,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -9392,7 +9392,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -9579,7 +9579,7 @@ dependencies = [ "nix 0.25.1", "regex", "tempfile", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -9723,7 +9723,7 @@ dependencies = [ "serde_json", "sha2", "stringprep", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -9743,7 +9743,7 @@ dependencies = [ "rsasl", "rustls 0.23.13", "snap", - "thiserror", + "thiserror 1.0.64", "tokio", "tokio-rustls 0.26.0", "tracing", @@ -9787,7 +9787,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.79", + "syn 2.0.90", "unicode-ident", ] @@ -9799,7 +9799,7 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -9822,7 +9822,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.79", + "syn 2.0.90", "walkdir", ] @@ -10362,7 +10362,7 @@ dependencies = [ "static_assertions", "strum 0.24.1", "strum_macros 0.24.3", - "thiserror", + "thiserror 1.0.64", "thread_local", "timsort", "uname", @@ -10561,7 +10561,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -10662,7 +10662,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -10740,7 +10740,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", 
] [[package]] @@ -10751,7 +10751,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -10785,7 +10785,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -10806,7 +10806,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -10848,7 +10848,7 @@ dependencies = [ "darling 0.20.10", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11147,7 +11147,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.64", "time", ] @@ -11240,7 +11240,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11383,7 +11383,7 @@ dependencies = [ "prettydiff", "regex", "serde_json", - "thiserror", + "thiserror 1.0.64", "toml 0.5.11", "walkdir", ] @@ -11451,7 +11451,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11461,7 +11461,7 @@ source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09 dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11525,7 +11525,7 @@ dependencies = [ "sqlformat", "sqlx-rt", "stringprep", - "thiserror", + "thiserror 1.0.64", "tokio-stream", "url", "webpki-roots 0.22.6", @@ -11753,7 +11753,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11766,7 +11766,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -11819,7 +11819,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.79", + "syn 2.0.90", "typify", "walkdir", ] @@ -11840,7 +11840,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.79", + "syn 2.0.90", "typify", "walkdir", ] @@ -11887,9 +11887,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -11924,7 +11924,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -12067,7 +12067,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror", + "thiserror 1.0.64", "time", "uuid", "winapi", @@ -12393,7 +12393,16 @@ version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.64", +] + +[[package]] +name = "thiserror" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490" +dependencies = [ + "thiserror-impl 2.0.4", ] [[package]] @@ -12404,7 +12413,18 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", +] + +[[package]] +name = 
"thiserror-impl" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] @@ -12591,7 +12611,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -12864,7 +12884,7 @@ dependencies = [ "proc-macro2", "prost-build 0.12.6", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -12981,7 +13001,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf" dependencies = [ "crossbeam-channel", - "thiserror", + "thiserror 1.0.64", "time", "tracing-subscriber", ] @@ -12994,7 +13014,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -13154,7 +13174,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "042342584c5a7a0b833d9fc4e2bdab3f9868ddc6c4b339a1e01451c6720868bc" dependencies = [ "regex", - "thiserror", + "thiserror 1.0.64", "tree-sitter", ] @@ -13185,7 +13205,7 @@ checksum = "ccb3f1376219530a37a809751ecf65aa35fd8b9c1c4ab6d4faf5f6a9eeda2c05" dependencies = [ "memchr", "regex", - "thiserror", + "thiserror 1.0.64", "tree-sitter", ] @@ -13251,7 +13271,7 @@ checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -13279,8 +13299,8 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.79", - "thiserror", + "syn 2.0.90", + "thiserror 1.0.64", "unicode-ident", ] @@ -13297,7 +13317,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.79", + "syn 2.0.90", "typify-impl", ] @@ -13621,7 +13641,7 @@ checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -13737,7 +13757,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -13771,7 +13791,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -14288,7 +14308,7 @@ dependencies = [ "geo-types", "log", "num-traits", - "thiserror", + "thiserror 1.0.64", ] [[package]] @@ -14315,7 +14335,7 @@ dependencies = [ "ring 0.17.8", "signature", "spki 0.7.3", - "thiserror", + "thiserror 1.0.64", "zeroize", ] @@ -14367,7 +14387,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] @@ -14387,7 +14407,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.90", ] [[package]] diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 6365bbc8d041..c01560724931 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -77,7 +77,7 @@ openmetrics-parser = "0.4" opensrv-mysql = { git = "https://github.com/datafuselabs/opensrv", rev = "6bbc3b65e6b19212c4f7fc4f40c20daf6f452deb" } opentelemetry-proto.workspace = true parking_lot.workspace = true -pgwire = { 
version = "0.25.0", default-features = false, features = ["server-api-ring"] } +pgwire = { version = "0.28.0", default-features = false, features = ["server-api-ring"] } pin-project = "1.0" pipeline.workspace = true postgres-types = { version = "0.2", features = ["with-chrono-0_4", "with-serde_json-1"] } diff --git a/src/servers/src/postgres.rs b/src/servers/src/postgres.rs index 5e8de2294e18..5b844042950f 100644 --- a/src/servers/src/postgres.rs +++ b/src/servers/src/postgres.rs @@ -33,7 +33,7 @@ use ::auth::UserProviderRef; use derive_builder::Builder; use pgwire::api::auth::ServerParameterProvider; use pgwire::api::copy::NoopCopyHandler; -use pgwire::api::{ClientInfo, PgWireHandlerFactory}; +use pgwire::api::{ClientInfo, PgWireServerHandlers}; pub use server::PostgresServer; use session::context::Channel; use session::Session; @@ -90,11 +90,12 @@ pub(crate) struct MakePostgresServerHandler { pub(crate) struct PostgresServerHandler(Arc); -impl PgWireHandlerFactory for PostgresServerHandler { +impl PgWireServerHandlers for PostgresServerHandler { type StartupHandler = PostgresServerHandlerInner; type SimpleQueryHandler = PostgresServerHandlerInner; type ExtendedQueryHandler = PostgresServerHandlerInner; type CopyHandler = NoopCopyHandler; + type ErrorHandler = PostgresServerHandlerInner; fn simple_query_handler(&self) -> Arc { self.0.clone() @@ -111,6 +112,10 @@ impl PgWireHandlerFactory for PostgresServerHandler { fn copy_handler(&self) -> Arc { Arc::new(NoopCopyHandler) } + + fn error_handler(&self) -> Arc { + self.0.clone() + } } impl MakePostgresServerHandler { diff --git a/src/servers/src/postgres/auth_handler.rs b/src/servers/src/postgres/auth_handler.rs index 3f3360385840..12553c44cf10 100644 --- a/src/servers/src/postgres/auth_handler.rs +++ b/src/servers/src/postgres/auth_handler.rs @@ -177,7 +177,7 @@ impl StartupHandler for PostgresServerHandlerInner { client.metadata().get(super::METADATA_USER).cloned(), )); set_client_info(client, &self.session); - auth::finish_authentication(client, self.param_provider.as_ref()).await; + auth::finish_authentication(client, self.param_provider.as_ref()).await?; } } PgWireFrontendMessage::PasswordMessageFamily(pwd) => { @@ -194,7 +194,7 @@ impl StartupHandler for PostgresServerHandlerInner { if let Ok(Some(user_info)) = auth_result { self.session.set_user_info(user_info); set_client_info(client, &self.session); - auth::finish_authentication(client, self.param_provider.as_ref()).await; + auth::finish_authentication(client, self.param_provider.as_ref()).await?; } else { return send_error( client, diff --git a/src/servers/src/postgres/fixtures.rs b/src/servers/src/postgres/fixtures.rs index 2ca3ad02eaa7..3132a38d1b5d 100644 --- a/src/servers/src/postgres/fixtures.rs +++ b/src/servers/src/postgres/fixtures.rs @@ -78,14 +78,16 @@ pub(crate) fn process<'a>(query: &str, query_ctx: QueryContextRef) -> Option(&self, _client: &C, error: &mut PgWireError) + where + C: ClientInfo, + { + debug!("Postgres interface error {}", error) + } +} diff --git a/src/servers/src/postgres/types/bytea.rs b/src/servers/src/postgres/types/bytea.rs index 975d670f9c00..78a2a20bd8da 100644 --- a/src/servers/src/postgres/types/bytea.rs +++ b/src/servers/src/postgres/types/bytea.rs @@ -27,7 +27,6 @@ impl ToSqlText for HexOutputBytea<'_> { where Self: Sized, { - out.put_slice(b"\\x"); let _ = self.0.to_sql_text(ty, out); Ok(IsNull::No) } From 903da8f4cb2908c26af518d2e8aba97b68fd6a1d Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Mon, 9 Dec 2024 11:27:46 +0800 Subject: 
[PATCH 11/36] fix: show create table doesn't quote option keys which contains dot (#5108) * fix: show create table doesn't quote option keys which contains dot * fix: compile --- src/query/src/sql/show_create_table.rs | 17 +- src/sql/src/statements/option_map.rs | 12 +- .../common/alter/alter_table_options.result | 166 +++++++++--------- 3 files changed, 108 insertions(+), 87 deletions(-) diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs index 17f1dcdd394b..ca69dfc5e69e 100644 --- a/src/query/src/sql/show_create_table.rs +++ b/src/query/src/sql/show_create_table.rs @@ -215,6 +215,7 @@ pub fn create_table_stmt( #[cfg(test)] mod tests { use std::sync::Arc; + use std::time::Duration; use common_time::timestamp::TimeUnit; use datatypes::prelude::ConcreteDataType; @@ -258,13 +259,22 @@ mod tests { let catalog_name = "greptime".to_string(); let regions = vec![0, 1, 2]; + let mut options = table::requests::TableOptions { + ttl: Some(Duration::from_secs(30).into()), + ..Default::default() + }; + + let _ = options + .extra_options + .insert("compaction.type".to_string(), "twcs".to_string()); + let meta = TableMetaBuilder::default() .schema(table_schema) .primary_key_indices(vec![0, 1]) .value_indices(vec![2, 3]) .engine("mito".to_string()) .next_column_id(0) - .options(Default::default()) + .options(options) .created_on(Default::default()) .region_numbers(regions) .build() @@ -301,7 +311,10 @@ CREATE TABLE IF NOT EXISTS "system_metrics" ( INVERTED INDEX ("host") ) ENGINE=mito -"#, +WITH( + 'compaction.type' = 'twcs', + ttl = '30s' +)"#, sql ); } diff --git a/src/sql/src/statements/option_map.rs b/src/sql/src/statements/option_map.rs index 86f186d9b15d..9ff8d94312fd 100644 --- a/src/sql/src/statements/option_map.rs +++ b/src/sql/src/statements/option_map.rs @@ -79,10 +79,18 @@ impl OptionMap { pub fn kv_pairs(&self) -> Vec { let mut result = Vec::with_capacity(self.options.len() + self.secrets.len()); for (k, v) in self.options.iter() { - result.push(format!("{k} = '{}'", v.escape_default())); + if k.contains(".") { + result.push(format!("'{k}' = '{}'", v.escape_default())); + } else { + result.push(format!("{k} = '{}'", v.escape_default())); + } } for (k, _) in self.secrets.iter() { - result.push(format!("{k} = '******'")); + if k.contains(".") { + result.push(format!("'{k}' = '******'")); + } else { + result.push(format!("{k} = '******'")); + } } result } diff --git a/tests/cases/standalone/common/alter/alter_table_options.result b/tests/cases/standalone/common/alter/alter_table_options.result index b38a99d8465e..14849a70d4ff 100644 --- a/tests/cases/standalone/common/alter/alter_table_options.result +++ b/tests/cases/standalone/common/alter/alter_table_options.result @@ -173,28 +173,28 @@ Affected Rows: 0 SHOW CREATE TABLE ato; -+-------+----------------------------------------------------+ -| Table | Create Table | -+-------+----------------------------------------------------+ -| ato | CREATE TABLE IF NOT EXISTS "ato" ( | -| | "i" INT NULL, | -| | "j" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("j"), | -| | PRIMARY KEY ("i") | -| | ) | -| | | -| | ENGINE=mito | -| | WITH( | -| | compaction.twcs.max_active_window_files = '2', | -| | compaction.twcs.max_active_window_runs = '6', | -| | compaction.twcs.max_inactive_window_files = '2', | -| | compaction.twcs.max_inactive_window_runs = '6', | -| | compaction.twcs.max_output_file_size = '500MB', | -| | compaction.twcs.time_window = '2h', | -| | compaction.type = 'twcs', | -| | ttl = '1s' | -| | ) | 
-+-------+----------------------------------------------------+ ++-------+------------------------------------------------------+ +| Table | Create Table | ++-------+------------------------------------------------------+ +| ato | CREATE TABLE IF NOT EXISTS "ato" ( | +| | "i" INT NULL, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("i") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'compaction.twcs.max_active_window_files' = '2', | +| | 'compaction.twcs.max_active_window_runs' = '6', | +| | 'compaction.twcs.max_inactive_window_files' = '2', | +| | 'compaction.twcs.max_inactive_window_runs' = '6', | +| | 'compaction.twcs.max_output_file_size' = '500MB', | +| | 'compaction.twcs.time_window' = '2h', | +| | 'compaction.type' = 'twcs', | +| | ttl = '1s' | +| | ) | ++-------+------------------------------------------------------+ ALTER TABLE ato UNSET 'compaction.twcs.time_window'; @@ -206,27 +206,27 @@ Error: 1004(InvalidArguments), Invalid unset table option request: Invalid set r SHOW CREATE TABLE ato; -+-------+----------------------------------------------------+ -| Table | Create Table | -+-------+----------------------------------------------------+ -| ato | CREATE TABLE IF NOT EXISTS "ato" ( | -| | "i" INT NULL, | -| | "j" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("j"), | -| | PRIMARY KEY ("i") | -| | ) | -| | | -| | ENGINE=mito | -| | WITH( | -| | compaction.twcs.max_active_window_files = '2', | -| | compaction.twcs.max_active_window_runs = '6', | -| | compaction.twcs.max_inactive_window_files = '2', | -| | compaction.twcs.max_inactive_window_runs = '6', | -| | compaction.twcs.max_output_file_size = '500MB', | -| | compaction.type = 'twcs', | -| | ttl = '1s' | -| | ) | -+-------+----------------------------------------------------+ ++-------+------------------------------------------------------+ +| Table | Create Table | ++-------+------------------------------------------------------+ +| ato | CREATE TABLE IF NOT EXISTS "ato" ( | +| | "i" INT NULL, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("i") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'compaction.twcs.max_active_window_files' = '2', | +| | 'compaction.twcs.max_active_window_runs' = '6', | +| | 'compaction.twcs.max_inactive_window_files' = '2', | +| | 'compaction.twcs.max_inactive_window_runs' = '6', | +| | 'compaction.twcs.max_output_file_size' = '500MB', | +| | 'compaction.type' = 'twcs', | +| | ttl = '1s' | +| | ) | ++-------+------------------------------------------------------+ ALTER TABLE ato SET 'compaction.twcs.max_inactive_window_runs'=''; @@ -234,50 +234,50 @@ Affected Rows: 0 SHOW CREATE TABLE ato; -+-------+----------------------------------------------------+ -| Table | Create Table | -+-------+----------------------------------------------------+ -| ato | CREATE TABLE IF NOT EXISTS "ato" ( | -| | "i" INT NULL, | -| | "j" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("j"), | -| | PRIMARY KEY ("i") | -| | ) | -| | | -| | ENGINE=mito | -| | WITH( | -| | compaction.twcs.max_active_window_files = '2', | -| | compaction.twcs.max_active_window_runs = '6', | -| | compaction.twcs.max_inactive_window_files = '2', | -| | compaction.twcs.max_output_file_size = '500MB', | -| | compaction.type = 'twcs', | -| | ttl = '1s' | -| | ) | -+-------+----------------------------------------------------+ ++-------+------------------------------------------------------+ +| Table | Create Table | 
++-------+------------------------------------------------------+ +| ato | CREATE TABLE IF NOT EXISTS "ato" ( | +| | "i" INT NULL, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("i") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'compaction.twcs.max_active_window_files' = '2', | +| | 'compaction.twcs.max_active_window_runs' = '6', | +| | 'compaction.twcs.max_inactive_window_files' = '2', | +| | 'compaction.twcs.max_output_file_size' = '500MB', | +| | 'compaction.type' = 'twcs', | +| | ttl = '1s' | +| | ) | ++-------+------------------------------------------------------+ -- SQLNESS ARG restart=true SHOW CREATE TABLE ato; -+-------+----------------------------------------------------+ -| Table | Create Table | -+-------+----------------------------------------------------+ -| ato | CREATE TABLE IF NOT EXISTS "ato" ( | -| | "i" INT NULL, | -| | "j" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("j"), | -| | PRIMARY KEY ("i") | -| | ) | -| | | -| | ENGINE=mito | -| | WITH( | -| | compaction.twcs.max_active_window_files = '2', | -| | compaction.twcs.max_active_window_runs = '6', | -| | compaction.twcs.max_inactive_window_files = '2', | -| | compaction.twcs.max_output_file_size = '500MB', | -| | compaction.type = 'twcs', | -| | ttl = '1s' | -| | ) | -+-------+----------------------------------------------------+ ++-------+------------------------------------------------------+ +| Table | Create Table | ++-------+------------------------------------------------------+ +| ato | CREATE TABLE IF NOT EXISTS "ato" ( | +| | "i" INT NULL, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("i") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'compaction.twcs.max_active_window_files' = '2', | +| | 'compaction.twcs.max_active_window_runs' = '6', | +| | 'compaction.twcs.max_inactive_window_files' = '2', | +| | 'compaction.twcs.max_output_file_size' = '500MB', | +| | 'compaction.type' = 'twcs', | +| | ttl = '1s' | +| | ) | ++-------+------------------------------------------------------+ DROP TABLE ato; From bac7e7bac964f5e409065d8a31e5ec580e719bd2 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Mon, 9 Dec 2024 15:19:00 +0800 Subject: [PATCH 12/36] refactor: extract implicit conversion helper functions of vector type (#5118) refactor: extract implicit conversion helper functions of vector Signed-off-by: Zhenchi --- src/common/function/src/scalars/vector.rs | 1 + .../function/src/scalars/vector/distance.rs | 132 +-------------- .../function/src/scalars/vector/impl_conv.rs | 156 ++++++++++++++++++ 3 files changed, 165 insertions(+), 124 deletions(-) create mode 100644 src/common/function/src/scalars/vector/impl_conv.rs diff --git a/src/common/function/src/scalars/vector.rs b/src/common/function/src/scalars/vector.rs index 602504ec83ba..7c8cf5550e25 100644 --- a/src/common/function/src/scalars/vector.rs +++ b/src/common/function/src/scalars/vector.rs @@ -14,6 +14,7 @@ mod convert; mod distance; +pub(crate) mod impl_conv; use std::sync::Arc; diff --git a/src/common/function/src/scalars/vector/distance.rs b/src/common/function/src/scalars/vector/distance.rs index 1905a375f3e4..f17eec5b042c 100644 --- a/src/common/function/src/scalars/vector/distance.rs +++ b/src/common/function/src/scalars/vector/distance.rs @@ -18,18 +18,17 @@ mod l2sq; use std::borrow::Cow; use std::fmt::Display; -use std::sync::Arc; use common_query::error::{InvalidFuncArgsSnafu, Result}; use common_query::prelude::Signature; use datatypes::prelude::ConcreteDataType; use 
datatypes::scalars::ScalarVectorBuilder; -use datatypes::value::ValueRef; -use datatypes::vectors::{Float32VectorBuilder, MutableVector, Vector, VectorRef}; +use datatypes::vectors::{Float32VectorBuilder, MutableVector, VectorRef}; use snafu::ensure; use crate::function::{Function, FunctionContext}; use crate::helper; +use crate::scalars::vector::impl_conv::{as_veclit, as_veclit_if_const}; macro_rules! define_distance_function { ($StructName:ident, $display_name:expr, $similarity_method:path) => { @@ -80,17 +79,17 @@ macro_rules! define_distance_function { return Ok(result.to_vector()); } - let arg0_const = parse_if_constant_string(arg0)?; - let arg1_const = parse_if_constant_string(arg1)?; + let arg0_const = as_veclit_if_const(arg0)?; + let arg1_const = as_veclit_if_const(arg1)?; for i in 0..size { let vec0 = match arg0_const.as_ref() { - Some(a) => Some(Cow::Borrowed(a.as_slice())), - None => as_vector(arg0.get_ref(i))?, + Some(a) => Some(Cow::Borrowed(a.as_ref())), + None => as_veclit(arg0.get_ref(i))?, }; let vec1 = match arg1_const.as_ref() { - Some(b) => Some(Cow::Borrowed(b.as_slice())), - None => as_vector(arg1.get_ref(i))?, + Some(b) => Some(Cow::Borrowed(b.as_ref())), + None => as_veclit(arg1.get_ref(i))?, }; if let (Some(vec0), Some(vec1)) = (vec0, vec1) { @@ -129,98 +128,6 @@ define_distance_function!(CosDistanceFunction, "vec_cos_distance", cos::cos); define_distance_function!(L2SqDistanceFunction, "vec_l2sq_distance", l2sq::l2sq); define_distance_function!(DotProductFunction, "vec_dot_product", dot::dot); -/// Parse a vector value if the value is a constant string. -fn parse_if_constant_string(arg: &Arc) -> Result>> { - if !arg.is_const() { - return Ok(None); - } - if arg.data_type() != ConcreteDataType::string_datatype() { - return Ok(None); - } - arg.get_ref(0) - .as_string() - .unwrap() // Safe: checked if it is a string - .map(parse_f32_vector_from_string) - .transpose() -} - -/// Convert a value to a vector value. -/// Supported data types are binary and string. -fn as_vector(arg: ValueRef<'_>) -> Result>> { - match arg.data_type() { - ConcreteDataType::Binary(_) => arg - .as_binary() - .unwrap() // Safe: checked if it is a binary - .map(binary_as_vector) - .transpose(), - ConcreteDataType::String(_) => arg - .as_string() - .unwrap() // Safe: checked if it is a string - .map(|s| Ok(Cow::Owned(parse_f32_vector_from_string(s)?))) - .transpose(), - ConcreteDataType::Null(_) => Ok(None), - _ => InvalidFuncArgsSnafu { - err_msg: format!("Unsupported data type: {:?}", arg.data_type()), - } - .fail(), - } -} - -/// Convert a u8 slice to a vector value. -fn binary_as_vector(bytes: &[u8]) -> Result> { - if bytes.len() % std::mem::size_of::() != 0 { - return InvalidFuncArgsSnafu { - err_msg: format!("Invalid binary length of vector: {}", bytes.len()), - } - .fail(); - } - - if cfg!(target_endian = "little") { - Ok(unsafe { - let vec = std::slice::from_raw_parts( - bytes.as_ptr() as *const f32, - bytes.len() / std::mem::size_of::(), - ); - Cow::Borrowed(vec) - }) - } else { - let v = bytes - .chunks_exact(std::mem::size_of::()) - .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) - .collect::>(); - Ok(Cow::Owned(v)) - } -} - -/// Parse a string to a vector value. -/// Valid inputs are strings like "[1.0, 2.0, 3.0]". 
-fn parse_f32_vector_from_string(s: &str) -> Result> { - let trimmed = s.trim(); - if !trimmed.starts_with('[') || !trimmed.ends_with(']') { - return InvalidFuncArgsSnafu { - err_msg: format!( - "Failed to parse {s} to Vector value: not properly enclosed in brackets" - ), - } - .fail(); - } - let content = trimmed[1..trimmed.len() - 1].trim(); - if content.is_empty() { - return Ok(Vec::new()); - } - - content - .split(',') - .map(|s| s.trim().parse::()) - .collect::>() - .map_err(|e| { - InvalidFuncArgsSnafu { - err_msg: format!("Failed to parse {s} to Vector value: {e}"), - } - .build() - }) -} - #[cfg(test)] mod tests { use std::sync::Arc; @@ -456,27 +363,4 @@ mod tests { assert!(result.is_err()); } } - - #[test] - fn test_parse_vector_from_string() { - let result = parse_f32_vector_from_string("[1.0, 2.0, 3.0]").unwrap(); - assert_eq!(result, vec![1.0, 2.0, 3.0]); - - let result = parse_f32_vector_from_string("[]").unwrap(); - assert_eq!(result, Vec::::new()); - - let result = parse_f32_vector_from_string("[1.0, a, 3.0]"); - assert!(result.is_err()); - } - - #[test] - fn test_binary_as_vector() { - let bytes = [0, 0, 128, 63]; - let result = binary_as_vector(&bytes).unwrap(); - assert_eq!(result.as_ref(), &[1.0]); - - let invalid_bytes = [0, 0, 128]; - let result = binary_as_vector(&invalid_bytes); - assert!(result.is_err()); - } } diff --git a/src/common/function/src/scalars/vector/impl_conv.rs b/src/common/function/src/scalars/vector/impl_conv.rs new file mode 100644 index 000000000000..903bfb2a0336 --- /dev/null +++ b/src/common/function/src/scalars/vector/impl_conv.rs @@ -0,0 +1,156 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::borrow::Cow; +use std::sync::Arc; + +use common_query::error::{InvalidFuncArgsSnafu, Result}; +use datatypes::prelude::ConcreteDataType; +use datatypes::value::ValueRef; +use datatypes::vectors::Vector; + +/// Convert a constant string or binary literal to a vector literal. +pub fn as_veclit_if_const(arg: &Arc) -> Result>> { + if !arg.is_const() { + return Ok(None); + } + if arg.data_type() != ConcreteDataType::string_datatype() + && arg.data_type() != ConcreteDataType::binary_datatype() + { + return Ok(None); + } + as_veclit(arg.get_ref(0)) +} + +/// Convert a string or binary literal to a vector literal. +pub fn as_veclit(arg: ValueRef<'_>) -> Result>> { + match arg.data_type() { + ConcreteDataType::Binary(_) => arg + .as_binary() + .unwrap() // Safe: checked if it is a binary + .map(binlit_as_veclit) + .transpose(), + ConcreteDataType::String(_) => arg + .as_string() + .unwrap() // Safe: checked if it is a string + .map(|s| Ok(Cow::Owned(parse_veclit_from_strlit(s)?))) + .transpose(), + ConcreteDataType::Null(_) => Ok(None), + _ => InvalidFuncArgsSnafu { + err_msg: format!("Unsupported data type: {:?}", arg.data_type()), + } + .fail(), + } +} + +/// Convert a u8 slice to a vector literal. 
+pub fn binlit_as_veclit(bytes: &[u8]) -> Result> { + if bytes.len() % std::mem::size_of::() != 0 { + return InvalidFuncArgsSnafu { + err_msg: format!("Invalid binary length of vector: {}", bytes.len()), + } + .fail(); + } + + if cfg!(target_endian = "little") { + Ok(unsafe { + let vec = std::slice::from_raw_parts( + bytes.as_ptr() as *const f32, + bytes.len() / std::mem::size_of::(), + ); + Cow::Borrowed(vec) + }) + } else { + let v = bytes + .chunks_exact(std::mem::size_of::()) + .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) + .collect::>(); + Ok(Cow::Owned(v)) + } +} + +/// Parse a string literal to a vector literal. +/// Valid inputs are strings like "[1.0, 2.0, 3.0]". +pub fn parse_veclit_from_strlit(s: &str) -> Result> { + let trimmed = s.trim(); + if !trimmed.starts_with('[') || !trimmed.ends_with(']') { + return InvalidFuncArgsSnafu { + err_msg: format!( + "Failed to parse {s} to Vector value: not properly enclosed in brackets" + ), + } + .fail(); + } + let content = trimmed[1..trimmed.len() - 1].trim(); + if content.is_empty() { + return Ok(Vec::new()); + } + + content + .split(',') + .map(|s| s.trim().parse::()) + .collect::>() + .map_err(|e| { + InvalidFuncArgsSnafu { + err_msg: format!("Failed to parse {s} to Vector value: {e}"), + } + .build() + }) +} + +#[allow(unused)] +/// Convert a vector literal to a binary literal. +pub fn veclit_to_binlit(vec: &[f32]) -> Vec { + if cfg!(target_endian = "little") { + unsafe { + std::slice::from_raw_parts(vec.as_ptr() as *const u8, std::mem::size_of_val(vec)) + .to_vec() + } + } else { + let mut bytes = Vec::with_capacity(std::mem::size_of_val(vec)); + for e in vec { + bytes.extend_from_slice(&e.to_le_bytes()); + } + bytes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_veclit_from_strlit() { + let result = parse_veclit_from_strlit("[1.0, 2.0, 3.0]").unwrap(); + assert_eq!(result, vec![1.0, 2.0, 3.0]); + + let result = parse_veclit_from_strlit("[]").unwrap(); + assert_eq!(result, Vec::::new()); + + let result = parse_veclit_from_strlit("[1.0, a, 3.0]"); + assert!(result.is_err()); + } + + #[test] + fn test_binlit_as_veclit() { + let vec = &[1.0, 2.0, 3.0]; + let bytes = veclit_to_binlit(vec); + let result = binlit_as_veclit(&bytes).unwrap(); + assert_eq!(result.as_ref(), vec); + + let invalid_bytes = [0, 0, 128]; + let result = binlit_as_veclit(&invalid_bytes); + assert!(result.is_err()); + } +} From b35221ccb63cfa67900b634b61a7e84d7a39fbf9 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Mon, 9 Dec 2024 15:22:47 +0800 Subject: [PATCH 13/36] ci: set meta replicas to 1 (#5111) --- .github/actions/setup-greptimedb-cluster/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/setup-greptimedb-cluster/action.yml b/.github/actions/setup-greptimedb-cluster/action.yml index 088a46582507..7c385c43a9a9 100644 --- a/.github/actions/setup-greptimedb-cluster/action.yml +++ b/.github/actions/setup-greptimedb-cluster/action.yml @@ -8,7 +8,7 @@ inputs: default: 2 description: "Number of Datanode replicas" meta-replicas: - default: 3 + default: 1 description: "Number of Metasrv replicas" image-registry: default: "docker.io" @@ -58,7 +58,7 @@ runs: --set image.tag=${{ inputs.image-tag }} \ --set base.podTemplate.main.resources.requests.cpu=50m \ --set base.podTemplate.main.resources.requests.memory=256Mi \ - --set base.podTemplate.main.resources.limits.cpu=1000m \ + --set base.podTemplate.main.resources.limits.cpu=2000m \ --set 
base.podTemplate.main.resources.limits.memory=2Gi \ --set frontend.replicas=${{ inputs.frontend-replicas }} \ --set datanode.replicas=${{ inputs.datanode-replicas }} \ From 1b642ea6a9e0515d59a1706a48612e1d1aca5a4d Mon Sep 17 00:00:00 2001 From: ZonaHe Date: Mon, 9 Dec 2024 18:27:35 +0800 Subject: [PATCH 14/36] feat: update dashboard to v0.7.1 (#5123) Co-authored-by: sunchanglong --- src/servers/dashboard/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 8b20e48523e5..63f2359f6421 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.7.0 +v0.7.1 From 2fcb95f50a3499379350d503eecfdf2a9b4f7d9e Mon Sep 17 00:00:00 2001 From: Yingwen Date: Mon, 9 Dec 2024 20:50:57 +0800 Subject: [PATCH 15/36] fix!: fix regression caused by unbalanced partitions and splitting ranges (#5090) * feat: assign partition ranges by rows * feat: balance partition rows * feat: get upper bound for part nums * feat: only split in non-compaction seq scan * fix: parallel scan on multiple sources * fix: can split check * feat: scanner prepare by request * feat: remove scan_parallelism * docs: update docs * chore: update comment * style: fix clippy * feat: skip merge and dedup if there is only one source * chore: Revert "feat: skip merge and dedup if there is only one source" Since memtable won't do dedup jobs This reverts commit 2fc7a54b11de1b7219e7fe2818a96b46405d3ee6. * test: avoid compaction in sqlness window sort test * chore: do not create semaphore if num partitions is enough * chore: more assertions * chore: fix typo * fix: compaction flag not set * chore: address review comments --- config/config.md | 2 - config/datanode.example.toml | 6 - config/standalone.example.toml | 6 - src/cmd/tests/load_config_test.rs | 2 - src/mito2/src/compaction.rs | 5 +- src/mito2/src/config.rs | 13 +- src/mito2/src/engine.rs | 26 +- src/mito2/src/engine/append_mode_test.rs | 7 +- src/mito2/src/engine/merge_mode_test.rs | 7 +- src/mito2/src/engine/parallel_test.rs | 5 +- src/mito2/src/read/range.rs | 260 ++++++++++++------ src/mito2/src/read/scan_region.rs | 96 ++++--- src/mito2/src/read/seq_scan.rs | 108 +++++--- src/mito2/src/read/unordered_scan.rs | 13 +- src/query/src/optimizer/parallelize_scan.rs | 171 ++++++++++-- src/store-api/src/region_engine.rs | 85 ++++-- src/table/src/table/scan.rs | 15 +- .../common/order/windowed_sort.result | 26 +- .../standalone/common/order/windowed_sort.sql | 4 +- 19 files changed, 554 insertions(+), 303 deletions(-) diff --git a/config/config.md b/config/config.md index ec00eb98b730..1f034d28731d 100644 --- a/config/config.md +++ b/config/config.md @@ -136,7 +136,6 @@ | `region_engine.mito.experimental_write_cache_size` | String | `1GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism to scan a region (default: 1/4 of cpu cores).<br/>
- `0`: using the default value (1/4 of cpu cores).
- `1`: scan in current thread.
- `n`: scan in parallelism n. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | @@ -464,7 +463,6 @@ | `region_engine.mito.experimental_write_cache_size` | String | `1GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.scan_parallelism` | Integer | `0` | Parallelism to scan a region (default: 1/4 of cpu cores).
- `0`: using the default value (1/4 of cpu cores).
- `1`: scan in current thread.
- `n`: scan in parallelism n. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index c5fdd24ebe14..11c2794e61df 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -492,12 +492,6 @@ experimental_write_cache_ttl = "8h" ## Buffer size for SST writing. sst_write_buffer_size = "8MB" -## Parallelism to scan a region (default: 1/4 of cpu cores). -## - `0`: using the default value (1/4 of cpu cores). -## - `1`: scan in current thread. -## - `n`: scan in parallelism n. -scan_parallelism = 0 - ## Capacity of the channel to send data from parallel scan tasks to the main task. parallel_scan_channel_size = 32 diff --git a/config/standalone.example.toml b/config/standalone.example.toml index deaf8900f213..a69295af1644 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -530,12 +530,6 @@ experimental_write_cache_ttl = "8h" ## Buffer size for SST writing. sst_write_buffer_size = "8MB" -## Parallelism to scan a region (default: 1/4 of cpu cores). -## - `0`: using the default value (1/4 of cpu cores). -## - `1`: scan in current thread. -## - `n`: scan in parallelism n. -scan_parallelism = 0 - ## Capacity of the channel to send data from parallel scan tasks to the main task. parallel_scan_channel_size = 32 diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index 454188141d14..c5f1111d37b6 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -69,7 +69,6 @@ fn test_load_datanode_example_config() { region_engine: vec![ RegionEngineConfig::Mito(MitoConfig { auto_flush_interval: Duration::from_secs(3600), - scan_parallelism: 0, experimental_write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), ..Default::default() }), @@ -205,7 +204,6 @@ fn test_load_standalone_example_config() { RegionEngineConfig::Mito(MitoConfig { auto_flush_interval: Duration::from_secs(3600), experimental_write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), - scan_parallelism: 0, ..Default::default() }), RegionEngineConfig::File(EngineConfig {}), diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index a4094af74121..5f462f33a111 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -597,9 +597,8 @@ impl<'a> CompactionSstReaderBuilder<'a> { scan_input.with_predicate(time_range_to_predicate(time_range, &self.metadata)?); } - SeqScan::new(scan_input) - .with_compaction() - .build_reader() + SeqScan::new(scan_input, true) + .build_reader_for_compaction() .await } } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 797c42f8084c..cb4022f65e57 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -30,7 +30,7 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5); /// Default channel size for parallel scan task. -const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32; +pub(crate) const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32; // Use `1/GLOBAL_WRITE_BUFFER_SIZE_FACTOR` of OS memory as global write buffer size in default mode const GLOBAL_WRITE_BUFFER_SIZE_FACTOR: u64 = 8; @@ -107,11 +107,6 @@ pub struct MitoConfig { // Other configs: /// Buffer size for SST writing. pub sst_write_buffer_size: ReadableSize, - /// Parallelism to scan a region (default: 1/4 of cpu cores). - /// - 0: using the default value (1/4 of cpu cores). - /// - 1: scan in current thread. - /// - n: scan in parallelism n. 
- pub scan_parallelism: usize, /// Capacity of the channel to send data from parallel scan tasks to the main task (default 32). pub parallel_scan_channel_size: usize, /// Whether to allow stale entries read during replay. @@ -156,7 +151,6 @@ impl Default for MitoConfig { experimental_write_cache_size: ReadableSize::gb(1), experimental_write_cache_ttl: None, sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, - scan_parallelism: divide_num_cpus(4), parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, allow_stale_entries: false, index: IndexConfig::default(), @@ -229,11 +223,6 @@ impl MitoConfig { ); } - // Use default value if `scan_parallelism` is 0. - if self.scan_parallelism == 0 { - self.scan_parallelism = divide_num_cpus(4); - } - if self.parallel_scan_channel_size < 1 { self.parallel_scan_channel_size = DEFAULT_SCAN_CHANNEL_SIZE; warn!( diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index c60b7c4107ed..a518da32535d 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -90,7 +90,7 @@ use crate::error::{ }; use crate::manifest::action::RegionEdit; use crate::metrics::HANDLE_REQUEST_ELAPSED; -use crate::read::scan_region::{ScanParallelism, ScanRegion, Scanner}; +use crate::read::scan_region::{ScanRegion, Scanner}; use crate::request::{RegionEditRequest, WorkerRequest}; use crate::wal::entry_distributor::{ build_wal_entry_distributor_and_receivers, DEFAULT_ENTRY_RECEIVER_BUFFER_SIZE, @@ -171,19 +171,9 @@ impl MitoEngine { self.scan_region(region_id, request)?.scanner() } - /// Returns a region scanner to scan the region for `request`. - fn region_scanner( - &self, - region_id: RegionId, - request: ScanRequest, - ) -> Result { - let scanner = self.scanner(region_id, request)?; - scanner.region_scanner() - } - /// Scans a region. fn scan_region(&self, region_id: RegionId, request: ScanRequest) -> Result { - self.inner.handle_query(region_id, request) + self.inner.scan_region(region_id, request) } /// Edit region's metadata by [RegionEdit] directly. Use with care. @@ -423,7 +413,7 @@ impl EngineInner { } /// Handles the scan `request` and returns a [ScanRegion]. - fn handle_query(&self, region_id: RegionId, request: ScanRequest) -> Result { + fn scan_region(&self, region_id: RegionId, request: ScanRequest) -> Result { let query_start = Instant::now(); // Reading a region doesn't need to go through the region worker thread. let region = self @@ -433,14 +423,10 @@ impl EngineInner { let version = region.version(); // Get cache. let cache_manager = self.workers.cache_manager(); - let scan_parallelism = ScanParallelism { - parallelism: self.config.scan_parallelism, - channel_size: self.config.parallel_scan_channel_size, - }; let scan_region = ScanRegion::new(version, region.access_layer.clone(), request, cache_manager) - .with_parallelism(scan_parallelism) + .with_parallel_scan_channel_size(self.config.parallel_scan_channel_size) .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) .with_start_time(query_start); @@ -538,7 +524,9 @@ impl RegionEngine for MitoEngine { region_id: RegionId, request: ScanRequest, ) -> Result { - self.region_scanner(region_id, request) + self.scan_region(region_id, request) + .map_err(BoxedError::new)? 
+ .region_scanner() .map_err(BoxedError::new) } diff --git a/src/mito2/src/engine/append_mode_test.rs b/src/mito2/src/engine/append_mode_test.rs index ab8515aa133c..c9f61c5db3e0 100644 --- a/src/mito2/src/engine/append_mode_test.rs +++ b/src/mito2/src/engine/append_mode_test.rs @@ -92,7 +92,6 @@ async fn test_append_mode_compaction() { let mut env = TestEnv::new(); let engine = env .create_engine(MitoConfig { - scan_parallelism: 2, ..Default::default() }) .await; @@ -176,19 +175,19 @@ async fn test_append_mode_compaction() { | b | 1.0 | 1970-01-01T00:00:01 | +-------+---------+---------------------+"; // Scans in parallel. - let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap(); + let mut scanner = engine.scanner(region_id, ScanRequest::default()).unwrap(); assert_eq!(2, scanner.num_files()); assert_eq!(1, scanner.num_memtables()); + scanner.set_target_partitions(2); let stream = scanner.scan().await.unwrap(); let batches = RecordBatches::try_collect(stream).await.unwrap(); assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); - // Reopens engine with parallelism 1. + // Reopens engine. let engine = env .reopen_engine( engine, MitoConfig { - scan_parallelism: 1, ..Default::default() }, ) diff --git a/src/mito2/src/engine/merge_mode_test.rs b/src/mito2/src/engine/merge_mode_test.rs index 08f4d0565007..e74aba5655a3 100644 --- a/src/mito2/src/engine/merge_mode_test.rs +++ b/src/mito2/src/engine/merge_mode_test.rs @@ -92,7 +92,6 @@ async fn test_merge_mode_compaction() { let mut env = TestEnv::new(); let engine = env .create_engine(MitoConfig { - scan_parallelism: 2, ..Default::default() }) .await; @@ -190,19 +189,19 @@ async fn test_merge_mode_compaction() { | a | | 13.0 | 1970-01-01T00:00:03 | +-------+---------+---------+---------------------+"; // Scans in parallel. - let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap(); + let mut scanner = engine.scanner(region_id, ScanRequest::default()).unwrap(); assert_eq!(1, scanner.num_files()); assert_eq!(1, scanner.num_memtables()); + scanner.set_target_partitions(2); let stream = scanner.scan().await.unwrap(); let batches = RecordBatches::try_collect(stream).await.unwrap(); assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); - // Reopens engine with parallelism 1. + // Reopens engine. 
let engine = env .reopen_engine( engine, MitoConfig { - scan_parallelism: 1, ..Default::default() }, ) diff --git a/src/mito2/src/engine/parallel_test.rs b/src/mito2/src/engine/parallel_test.rs index 53cc0dca8fb0..3d5dab3540e1 100644 --- a/src/mito2/src/engine/parallel_test.rs +++ b/src/mito2/src/engine/parallel_test.rs @@ -37,7 +37,6 @@ async fn scan_in_parallel( ) { let engine = env .open_engine(MitoConfig { - scan_parallelism: parallelism, parallel_scan_channel_size: channel_size, ..Default::default() }) @@ -57,7 +56,9 @@ async fn scan_in_parallel( .unwrap(); let request = ScanRequest::default(); - let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let mut scanner = engine.scanner(region_id, request).unwrap(); + scanner.set_target_partitions(parallelism); + let stream = scanner.scan().await.unwrap(); let batches = RecordBatches::try_collect(stream).await.unwrap(); let expected = "\ +-------+---------+---------------------+ diff --git a/src/mito2/src/read/range.rs b/src/mito2/src/read/range.rs index 1944d171dd19..554751830ffc 100644 --- a/src/mito2/src/read/range.rs +++ b/src/mito2/src/read/range.rs @@ -34,6 +34,16 @@ use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE; const ALL_ROW_GROUPS: i64 = -1; +/// Index and metadata for a memtable or file. +#[derive(Debug, Clone, Copy, PartialEq)] +pub(crate) struct SourceIndex { + /// Index of the memtable and file. + pub(crate) index: usize, + /// Total number of row groups in this source. 0 if the metadata + /// is unavailable. We use this to split files. + pub(crate) num_row_groups: u64, +} + /// Index to access a row group. #[derive(Debug, Clone, Copy, PartialEq)] pub(crate) struct RowGroupIndex { @@ -52,7 +62,7 @@ pub(crate) struct RangeMeta { /// The time range of the range. pub(crate) time_range: FileTimeRange, /// Indices to memtables or files. - indices: SmallVec<[usize; 2]>, + pub(crate) indices: SmallVec<[SourceIndex; 2]>, /// Indices to memtable/file row groups that this range scans. pub(crate) row_group_indices: SmallVec<[RowGroupIndex; 2]>, /// Estimated number of rows in the range. This can be 0 if the statistics are not available. @@ -81,12 +91,17 @@ impl RangeMeta { } /// Creates a list of ranges from the `input` for seq scan. - pub(crate) fn seq_scan_ranges(input: &ScanInput) -> Vec { + /// If `compaction` is true, it doesn't split the ranges. + pub(crate) fn seq_scan_ranges(input: &ScanInput, compaction: bool) -> Vec { let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len()); Self::push_seq_mem_ranges(&input.memtables, &mut ranges); Self::push_seq_file_ranges(input.memtables.len(), &input.files, &mut ranges); let ranges = group_ranges_for_seq_scan(ranges); + if compaction { + // We don't split ranges in compaction. + return ranges; + } maybe_split_ranges_for_seq_scan(ranges) } @@ -105,13 +120,13 @@ impl RangeMeta { } /// Returns true if the time range of given `meta` overlaps with the time range of this meta. - pub(crate) fn overlaps(&self, meta: &RangeMeta) -> bool { + fn overlaps(&self, meta: &RangeMeta) -> bool { overlaps(&self.time_range, &meta.time_range) } /// Merges given `meta` to this meta. /// It assumes that the time ranges overlap and they don't have the same file or memtable index. 
- pub(crate) fn merge(&mut self, mut other: RangeMeta) { + fn merge(&mut self, mut other: RangeMeta) { debug_assert!(self.overlaps(&other)); debug_assert!(self.indices.iter().all(|idx| !other.indices.contains(idx))); debug_assert!(self @@ -130,22 +145,28 @@ impl RangeMeta { /// Returns true if we can split the range into multiple smaller ranges and /// still preserve the order for [SeqScan]. - pub(crate) fn can_split_preserve_order(&self) -> bool { - // Only one source and multiple row groups. - self.indices.len() == 1 && self.row_group_indices.len() > 1 + fn can_split_preserve_order(&self) -> bool { + self.indices.len() == 1 && self.indices[0].num_row_groups > 1 } /// Splits the range if it can preserve the order. - pub(crate) fn maybe_split(self, output: &mut Vec) { + fn maybe_split(self, output: &mut Vec) { if self.can_split_preserve_order() { + let num_row_groups = self.indices[0].num_row_groups; + debug_assert_eq!(1, self.row_group_indices.len()); + debug_assert_eq!(ALL_ROW_GROUPS, self.row_group_indices[0].row_group_index); + output.reserve(self.row_group_indices.len()); - let num_rows = self.num_rows / self.row_group_indices.len(); + let num_rows = self.num_rows / num_row_groups as usize; // Splits by row group. - for index in self.row_group_indices { + for row_group_index in 0..num_row_groups { output.push(RangeMeta { time_range: self.time_range, indices: self.indices.clone(), - row_group_indices: smallvec![index], + row_group_indices: smallvec![RowGroupIndex { + index: self.indices[0].index, + row_group_index: row_group_index as i64, + }], num_rows, }); } @@ -165,7 +186,10 @@ impl RangeMeta { let num_rows = stats.num_rows() / stats.num_ranges(); ranges.push(RangeMeta { time_range, - indices: smallvec![memtable_index], + indices: smallvec![SourceIndex { + index: memtable_index, + num_row_groups: stats.num_ranges() as u64, + }], row_group_indices: smallvec![RowGroupIndex { index: memtable_index, row_group_index: row_group_index as i64, @@ -199,7 +223,10 @@ impl RangeMeta { let num_rows = parquet_meta.row_group(row_group_index as usize).num_rows(); ranges.push(RangeMeta { time_range: time_range.unwrap_or_else(|| file.time_range()), - indices: smallvec![file_index], + indices: smallvec![SourceIndex { + index: file_index, + num_row_groups: file.meta_ref().num_row_groups, + }], row_group_indices: smallvec![RowGroupIndex { index: file_index, row_group_index: row_group_index as i64, @@ -212,7 +239,10 @@ impl RangeMeta { for row_group_index in 0..file.meta_ref().num_row_groups { ranges.push(RangeMeta { time_range: file.time_range(), - indices: smallvec![file_index], + indices: smallvec![SourceIndex { + index: file_index, + num_row_groups: file.meta_ref().num_row_groups, + }], row_group_indices: smallvec![RowGroupIndex { index: file_index, row_group_index: row_group_index as i64, @@ -224,7 +254,10 @@ impl RangeMeta { // If we don't known the number of row groups in advance, scan all row groups. 
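For illustration, the splitting rule introduced above (`can_split_preserve_order` plus `maybe_split`) boils down to: a range that reads exactly one source with more than one row group is expanded into one range per row group, with the estimated row count divided evenly. The following is a minimal, self-contained sketch of that rule using simplified stand-in types; the real `RangeMeta` also tracks time ranges and uses `SmallVec`, which are omitted here.

```rust
/// Simplified stand-ins for the range metadata above (illustrative only).
#[derive(Debug, Clone, Copy, PartialEq)]
struct SourceIndex {
    index: usize,
    num_row_groups: u64,
}

#[derive(Debug, Clone, PartialEq)]
struct Range {
    sources: Vec<SourceIndex>,
    /// One row group index per scanned unit; -1 stands for "all row groups".
    row_groups: Vec<i64>,
    num_rows: usize,
}

const ALL_ROW_GROUPS: i64 = -1;

impl Range {
    /// Order is preserved only if the range reads a single source
    /// that contains more than one row group.
    fn can_split_preserve_order(&self) -> bool {
        self.sources.len() == 1 && self.sources[0].num_row_groups > 1
    }

    /// Splits the range into one range per row group, dividing the row count evenly.
    fn maybe_split(self, output: &mut Vec<Range>) {
        if self.can_split_preserve_order() {
            let num_row_groups = self.sources[0].num_row_groups;
            let num_rows = self.num_rows / num_row_groups as usize;
            for row_group in 0..num_row_groups {
                output.push(Range {
                    sources: self.sources.clone(),
                    row_groups: vec![row_group as i64],
                    num_rows,
                });
            }
        } else {
            output.push(self);
        }
    }
}

fn main() {
    let range = Range {
        sources: vec![SourceIndex { index: 1, num_row_groups: 2 }],
        row_groups: vec![ALL_ROW_GROUPS],
        num_rows: 5,
    };
    let mut output = Vec::new();
    range.maybe_split(&mut output);
    // One placeholder range becomes two per-row-group ranges with rows split evenly.
    assert_eq!(output.len(), 2);
    assert_eq!(output[0].row_groups, vec![0i64]);
    assert_eq!(output[1].row_groups, vec![1i64]);
    assert!(output.iter().all(|r| r.sources[0].index == 1 && r.num_rows == 2));
    println!("{output:?}");
}
```

This mirrors the case exercised by `test_split_range` below: one source with two row groups and an `ALL_ROW_GROUPS` placeholder becomes two per-row-group ranges.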
ranges.push(RangeMeta { time_range: file.time_range(), - indices: smallvec![file_index], + indices: smallvec![SourceIndex { + index: file_index, + num_row_groups: 0, + }], row_group_indices: smallvec![RowGroupIndex { index: file_index, row_group_index: ALL_ROW_GROUPS, @@ -245,7 +278,10 @@ impl RangeMeta { }; ranges.push(RangeMeta { time_range, - indices: smallvec![i], + indices: smallvec![SourceIndex { + index: i, + num_row_groups: stats.num_ranges() as u64, + }], row_group_indices: smallvec![RowGroupIndex { index: i, row_group_index: ALL_ROW_GROUPS, @@ -263,31 +299,18 @@ impl RangeMeta { // For non append-only mode, each range only contains one file. for (i, file) in files.iter().enumerate() { let file_index = num_memtables + i; - if file.meta_ref().num_row_groups > 0 { - // All row groups share the same time range. - let row_group_indices = (0..file.meta_ref().num_row_groups) - .map(|row_group_index| RowGroupIndex { - index: file_index, - row_group_index: row_group_index as i64, - }) - .collect(); - ranges.push(RangeMeta { - time_range: file.time_range(), - indices: smallvec![file_index], - row_group_indices, - num_rows: file.meta_ref().num_rows as usize, - }); - } else { - ranges.push(RangeMeta { - time_range: file.time_range(), - indices: smallvec![file_index], - row_group_indices: smallvec![RowGroupIndex { - index: file_index, - row_group_index: ALL_ROW_GROUPS, - }], - num_rows: file.meta_ref().num_rows as usize, - }); - } + ranges.push(RangeMeta { + time_range: file.time_range(), + indices: smallvec![SourceIndex { + index: file_index, + num_row_groups: file.meta_ref().num_row_groups, + }], + row_group_indices: smallvec![RowGroupIndex { + index: file_index, + row_group_index: ALL_ROW_GROUPS, + }], + num_rows: file.meta_ref().num_rows as usize, + }); } } } @@ -514,7 +537,10 @@ mod tests { ); RangeMeta { time_range, - indices: smallvec![*idx], + indices: smallvec![SourceIndex { + index: *idx, + num_row_groups: 0, + }], row_group_indices: smallvec![RowGroupIndex { index: *idx, row_group_index: 0 @@ -527,7 +553,7 @@ mod tests { let actual: Vec<_> = output .iter() .map(|range| { - let indices = range.indices.to_vec(); + let indices = range.indices.iter().map(|index| index.index).collect(); let group_indices: Vec<_> = range .row_group_indices .iter() @@ -578,7 +604,10 @@ mod tests { fn test_merge_range() { let mut left = RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], row_group_indices: smallvec![ RowGroupIndex { index: 1, @@ -593,7 +622,10 @@ mod tests { }; let right = RangeMeta { time_range: (Timestamp::new_second(800), Timestamp::new_second(1200)), - indices: smallvec![2], + indices: smallvec![SourceIndex { + index: 2, + num_row_groups: 2, + }], row_group_indices: smallvec![ RowGroupIndex { index: 2, @@ -612,7 +644,16 @@ mod tests { left, RangeMeta { time_range: (Timestamp::new_second(800), Timestamp::new_second(2000)), - indices: smallvec![1, 2], + indices: smallvec![ + SourceIndex { + index: 1, + num_row_groups: 2 + }, + SourceIndex { + index: 2, + num_row_groups: 2 + } + ], row_group_indices: smallvec![ RowGroupIndex { index: 1, @@ -640,17 +681,14 @@ mod tests { fn test_split_range() { let range = RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], - row_group_indices: smallvec![ - RowGroupIndex { - index: 1, - row_group_index: 1 - }, - RowGroupIndex { - index: 1, - row_group_index: 2 - } - ], + 
indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 1, + row_group_index: ALL_ROW_GROUPS, + }], num_rows: 5, }; @@ -663,19 +701,25 @@ mod tests { &[ RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], row_group_indices: smallvec![RowGroupIndex { index: 1, - row_group_index: 1 + row_group_index: 0 },], num_rows: 2, }, RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], row_group_indices: smallvec![RowGroupIndex { index: 1, - row_group_index: 2 + row_group_index: 1 }], num_rows: 2, } @@ -687,7 +731,16 @@ mod tests { fn test_not_split_range() { let range = RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1, 2], + indices: smallvec![ + SourceIndex { + index: 1, + num_row_groups: 1, + }, + SourceIndex { + index: 2, + num_row_groups: 1, + } + ], row_group_indices: smallvec![ RowGroupIndex { index: 1, @@ -710,32 +763,50 @@ mod tests { #[test] fn test_maybe_split_ranges() { let ranges = vec![ + RangeMeta { + time_range: (Timestamp::new_second(0), Timestamp::new_second(500)), + indices: smallvec![SourceIndex { + index: 0, + num_row_groups: 1, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + },], + num_rows: 4, + }, RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], - row_group_indices: smallvec![ - RowGroupIndex { - index: 1, - row_group_index: 0 - }, - RowGroupIndex { - index: 1, - row_group_index: 1 - } - ], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 1, + row_group_index: ALL_ROW_GROUPS, + },], num_rows: 4, }, RangeMeta { time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)), - indices: smallvec![2, 3], + indices: smallvec![ + SourceIndex { + index: 2, + num_row_groups: 2, + }, + SourceIndex { + index: 3, + num_row_groups: 0, + } + ], row_group_indices: smallvec![ RowGroupIndex { index: 2, - row_group_index: 0 + row_group_index: ALL_ROW_GROUPS, }, RowGroupIndex { index: 3, - row_group_index: 0 + row_group_index: ALL_ROW_GROUPS, } ], num_rows: 5, @@ -745,9 +816,24 @@ mod tests { assert_eq!( output, vec![ + RangeMeta { + time_range: (Timestamp::new_second(0), Timestamp::new_second(500)), + indices: smallvec![SourceIndex { + index: 0, + num_row_groups: 1, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 0, + row_group_index: 0 + },], + num_rows: 4, + }, RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], row_group_indices: smallvec![RowGroupIndex { index: 1, row_group_index: 0 @@ -756,7 +842,10 @@ mod tests { }, RangeMeta { time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)), - indices: smallvec![1], + indices: smallvec![SourceIndex { + index: 1, + num_row_groups: 2, + }], row_group_indices: smallvec![RowGroupIndex { index: 1, row_group_index: 1 @@ -765,15 +854,24 @@ mod tests { }, RangeMeta { time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)), - indices: smallvec![2, 3], + indices: smallvec![ + SourceIndex { + index: 2, + 
num_row_groups: 2 + }, + SourceIndex { + index: 3, + num_row_groups: 0, + } + ], row_group_indices: smallvec![ RowGroupIndex { index: 2, - row_group_index: 0 + row_group_index: ALL_ROW_GROUPS, }, RowGroupIndex { index: 3, - row_group_index: 0 + row_group_index: ALL_ROW_GROUPS, } ], num_rows: 5, diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 7da80806f22e..471cc1a8e5d4 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -33,6 +33,7 @@ use tokio_stream::wrappers::ReceiverStream; use crate::access_layer::AccessLayerRef; use crate::cache::file_cache::FileCacheRef; use crate::cache::CacheManagerRef; +use crate::config::DEFAULT_SCAN_CHANNEL_SIZE; use crate::error::Result; use crate::memtable::MemtableRef; use crate::metrics::READ_SST_COUNT; @@ -68,15 +69,6 @@ impl Scanner { Scanner::Unordered(unordered_scan) => unordered_scan.build_stream().await, } } - - /// Returns a [RegionScanner] to scan the region. - #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] - pub(crate) fn region_scanner(self) -> Result { - match self { - Scanner::Seq(seq_scan) => Ok(Box::new(seq_scan)), - Scanner::Unordered(unordered_scan) => Ok(Box::new(unordered_scan)), - } - } } #[cfg(test)] @@ -104,6 +96,17 @@ impl Scanner { Scanner::Unordered(unordered_scan) => unordered_scan.input().file_ids(), } } + + /// Sets the target partitions for the scanner. It can controls the parallelism of the scanner. + pub(crate) fn set_target_partitions(&mut self, target_partitions: usize) { + use store_api::region_engine::{PrepareRequest, RegionScanner}; + + let request = PrepareRequest::default().with_target_partitions(target_partitions); + match self { + Scanner::Seq(seq_scan) => seq_scan.prepare(request).unwrap(), + Scanner::Unordered(unordered_scan) => unordered_scan.prepare(request).unwrap(), + } + } } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -165,8 +168,8 @@ pub(crate) struct ScanRegion { request: ScanRequest, /// Cache. cache_manager: CacheManagerRef, - /// Parallelism to scan. - parallelism: ScanParallelism, + /// Capacity of the channel to send data from parallel scan tasks to the main task. + parallel_scan_channel_size: usize, /// Whether to ignore inverted index. ignore_inverted_index: bool, /// Whether to ignore fulltext index. @@ -188,17 +191,20 @@ impl ScanRegion { access_layer, request, cache_manager, - parallelism: ScanParallelism::default(), + parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, ignore_inverted_index: false, ignore_fulltext_index: false, start_time: None, } } - /// Sets parallelism. + /// Sets parallel scan task channel size. #[must_use] - pub(crate) fn with_parallelism(mut self, parallelism: ScanParallelism) -> Self { - self.parallelism = parallelism; + pub(crate) fn with_parallel_scan_channel_size( + mut self, + parallel_scan_channel_size: usize, + ) -> Self { + self.parallel_scan_channel_size = parallel_scan_channel_size; self } @@ -224,7 +230,7 @@ impl ScanRegion { /// Returns a [Scanner] to scan the region. pub(crate) fn scanner(self) -> Result { - if self.version.options.append_mode && self.request.series_row_selector.is_none() { + if self.use_unordered_scan() { // If table is append only and there is no series row selector, we use unordered scan in query. // We still use seq scan in compaction. self.unordered_scan().map(Scanner::Unordered) @@ -233,10 +239,20 @@ impl ScanRegion { } } + /// Returns a [RegionScanner] to scan the region. 
+ #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] + pub(crate) fn region_scanner(self) -> Result { + if self.use_unordered_scan() { + self.unordered_scan().map(|scanner| Box::new(scanner) as _) + } else { + self.seq_scan().map(|scanner| Box::new(scanner) as _) + } + } + /// Scan sequentially. pub(crate) fn seq_scan(self) -> Result { let input = self.scan_input(true)?; - Ok(SeqScan::new(input)) + Ok(SeqScan::new(input, false)) } /// Unordered scan. @@ -248,7 +264,14 @@ impl ScanRegion { #[cfg(test)] pub(crate) fn scan_without_filter_deleted(self) -> Result { let input = self.scan_input(false)?; - Ok(SeqScan::new(input)) + Ok(SeqScan::new(input, false)) + } + + /// Returns true if the region can use unordered scan for current request. + fn use_unordered_scan(&self) -> bool { + // If table is append only and there is no series row selector, we use unordered scan in query. + // We still use seq scan in compaction. + self.version.options.append_mode && self.request.series_row_selector.is_none() } /// Creates a scan input. @@ -314,7 +337,7 @@ impl ScanRegion { .with_cache(self.cache_manager) .with_inverted_index_applier(inverted_index_applier) .with_fulltext_index_applier(fulltext_index_applier) - .with_parallelism(self.parallelism) + .with_parallel_scan_channel_size(self.parallel_scan_channel_size) .with_start_time(self.start_time) .with_append_mode(self.version.options.append_mode) .with_filter_deleted(filter_deleted) @@ -428,15 +451,6 @@ impl ScanRegion { } } -/// Config for parallel scan. -#[derive(Debug, Clone, Copy, Default)] -pub(crate) struct ScanParallelism { - /// Number of tasks expect to spawn to read data. - pub(crate) parallelism: usize, - /// Channel size to send batches. Only takes effect when the parallelism > 1. - pub(crate) channel_size: usize, -} - /// Returns true if the time range of a SST `file` matches the `predicate`. fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool { if predicate == &TimestampRange::min_to_max() { @@ -466,8 +480,8 @@ pub(crate) struct ScanInput { pub(crate) cache_manager: CacheManagerRef, /// Ignores file not found error. ignore_file_not_found: bool, - /// Parallelism to scan data. - pub(crate) parallelism: ScanParallelism, + /// Capacity of the channel to send data from parallel scan tasks to the main task. + pub(crate) parallel_scan_channel_size: usize, /// Index appliers. inverted_index_applier: Option, fulltext_index_applier: Option, @@ -496,7 +510,7 @@ impl ScanInput { files: Vec::new(), cache_manager: CacheManagerRef::default(), ignore_file_not_found: false, - parallelism: ScanParallelism::default(), + parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, inverted_index_applier: None, fulltext_index_applier: None, query_start: None, @@ -549,10 +563,13 @@ impl ScanInput { self } - /// Sets scan parallelism. + /// Sets scan task channel size. #[must_use] - pub(crate) fn with_parallelism(mut self, parallelism: ScanParallelism) -> Self { - self.parallelism = parallelism; + pub(crate) fn with_parallel_scan_channel_size( + mut self, + parallel_scan_channel_size: usize, + ) -> Self { + self.parallel_scan_channel_size = parallel_scan_channel_size; self } @@ -621,12 +638,15 @@ impl ScanInput { sources: Vec, semaphore: Arc, ) -> Result> { - debug_assert!(self.parallelism.parallelism > 1); + if sources.len() <= 1 { + return Ok(sources); + } + // Spawn a task for each source. 
let sources = sources .into_iter() .map(|source| { - let (sender, receiver) = mpsc::channel(self.parallelism.channel_size); + let (sender, receiver) = mpsc::channel(self.parallel_scan_channel_size); self.spawn_scan_task(source, semaphore.clone(), sender); let stream = Box::pin(ReceiverStream::new(receiver)); Source::Stream(stream) @@ -761,9 +781,9 @@ pub(crate) struct StreamContext { impl StreamContext { /// Creates a new [StreamContext] for [SeqScan]. - pub(crate) fn seq_scan_ctx(input: ScanInput) -> Self { + pub(crate) fn seq_scan_ctx(input: ScanInput, compaction: bool) -> Self { let query_start = input.query_start.unwrap_or_else(Instant::now); - let ranges = RangeMeta::seq_scan_ranges(&input); + let ranges = RangeMeta::seq_scan_ranges(&input, compaction); READ_SST_COUNT.observe(input.num_files() as f64); Self { diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index 9498078ddbc4..d8732cb93df2 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -28,7 +28,7 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; use datatypes::schema::SchemaRef; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; -use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties}; +use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScanner, ScannerProperties}; use store_api::storage::TimeSeriesRowSelector; use tokio::sync::Semaphore; @@ -51,39 +51,27 @@ pub struct SeqScan { properties: ScannerProperties, /// Context of streams. stream_ctx: Arc, - /// Semaphore to control scan parallelism of files. - /// Streams created by the scanner share the same semaphore. - semaphore: Arc, /// The scanner is used for compaction. compaction: bool, } impl SeqScan { - /// Creates a new [SeqScan]. - pub(crate) fn new(input: ScanInput) -> Self { - // TODO(yingwen): Set permits according to partition num. But we need to support file - // level parallelism. - let parallelism = input.parallelism.parallelism.max(1); + /// Creates a new [SeqScan] with the given input and compaction flag. + /// If `compaction` is true, the scanner will not attempt to split ranges. + pub(crate) fn new(input: ScanInput, compaction: bool) -> Self { let mut properties = ScannerProperties::default() .with_append_mode(input.append_mode) .with_total_rows(input.total_rows()); - let stream_ctx = Arc::new(StreamContext::seq_scan_ctx(input)); + let stream_ctx = Arc::new(StreamContext::seq_scan_ctx(input, compaction)); properties.partitions = vec![stream_ctx.partition_ranges()]; Self { properties, stream_ctx, - semaphore: Arc::new(Semaphore::new(parallelism)), - compaction: false, + compaction, } } - /// Sets the scanner to be used for compaction. - pub(crate) fn with_compaction(mut self) -> Self { - self.compaction = true; - self - } - /// Builds a stream for the query. /// /// The returned stream is not partitioned and will contains all the data. If want @@ -98,7 +86,12 @@ impl SeqScan { } /// Builds a [BoxedBatchReader] from sequential scan for compaction. - pub async fn build_reader(&self) -> Result { + /// + /// # Panics + /// Panics if the compaction flag is not set. 
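The `create_parallel_sources` change above spawns one task per source only when there is more than one source, handing batches back over a channel bounded by `parallel_scan_channel_size`. The sketch below shows that bounded-channel fan-out pattern in isolation, assuming `tokio`, `tokio-stream`, and `futures` as dependencies; it stands in for the engine's `Source` and `spawn_scan_task` machinery rather than reproducing them.

```rust
use futures::StreamExt;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

#[tokio::main]
async fn main() {
    let parallel_scan_channel_size = 32;
    // Pretend each "source" yields a few numbered batches.
    let sources: Vec<Vec<u64>> = vec![vec![1, 2, 3], vec![10, 20], vec![100]];

    // Spawn a task per source; each task pushes into its own bounded channel.
    let streams: Vec<_> = sources
        .into_iter()
        .map(|batches| {
            let (sender, receiver) = mpsc::channel(parallel_scan_channel_size);
            tokio::spawn(async move {
                for batch in batches {
                    // A send error means the consumer dropped its stream; stop early.
                    if sender.send(batch).await.is_err() {
                        break;
                    }
                }
            });
            ReceiverStream::new(receiver)
        })
        .collect();

    // Consume the streams; a real scanner would merge or interleave them.
    let mut total = 0u64;
    for mut stream in streams {
        while let Some(batch) = stream.next().await {
            total += batch;
        }
    }
    assert_eq!(total, 136);
    println!("consumed {total} rows worth of batches");
}
```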
+ pub async fn build_reader_for_compaction(&self) -> Result { + assert!(self.compaction); + let part_metrics = PartitionMetrics::new( self.stream_ctx.input.mapper.metadata().region_id, 0, @@ -112,23 +105,20 @@ impl SeqScan { debug_assert_eq!(1, self.properties.partitions.len()); let partition_ranges = &self.properties.partitions[0]; - let reader = Self::build_all_merge_reader( + let reader = Self::merge_all_ranges_for_compaction( &self.stream_ctx, partition_ranges, - self.semaphore.clone(), - self.compaction, &part_metrics, ) .await?; Ok(Box::new(reader)) } - /// Builds a merge reader that reads all data. - async fn build_all_merge_reader( + /// Builds a merge reader that reads all ranges. + /// Callers MUST not split ranges before calling this method. + async fn merge_all_ranges_for_compaction( stream_ctx: &Arc, partition_ranges: &[PartitionRange], - semaphore: Arc, - compaction: bool, part_metrics: &PartitionMetrics, ) -> Result { let mut sources = Vec::new(); @@ -140,27 +130,37 @@ impl SeqScan { build_sources( stream_ctx, part_range, - compaction, + true, part_metrics, range_builder_list.clone(), &mut sources, ); } - Self::build_reader_from_sources(stream_ctx, sources, semaphore).await + + common_telemetry::debug!( + "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}", + stream_ctx.input.mapper.metadata().region_id, + partition_ranges.len(), + sources.len() + ); + Self::build_reader_from_sources(stream_ctx, sources, None).await } + /// Builds a reader to read sources. If `semaphore` is provided, reads sources in parallel + /// if possible. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] async fn build_reader_from_sources( stream_ctx: &StreamContext, mut sources: Vec, - semaphore: Arc, + semaphore: Option>, ) -> Result { - if stream_ctx.input.parallelism.parallelism > 1 { - // Read sources in parallel. We always spawn a task so we can control the parallelism - // by the semaphore. - sources = stream_ctx - .input - .create_parallel_sources(sources, semaphore.clone())?; + if let Some(semaphore) = semaphore.as_ref() { + // Read sources in parallel. + if sources.len() > 1 { + sources = stream_ctx + .input + .create_parallel_sources(sources, semaphore.clone())?; + } } let mut builder = MergeReaderBuilder::from_sources(sources); @@ -207,10 +207,21 @@ impl SeqScan { } let stream_ctx = self.stream_ctx.clone(); - let semaphore = self.semaphore.clone(); + let semaphore = if self.properties.target_partitions() > self.properties.num_partitions() { + // We can use additional tasks to read the data if we have more target partitions than actual partitions. + // This semaphore is partition level. + // We don't use a global semaphore to avoid a partition waiting for others. The final concurrency + // of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of + // files in a part range. 
+ Some(Arc::new(Semaphore::new( + self.properties.target_partitions() - self.properties.num_partitions() + 1, + ))) + } else { + None + }; let partition_ranges = self.properties.partitions[partition].clone(); let compaction = self.compaction; - let distinguish_range = self.properties.distinguish_partition_range(); + let distinguish_range = self.properties.distinguish_partition_range; let part_metrics = PartitionMetrics::new( self.stream_ctx.input.mapper.metadata().region_id, partition, @@ -325,13 +336,8 @@ impl RegionScanner for SeqScan { self.scan_partition_impl(partition) } - fn prepare( - &mut self, - ranges: Vec>, - distinguish_partition_range: bool, - ) -> Result<(), BoxedError> { - self.properties.partitions = ranges; - self.properties.distinguish_partition_range = distinguish_partition_range; + fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> { + self.properties.prepare(request); Ok(()) } @@ -375,6 +381,20 @@ fn build_sources( ) { // Gets range meta. let range_meta = &stream_ctx.ranges[part_range.identifier]; + #[cfg(debug_assertions)] + if compaction { + // Compaction expects input sources are not been split. + debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len()); + for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() { + // It should scan all row groups. + debug_assert_eq!( + -1, row_group_idx.row_group_index, + "Expect {} range scan all row groups, given: {}", + i, row_group_idx.row_group_index, + ); + } + } + sources.reserve(range_meta.row_group_indices.len()); for index in &range_meta.row_group_indices { let stream = if stream_ctx.is_mem_range_index(*index) { diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index c1ee34b08e5d..97db9b86592c 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -27,7 +27,7 @@ use datatypes::schema::SchemaRef; use futures::{Stream, StreamExt}; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; -use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties}; +use store_api::region_engine::{PrepareRequest, RegionScanner, ScannerProperties}; use crate::error::{PartitionOutOfRangeSnafu, Result}; use crate::read::range::RangeBuilderList; @@ -144,7 +144,7 @@ impl UnorderedScan { ); let stream_ctx = self.stream_ctx.clone(); let part_ranges = self.properties.partitions[partition].clone(); - let distinguish_range = self.properties.distinguish_partition_range(); + let distinguish_range = self.properties.distinguish_partition_range; let stream = try_stream! { part_metrics.on_first_poll(); @@ -231,13 +231,8 @@ impl RegionScanner for UnorderedScan { self.stream_ctx.input.mapper.output_schema() } - fn prepare( - &mut self, - ranges: Vec>, - distinguish_partition_range: bool, - ) -> Result<(), BoxedError> { - self.properties.partitions = ranges; - self.properties.distinguish_partition_range = distinguish_partition_range; + fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> { + self.properties.prepare(request); Ok(()) } diff --git a/src/query/src/optimizer/parallelize_scan.rs b/src/query/src/optimizer/parallelize_scan.rs index 02cd04df87b6..a9e0a3302436 100644 --- a/src/query/src/optimizer/parallelize_scan.rs +++ b/src/query/src/optimizer/parallelize_scan.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
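The semaphore above grants extra read parallelism only when the planner requests more target partitions than the scanner actually produced, and each partition then receives `target - actual + 1` permits. A small sketch of that sizing rule, with a hypothetical helper name:

```rust
/// Returns the number of per-partition permits for parallel reads, or `None`
/// when the partition should read its sources sequentially. Hypothetical
/// helper for illustration; not part of the mito2 API.
fn extra_scan_permits(target_partitions: usize, num_partitions: usize) -> Option<usize> {
    if target_partitions > num_partitions {
        // Spread the unassigned partition "slots" across partitions, plus one for itself.
        Some(target_partitions - num_partitions + 1)
    } else {
        None
    }
}

fn main() {
    assert_eq!(extra_scan_permits(8, 2), Some(7));
    assert_eq!(extra_scan_permits(2, 2), None);
    assert_eq!(extra_scan_permits(1, 4), None);
}
```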
+use std::collections::BinaryHeap; use std::sync::Arc; use common_telemetry::debug; @@ -93,7 +94,7 @@ impl ParallelizeScan { // update the partition ranges let new_exec = region_scan_exec - .with_new_partitions(partition_ranges) + .with_new_partitions(partition_ranges, expected_partition_num) .map_err(|e| DataFusionError::External(e.into_inner()))?; return Ok(Transformed::yes(Arc::new(new_exec))); } @@ -109,21 +110,71 @@ impl ParallelizeScan { /// Distribute [`PartitionRange`]s to each partition. /// - /// Currently we use a simple round-robin strategy to assign ranges to partitions. + /// Currently we assign ranges to partitions according to their rows so each partition + /// has similar number of rows. /// This method may return partitions with smaller number than `expected_partition_num` /// if the number of ranges is smaller than `expected_partition_num`. But this will /// return at least one partition. fn assign_partition_range( - ranges: Vec, + mut ranges: Vec, expected_partition_num: usize, ) -> Vec> { - let actual_partition_num = expected_partition_num.min(ranges.len()).max(1); + if ranges.is_empty() { + // Returns a single partition with no range. + return vec![vec![]]; + } + + if ranges.len() == 1 { + return vec![ranges]; + } + + // Sort ranges by number of rows in descending order. + ranges.sort_by(|a, b| b.num_rows.cmp(&a.num_rows)); + // Get the max row number of the ranges. Note that the number of rows may be 0 if statistics are not available. + let max_rows = ranges[0].num_rows; + let total_rows = ranges.iter().map(|range| range.num_rows).sum::(); + // Computes the partition num by the max row number. This eliminates the unbalance of the partitions. + let balanced_partition_num = if max_rows > 0 { + total_rows.div_ceil(max_rows) + } else { + ranges.len() + }; + let actual_partition_num = expected_partition_num.min(balanced_partition_num).max(1); let mut partition_ranges = vec![vec![]; actual_partition_num]; - // round-robin assignment - for (i, range) in ranges.into_iter().enumerate() { - let partition_idx = i % expected_partition_num; + #[derive(Eq, PartialEq)] + struct HeapNode { + num_rows: usize, + partition_idx: usize, + } + + impl Ord for HeapNode { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Reverse for min-heap. + self.num_rows.cmp(&other.num_rows).reverse() + } + } + + impl PartialOrd for HeapNode { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + let mut part_heap = + BinaryHeap::from_iter((0..actual_partition_num).map(|partition_idx| HeapNode { + num_rows: 0, + partition_idx, + })); + + // Assigns the range to the partition with the smallest number of rows. + for range in ranges { + // Safety: actual_partition_num always > 0. 
+ let mut node = part_heap.pop().unwrap(); + let partition_idx = node.partition_idx; + node.num_rows += range.num_rows; partition_ranges[partition_idx].push(range); + part_heap.push(node); } partition_ranges @@ -172,18 +223,18 @@ mod test { ParallelizeScan::assign_partition_range(ranges.clone(), expected_partition_num); let expected = vec![ vec![ + PartitionRange { + start: Timestamp::new(30, TimeUnit::Second), + end: Timestamp::new(40, TimeUnit::Second), + num_rows: 250, + identifier: 4, + }, PartitionRange { start: Timestamp::new(0, TimeUnit::Second), end: Timestamp::new(10, TimeUnit::Second), num_rows: 100, identifier: 1, }, - PartitionRange { - start: Timestamp::new(20, TimeUnit::Second), - end: Timestamp::new(30, TimeUnit::Second), - num_rows: 150, - identifier: 3, - }, ], vec![ PartitionRange { @@ -193,10 +244,10 @@ mod test { identifier: 2, }, PartitionRange { - start: Timestamp::new(30, TimeUnit::Second), - end: Timestamp::new(40, TimeUnit::Second), - num_rows: 250, - identifier: 4, + start: Timestamp::new(20, TimeUnit::Second), + end: Timestamp::new(30, TimeUnit::Second), + num_rows: 150, + identifier: 3, }, ], ]; @@ -207,34 +258,100 @@ mod test { let result = ParallelizeScan::assign_partition_range(ranges, expected_partition_num); let expected = vec![ vec![PartitionRange { + start: Timestamp::new(30, TimeUnit::Second), + end: Timestamp::new(40, TimeUnit::Second), + num_rows: 250, + identifier: 4, + }], + vec![PartitionRange { + start: Timestamp::new(10, TimeUnit::Second), + end: Timestamp::new(20, TimeUnit::Second), + num_rows: 200, + identifier: 2, + }], + vec![ + PartitionRange { + start: Timestamp::new(20, TimeUnit::Second), + end: Timestamp::new(30, TimeUnit::Second), + num_rows: 150, + identifier: 3, + }, + PartitionRange { + start: Timestamp::new(0, TimeUnit::Second), + end: Timestamp::new(10, TimeUnit::Second), + num_rows: 100, + identifier: 1, + }, + ], + ]; + assert_eq!(result, expected); + + // assign 0 ranges to 5 partitions. Only 1 partition is returned. 
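Condensed, the assignment strategy above is: sort ranges by row count in descending order, cap the partition count so a single oversized range cannot leave the others nearly empty, then always hand the next range to the partition that currently holds the fewest rows. The sketch below mirrors that logic with plain row counts and `std::cmp::Reverse` instead of the hand-written `Ord` impl, so it is a simplification rather than the optimizer's exact code; it uses the same row counts as the new unbalanced test and additionally shows the partition-count cap when more partitions are requested.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Assigns row counts (stand-ins for `PartitionRange`s) to partitions so that
/// each partition ends up with a similar number of rows.
fn assign_by_rows(mut rows: Vec<usize>, expected_partitions: usize) -> Vec<Vec<usize>> {
    if rows.is_empty() {
        return vec![vec![]];
    }
    // Largest ranges first.
    rows.sort_unstable_by(|a, b| b.cmp(a));
    let max_rows = rows[0];
    let total_rows: usize = rows.iter().sum();
    // With statistics available, at most ceil(total / max) partitions can stay balanced.
    let balanced = if max_rows > 0 {
        total_rows.div_ceil(max_rows)
    } else {
        rows.len()
    };
    let partitions = expected_partitions.min(balanced).max(1);

    let mut output: Vec<Vec<usize>> = vec![vec![]; partitions];
    // Min-heap keyed by (rows assigned so far, partition index).
    let mut heap: BinaryHeap<Reverse<(usize, usize)>> =
        (0..partitions).map(|idx| Reverse((0, idx))).collect();
    for row in rows {
        let Reverse((assigned, idx)) = heap.pop().expect("at least one partition");
        output[idx].push(row);
        heap.push(Reverse((assigned + row, idx)));
    }
    output
}

fn main() {
    // One huge range dominates, so only two partitions stay balanced even
    // though four were requested.
    let parts = assign_by_rows(vec![100, 200, 150, 2500], 4);
    assert_eq!(parts.len(), 2);
    assert_eq!(parts[0], vec![2500]);
    assert_eq!(parts[1], vec![200, 150, 100]);
    println!("{parts:?}");
}
```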
+ let result = ParallelizeScan::assign_partition_range(vec![], 5); + assert_eq!(result.len(), 1); + } + + #[test] + fn test_assign_unbalance_partition_range() { + let ranges = vec![ + PartitionRange { start: Timestamp::new(0, TimeUnit::Second), end: Timestamp::new(10, TimeUnit::Second), num_rows: 100, identifier: 1, - }], - vec![PartitionRange { + }, + PartitionRange { start: Timestamp::new(10, TimeUnit::Second), end: Timestamp::new(20, TimeUnit::Second), num_rows: 200, identifier: 2, - }], - vec![PartitionRange { + }, + PartitionRange { start: Timestamp::new(20, TimeUnit::Second), end: Timestamp::new(30, TimeUnit::Second), num_rows: 150, identifier: 3, - }], + }, + PartitionRange { + start: Timestamp::new(30, TimeUnit::Second), + end: Timestamp::new(40, TimeUnit::Second), + num_rows: 2500, + identifier: 4, + }, + ]; + + // assign to 2 partitions + let expected_partition_num = 2; + let result = + ParallelizeScan::assign_partition_range(ranges.clone(), expected_partition_num); + let expected = vec![ vec![PartitionRange { start: Timestamp::new(30, TimeUnit::Second), end: Timestamp::new(40, TimeUnit::Second), - num_rows: 250, + num_rows: 2500, identifier: 4, }], + vec![ + PartitionRange { + start: Timestamp::new(10, TimeUnit::Second), + end: Timestamp::new(20, TimeUnit::Second), + num_rows: 200, + identifier: 2, + }, + PartitionRange { + start: Timestamp::new(20, TimeUnit::Second), + end: Timestamp::new(30, TimeUnit::Second), + num_rows: 150, + identifier: 3, + }, + PartitionRange { + start: Timestamp::new(0, TimeUnit::Second), + end: Timestamp::new(10, TimeUnit::Second), + num_rows: 100, + identifier: 1, + }, + ], ]; assert_eq!(result, expected); - - // assign 0 ranges to 5 partitions. Only 1 partition is returned. - let result = ParallelizeScan::assign_partition_range(vec![], 5); - assert_eq!(result.len(), 1); } } diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 8dd706395d1d..c9b0ac53db59 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -206,16 +206,13 @@ pub struct ScannerProperties { /// Whether to yield an empty batch to distinguish partition ranges. pub distinguish_partition_range: bool, + + /// The target partitions of the scanner. 0 indicates using the number of partitions as target partitions. + target_partitions: usize, } impl ScannerProperties { - /// Initialize partitions with given parallelism for scanner. - pub fn with_parallelism(mut self, parallelism: usize) -> Self { - self.partitions = vec![vec![]; parallelism]; - self - } - - /// Set append mode for scanner. + /// Sets append mode for scanner. pub fn with_append_mode(mut self, append_mode: bool) -> Self { self.append_mode = append_mode; self @@ -234,9 +231,24 @@ impl ScannerProperties { append_mode, total_rows, distinguish_partition_range: false, + target_partitions: 0, } } + /// Updates the properties with the given [PrepareRequest]. + pub fn prepare(&mut self, request: PrepareRequest) { + if let Some(ranges) = request.ranges { + self.partitions = ranges; + } + if let Some(distinguish_partition_range) = request.distinguish_partition_range { + self.distinguish_partition_range = distinguish_partition_range; + } + if let Some(target_partitions) = request.target_partitions { + self.target_partitions = target_partitions; + } + } + + /// Returns the number of actual partitions. 
pub fn num_partitions(&self) -> usize { self.partitions.len() } @@ -249,8 +261,44 @@ impl ScannerProperties { self.total_rows } - pub fn distinguish_partition_range(&self) -> bool { - self.distinguish_partition_range + /// Returns the target partitions of the scanner. If it is not set, returns the number of partitions. + pub fn target_partitions(&self) -> usize { + if self.target_partitions == 0 { + self.num_partitions() + } else { + self.target_partitions + } + } +} + +/// Request to override the scanner properties. +#[derive(Default)] +pub struct PrepareRequest { + /// Assigned partition ranges. + pub ranges: Option>>, + /// Distringuishes partition range by empty batches. + pub distinguish_partition_range: Option, + /// The expected number of target partitions. + pub target_partitions: Option, +} + +impl PrepareRequest { + /// Sets the ranges. + pub fn with_ranges(mut self, ranges: Vec>) -> Self { + self.ranges = Some(ranges); + self + } + + /// Sets the distinguish partition range flag. + pub fn with_distinguish_partition_range(mut self, distinguish_partition_range: bool) -> Self { + self.distinguish_partition_range = Some(distinguish_partition_range); + self + } + + /// Sets the target partitions. + pub fn with_target_partitions(mut self, target_partitions: usize) -> Self { + self.target_partitions = Some(target_partitions); + self } } @@ -271,11 +319,7 @@ pub trait RegionScanner: Debug + DisplayAs + Send { /// Prepares the scanner with the given partition ranges. /// /// This method is for the planner to adjust the scanner's behavior based on the partition ranges. - fn prepare( - &mut self, - ranges: Vec>, - distinguish_partition_range: bool, - ) -> Result<(), BoxedError>; + fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError>; /// Scans the partition and returns a stream of record batches. 
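Taken together, `PrepareRequest` and `ScannerProperties::prepare` above form a partial-override pattern: only the fields a caller actually sets are applied, and a zero `target_partitions` falls back to the number of assigned partitions. The sketch below replays those semantics with local stand-in types (it is not the `store-api` code itself) to show how repeated `prepare` calls compose.

```rust
// Local stand-ins for the scanner properties and prepare request discussed above.
#[derive(Default)]
struct Props {
    /// Stand-in for `Vec<Vec<PartitionRange>>`.
    partitions: Vec<Vec<u32>>,
    distinguish_partition_range: bool,
    target_partitions: usize,
}

#[derive(Default)]
struct Prepare {
    ranges: Option<Vec<Vec<u32>>>,
    distinguish_partition_range: Option<bool>,
    target_partitions: Option<usize>,
}

impl Props {
    /// Applies only the fields that the request actually carries.
    fn prepare(&mut self, request: Prepare) {
        if let Some(ranges) = request.ranges {
            self.partitions = ranges;
        }
        if let Some(flag) = request.distinguish_partition_range {
            self.distinguish_partition_range = flag;
        }
        if let Some(target) = request.target_partitions {
            self.target_partitions = target;
        }
    }

    /// Falls back to the actual partition count when no target was requested.
    fn target_partitions(&self) -> usize {
        if self.target_partitions == 0 {
            self.partitions.len()
        } else {
            self.target_partitions
        }
    }
}

fn main() {
    let mut props = Props::default();
    props.prepare(Prepare {
        ranges: Some(vec![vec![1], vec![2]]),
        ..Default::default()
    });
    // Nothing set the target yet, so it falls back to the partition count.
    assert_eq!(props.target_partitions(), 2);
    assert!(!props.distinguish_partition_range);

    // A later prepare call only touches the target, leaving the ranges intact.
    props.prepare(Prepare {
        target_partitions: Some(8),
        ..Default::default()
    });
    assert_eq!(props.partitions.len(), 2);
    assert_eq!(props.target_partitions(), 8);
}
```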
/// @@ -431,9 +475,7 @@ impl SinglePartitionScanner { Self { stream: Mutex::new(Some(stream)), schema, - properties: ScannerProperties::default() - .with_parallelism(1) - .with_append_mode(append_mode), + properties: ScannerProperties::default().with_append_mode(append_mode), metadata, } } @@ -454,13 +496,8 @@ impl RegionScanner for SinglePartitionScanner { self.schema.clone() } - fn prepare( - &mut self, - ranges: Vec>, - distinguish_partition_range: bool, - ) -> Result<(), BoxedError> { - self.properties.partitions = ranges; - self.properties.distinguish_partition_range = distinguish_partition_range; + fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> { + self.properties.prepare(request); Ok(()) } diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index 0eac7c0c354f..e4b47fa4fb2a 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -35,7 +35,7 @@ use datafusion_common::{ColumnStatistics, DataFusionError, Statistics}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalSortExpr}; use datatypes::arrow::datatypes::SchemaRef as ArrowSchemaRef; use futures::{Stream, StreamExt}; -use store_api::region_engine::{PartitionRange, RegionScannerRef}; +use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScannerRef}; use crate::table::metrics::StreamMetrics; @@ -112,6 +112,7 @@ impl RegionScanExec { pub fn with_new_partitions( &self, partitions: Vec>, + target_partitions: usize, ) -> Result { if self.is_partition_set { warn!("Setting partition ranges more than once for RegionScanExec"); @@ -123,8 +124,11 @@ impl RegionScanExec { { let mut scanner = self.scanner.lock().unwrap(); - let distinguish_partition_range = scanner.properties().distinguish_partition_range(); - scanner.prepare(partitions, distinguish_partition_range)?; + scanner.prepare( + PrepareRequest::default() + .with_ranges(partitions) + .with_target_partitions(target_partitions), + )?; } Ok(Self { @@ -141,9 +145,10 @@ impl RegionScanExec { pub fn with_distinguish_partition_range(&self, distinguish_partition_range: bool) { let mut scanner = self.scanner.lock().unwrap(); - let partition_ranges = scanner.properties().partitions.clone(); // set distinguish_partition_range won't fail - let _ = scanner.prepare(partition_ranges, distinguish_partition_range); + let _ = scanner.prepare( + PrepareRequest::default().with_distinguish_partition_range(distinguish_partition_range), + ); } pub fn time_index(&self) -> String { diff --git a/tests/cases/standalone/common/order/windowed_sort.result b/tests/cases/standalone/common/order/windowed_sort.result index 13b3503fb943..3ae8f9f8469a 100644 --- a/tests/cases/standalone/common/order/windowed_sort.result +++ b/tests/cases/standalone/common/order/windowed_sort.result @@ -1,5 +1,5 @@ -- Test without PK, with a windowed sort query. 
-CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX); +CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX) WITH('compaction.type'='twcs', 'compaction.twcs.max_inactive_window_files'='4'); Affected Rows: 0 @@ -69,8 +69,8 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5; |_|_|_| | 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED |_|_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST] REDACTED -|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED -|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED +|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=4 fetch=5 REDACTED +|_|_|_SeqScan: region=REDACTED, partition_count=4 (1 memtable ranges, 3 file 3 ranges) REDACTED |_|_|_| |_|_| Total rows: 5_| +-+-+-+ @@ -101,9 +101,9 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED |_|_|_SortPreservingMergeExec: [t@1 DESC] REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=2 fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=2 limit=5 REDACTED -|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED +|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=4 fetch=5 REDACTED +|_|_|_PartSortExec: expr=t@1 DESC num_ranges=4 limit=5 REDACTED +|_|_|_SeqScan: region=REDACTED, partition_count=4 (1 memtable ranges, 3 file 3 ranges) REDACTED |_|_|_| |_|_| Total rows: 5_| +-+-+-+ @@ -113,7 +113,7 @@ DROP TABLE test; Affected Rows: 0 -- Test with PK, with a windowed sort query. -CREATE TABLE test_pk(pk INTEGER PRIMARY KEY, i INTEGER, t TIMESTAMP TIME INDEX); +CREATE TABLE test_pk(pk INTEGER PRIMARY KEY, i INTEGER, t TIMESTAMP TIME INDEX) WITH('compaction.type'='twcs', 'compaction.twcs.max_inactive_window_files'='4'); Affected Rows: 0 @@ -183,9 +183,9 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t LIMIT 5; |_|_|_| | 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED |_|_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST] REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 limit=5 REDACTED -|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED +|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=4 fetch=5 REDACTED +|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=4 limit=5 REDACTED +|_|_|_SeqScan: region=REDACTED, partition_count=4 (1 memtable ranges, 3 file 3 ranges) REDACTED |_|_|_| |_|_| Total rows: 5_| +-+-+-+ @@ -216,9 +216,9 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED |_|_|_SortPreservingMergeExec: [t@2 DESC] REDACTED -|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=2 fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 DESC num_ranges=2 limit=5 REDACTED -|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED +|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=4 fetch=5 REDACTED +|_|_|_PartSortExec: expr=t@2 DESC num_ranges=4 limit=5 REDACTED +|_|_|_SeqScan: region=REDACTED, partition_count=4 (1 memtable ranges, 3 file 3 ranges) REDACTED |_|_|_| |_|_| Total rows: 5_| +-+-+-+ diff --git a/tests/cases/standalone/common/order/windowed_sort.sql b/tests/cases/standalone/common/order/windowed_sort.sql index e8006f74ce17..e21ae3764bdb 100644 --- a/tests/cases/standalone/common/order/windowed_sort.sql +++ 
b/tests/cases/standalone/common/order/windowed_sort.sql @@ -1,5 +1,5 @@ -- Test without PK, with a windowed sort query. -CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX); +CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX) WITH('compaction.type'='twcs', 'compaction.twcs.max_inactive_window_files'='4'); INSERT INTO test VALUES (1, 1), (NULL, 2), (1, 3); @@ -36,7 +36,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5; DROP TABLE test; -- Test with PK, with a windowed sort query. -CREATE TABLE test_pk(pk INTEGER PRIMARY KEY, i INTEGER, t TIMESTAMP TIME INDEX); +CREATE TABLE test_pk(pk INTEGER PRIMARY KEY, i INTEGER, t TIMESTAMP TIME INDEX) WITH('compaction.type'='twcs', 'compaction.twcs.max_inactive_window_files'='4'); INSERT INTO test_pk VALUES (1, 1, 1), (2, NULL, 2), (3, 1, 3); From ce86ba3425924b83668dccdd6d9811e32b2e3933 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Mon, 9 Dec 2024 21:39:18 +0800 Subject: [PATCH 16/36] chore: Reduce FETCH_OPTION_TIMEOUT from 10 to 3 seconds in config.rs (#5117) Reduce FETCH_OPTION_TIMEOUT from 10 to 3 seconds in config.rs --- src/mito2/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index cb4022f65e57..067ab27938a2 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -46,7 +46,7 @@ const PAGE_CACHE_SIZE_FACTOR: u64 = 8; const INDEX_CREATE_MEM_THRESHOLD_FACTOR: u64 = 16; /// Fetch option timeout -pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(10); +pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(3); /// Configuration for [MitoEngine](crate::engine::MitoEngine). #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] From 03a28320d6f322000790eeb2b81ce7cb2cb9a972 Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Tue, 10 Dec 2024 12:03:44 +0800 Subject: [PATCH 17/36] feat!: enable read cache and write cache when using remote object stores (#5093) * feat: enable read cache and write cache when using remote object stores * feat: make read cache be aware of remote store names * chore: docs * chore: apply review suggestions * chore: trim write cache path --------- Co-authored-by: Yingwen --- config/config.md | 16 ++++---- config/datanode.example.toml | 14 +++---- config/standalone.example.toml | 14 +++---- src/datanode/src/config.rs | 28 ++++++++++++- src/datanode/src/datanode.rs | 8 +++- src/datanode/src/lib.rs | 1 + src/datanode/src/store.rs | 63 +++++++++++++++++++++--------- src/mito2/src/config.rs | 10 +++-- src/mito2/src/error.rs | 8 ++++ src/mito2/src/worker.rs | 10 ++++- src/object-store/src/lib.rs | 2 + tests-integration/src/test_util.rs | 3 ++ tests-integration/tests/http.rs | 2 +- tests-integration/tests/main.rs | 3 +- 14 files changed, 132 insertions(+), 50 deletions(-) diff --git a/config/config.md b/config/config.md index 1f034d28731d..0f70a8cb126e 100644 --- a/config/config.md +++ b/config/config.md @@ -93,7 +93,7 @@ | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. | | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | -| `storage.cache_path` | String | Unset | Cache configuration for object storage such as 'S3' etc. It is recommended to configure it when using object storage for better performance.
The local file cache directory. | +| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc. It is configured by default when using object storage, and keeping it enabled is recommended for better performance.
A local file directory, defaulting to `{data_home}/object_cache/read`. An empty string disables the read cache. | | `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. | | `storage.bucket` | String | Unset | The S3 bucket name.
**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. | | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. | @@ -131,9 +131,9 @@ | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. | | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
If not set, it's default to 1/8 of OS memory. | | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. | -| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache. It is recommended to enable it when using object storage for better performance. | -| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/write_cache`. | -| `region_engine.mito.experimental_write_cache_size` | String | `1GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | +| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. | +| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/object_cache/write`. | +| `region_engine.mito.experimental_write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | @@ -420,7 +420,7 @@ | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. | | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | -| `storage.cache_path` | String | Unset | Cache configuration for object storage such as 'S3' etc. It is recommended to configure it when using object storage for better performance.
The local file cache directory. | +| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc. It is configured by default when using object storage, and keeping it enabled is recommended for better performance.
A local file directory, defaulting to `{data_home}/object_cache/read`. An empty string disables the read cache. | | `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. | | `storage.bucket` | String | Unset | The S3 bucket name.
**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. | | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. | @@ -458,9 +458,9 @@ | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.
If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. | | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.
If not set, it's default to 1/8 of OS memory. | | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. | -| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache. It is recommended to enable it when using object storage for better performance. | -| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/write_cache`. | -| `region_engine.mito.experimental_write_cache_size` | String | `1GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | +| `region_engine.mito.enable_experimental_write_cache` | Bool | `false` | Whether to enable the experimental write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. | +| `region_engine.mito.experimental_write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}/object_cache/write`. | +| `region_engine.mito.experimental_write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.experimental_write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 11c2794e61df..8bfa8732cc41 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -294,14 +294,14 @@ data_home = "/tmp/greptimedb/" ## - `Oss`: the data is stored in the Aliyun OSS. type = "File" -## Cache configuration for object storage such as 'S3' etc. It is recommended to configure it when using object storage for better performance. -## The local file cache directory. +## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance. +## A local file directory, defaults to `{data_home}/object_cache/read`. An empty string means disabling. ## @toml2docs:none-default -cache_path = "/path/local_cache" +#+ cache_path = "" ## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. ## @toml2docs:none-default -cache_capacity = "1GiB" +cache_capacity = "5GiB" ## The S3 bucket name. ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**. @@ -476,14 +476,14 @@ auto_flush_interval = "1h" ## @toml2docs:none-default="Auto" #+ selector_result_cache_size = "512MB" -## Whether to enable the experimental write cache. It is recommended to enable it when using object storage for better performance. +## Whether to enable the experimental write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. enable_experimental_write_cache = false -## File system path for write cache, defaults to `{data_home}/write_cache`. +## File system path for write cache, defaults to `{data_home}/object_cache/write`. experimental_write_cache_path = "" ## Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. 
-experimental_write_cache_size = "1GiB" +experimental_write_cache_size = "5GiB" ## TTL for write cache. ## @toml2docs:none-default diff --git a/config/standalone.example.toml b/config/standalone.example.toml index a69295af1644..56cbeaddb9d5 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -332,14 +332,14 @@ data_home = "/tmp/greptimedb/" ## - `Oss`: the data is stored in the Aliyun OSS. type = "File" -## Cache configuration for object storage such as 'S3' etc. It is recommended to configure it when using object storage for better performance. -## The local file cache directory. +## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance. +## A local file directory, defaults to `{data_home}/object_cache/read`. An empty string means disabling. ## @toml2docs:none-default -cache_path = "/path/local_cache" +#+ cache_path = "" ## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. ## @toml2docs:none-default -cache_capacity = "1GiB" +cache_capacity = "5GiB" ## The S3 bucket name. ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**. @@ -514,14 +514,14 @@ auto_flush_interval = "1h" ## @toml2docs:none-default="Auto" #+ selector_result_cache_size = "512MB" -## Whether to enable the experimental write cache. It is recommended to enable it when using object storage for better performance. +## Whether to enable the experimental write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. enable_experimental_write_cache = false -## File system path for write cache, defaults to `{data_home}/write_cache`. +## File system path for write cache, defaults to `{data_home}/object_cache/write`. experimental_write_cache_path = "" ## Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. -experimental_write_cache_size = "1GiB" +experimental_write_cache_size = "5GiB" ## TTL for write cache. ## @toml2docs:none-default diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs index 4fedb9ea2cc0..4e1b4f3195a7 100644 --- a/src/datanode/src/config.rs +++ b/src/datanode/src/config.rs @@ -32,7 +32,7 @@ use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; use servers::Mode; -pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::gb(1); +pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::gb(5); /// Default data home in file storage const DEFAULT_DATA_HOME: &str = "/tmp/greptimedb"; @@ -60,6 +60,11 @@ impl ObjectStoreConfig { } } + /// Returns true when it's a remote object storage such as AWS s3 etc. + pub fn is_object_storage(&self) -> bool { + !matches!(self, Self::File(_)) + } + /// Returns the object storage configuration name, return the provider name if it's empty. pub fn config_name(&self) -> &str { let name = match self { @@ -91,6 +96,13 @@ pub struct StorageConfig { pub providers: Vec, } +impl StorageConfig { + /// Returns true when the default storage config is a remote object storage service such as AWS S3, etc. 
+ pub fn is_object_storage(&self) -> bool { + self.store.is_object_storage() + } +} + impl Default for StorageConfig { fn default() -> Self { Self { @@ -452,6 +464,20 @@ mod tests { assert_eq!("S3", s3_config.provider_name()); } + #[test] + fn test_is_object_storage() { + let store = ObjectStoreConfig::default(); + assert!(!store.is_object_storage()); + let s3_config = ObjectStoreConfig::S3(S3Config::default()); + assert!(s3_config.is_object_storage()); + let oss_config = ObjectStoreConfig::Oss(OssConfig::default()); + assert!(oss_config.is_object_storage()); + let gcs_config = ObjectStoreConfig::Gcs(GcsConfig::default()); + assert!(gcs_config.is_object_storage()); + let azblob_config = ObjectStoreConfig::Azblob(AzblobConfig::default()); + assert!(azblob_config.is_object_storage()); + } + #[test] fn test_secstr() { let toml_str = r#" diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index c89c007082bf..53a0cf9fd78b 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -428,10 +428,16 @@ impl DatanodeBuilder { async fn build_mito_engine( opts: &DatanodeOptions, object_store_manager: ObjectStoreManagerRef, - config: MitoConfig, + mut config: MitoConfig, schema_metadata_manager: SchemaMetadataManagerRef, plugins: Plugins, ) -> Result { + if opts.storage.is_object_storage() { + // Enable the write cache when setting object storage + config.enable_experimental_write_cache = true; + info!("Configured 'enable_experimental_write_cache=true' for mito engine."); + } + let mito_engine = match &opts.wal { DatanodeWalConfig::RaftEngine(raft_engine_config) => MitoEngine::new( &opts.storage.data_home, diff --git a/src/datanode/src/lib.rs b/src/datanode/src/lib.rs index dae3eef76c79..6a7b1c596d2d 100644 --- a/src/datanode/src/lib.rs +++ b/src/datanode/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. 
#![feature(assert_matches)] +#![feature(let_chains)] pub mod alive_keeper; pub mod config; diff --git a/src/datanode/src/store.rs b/src/datanode/src/store.rs index e8ede076741f..c78afe448e0c 100644 --- a/src/datanode/src/store.rs +++ b/src/datanode/src/store.rs @@ -19,21 +19,20 @@ mod fs; mod gcs; mod oss; mod s3; - +use std::path::Path; use std::sync::Arc; use std::time::Duration; use std::{env, path}; -use common_base::readable_size::ReadableSize; use common_telemetry::{info, warn}; use object_store::layers::{LruCacheLayer, RetryInterceptor, RetryLayer}; use object_store::services::Fs; use object_store::util::{join_dir, normalize_dir, with_instrument_layers}; -use object_store::{Access, Error, HttpClient, ObjectStore, ObjectStoreBuilder}; +use object_store::{Access, Error, HttpClient, ObjectStore, ObjectStoreBuilder, OBJECT_CACHE_DIR}; use snafu::prelude::*; use crate::config::{HttpClientConfig, ObjectStoreConfig, DEFAULT_OBJECT_STORE_CACHE_SIZE}; -use crate::error::{self, Result}; +use crate::error::{self, CreateDirSnafu, Result}; pub(crate) async fn new_raw_object_store( store: &ObjectStoreConfig, @@ -68,7 +67,7 @@ pub(crate) async fn new_object_store_without_cache( ) -> Result { let object_store = new_raw_object_store(store, data_home).await?; // Enable retry layer and cache layer for non-fs object storages - let object_store = if !matches!(store, ObjectStoreConfig::File(..)) { + let object_store = if store.is_object_storage() { // Adds retry layer with_retry_layers(object_store) } else { @@ -85,8 +84,8 @@ pub(crate) async fn new_object_store( ) -> Result { let object_store = new_raw_object_store(&store, data_home).await?; // Enable retry layer and cache layer for non-fs object storages - let object_store = if !matches!(store, ObjectStoreConfig::File(..)) { - let object_store = if let Some(cache_layer) = build_cache_layer(&store).await? { + let object_store = if store.is_object_storage() { + let object_store = if let Some(cache_layer) = build_cache_layer(&store, data_home).await? 
{ // Adds cache layer object_store.layer(cache_layer) } else { @@ -105,44 +104,72 @@ pub(crate) async fn new_object_store( async fn build_cache_layer( store_config: &ObjectStoreConfig, + data_home: &str, ) -> Result>> { - let (cache_path, cache_capacity) = match store_config { + let (name, mut cache_path, cache_capacity) = match store_config { ObjectStoreConfig::S3(s3_config) => { - let path = s3_config.cache.cache_path.as_ref(); + let path = s3_config.cache.cache_path.clone(); + let name = &s3_config.name; let capacity = s3_config .cache .cache_capacity .unwrap_or(DEFAULT_OBJECT_STORE_CACHE_SIZE); - (path, capacity) + (name, path, capacity) } ObjectStoreConfig::Oss(oss_config) => { - let path = oss_config.cache.cache_path.as_ref(); + let path = oss_config.cache.cache_path.clone(); + let name = &oss_config.name; let capacity = oss_config .cache .cache_capacity .unwrap_or(DEFAULT_OBJECT_STORE_CACHE_SIZE); - (path, capacity) + (name, path, capacity) } ObjectStoreConfig::Azblob(azblob_config) => { - let path = azblob_config.cache.cache_path.as_ref(); + let path = azblob_config.cache.cache_path.clone(); + let name = &azblob_config.name; let capacity = azblob_config .cache .cache_capacity .unwrap_or(DEFAULT_OBJECT_STORE_CACHE_SIZE); - (path, capacity) + (name, path, capacity) } ObjectStoreConfig::Gcs(gcs_config) => { - let path = gcs_config.cache.cache_path.as_ref(); + let path = gcs_config.cache.cache_path.clone(); + let name = &gcs_config.name; let capacity = gcs_config .cache .cache_capacity .unwrap_or(DEFAULT_OBJECT_STORE_CACHE_SIZE); - (path, capacity) + (name, path, capacity) } - _ => (None, ReadableSize(0)), + _ => unreachable!("Already checked above"), }; - if let Some(path) = cache_path { + // Enable object cache by default + // Set the cache_path to be `${data_home}/object_cache/read/{name}` by default + // if it's not present + if cache_path.is_none() { + let object_cache_path = join_dir(data_home, OBJECT_CACHE_DIR); + let read_cache_path = join_dir(&object_cache_path, "read"); + let read_cache_path = join_dir(&read_cache_path, &name.to_lowercase()); + tokio::fs::create_dir_all(Path::new(&read_cache_path)) + .await + .context(CreateDirSnafu { + dir: &read_cache_path, + })?; + + info!( + "The object storage cache path is not set for '{}', using the default path: '{}'", + name, &read_cache_path + ); + + cache_path = Some(read_cache_path); + } + + if let Some(path) = cache_path.as_ref() + && !path.trim().is_empty() + { let atomic_temp_dir = join_dir(path, ".tmp/"); clean_temp_dir(&atomic_temp_dir)?; diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 067ab27938a2..9b113027a41b 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -21,6 +21,7 @@ use std::time::Duration; use common_base::readable_size::ReadableSize; use common_telemetry::warn; use object_store::util::join_dir; +use object_store::OBJECT_CACHE_DIR; use serde::{Deserialize, Serialize}; use serde_with::serde_as; @@ -96,7 +97,7 @@ pub struct MitoConfig { pub selector_result_cache_size: ReadableSize, /// Whether to enable the experimental write cache. pub enable_experimental_write_cache: bool, - /// File system path for write cache, defaults to `{data_home}/write_cache`. + /// File system path for write cache, defaults to `{data_home}/object_cache/write`. pub experimental_write_cache_path: String, /// Capacity for write cache. 
pub experimental_write_cache_size: ReadableSize, @@ -148,7 +149,7 @@ impl Default for MitoConfig { selector_result_cache_size: ReadableSize::mb(512), enable_experimental_write_cache: false, experimental_write_cache_path: String::new(), - experimental_write_cache_size: ReadableSize::gb(1), + experimental_write_cache_size: ReadableSize::gb(5), experimental_write_cache_ttl: None, sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, @@ -232,8 +233,9 @@ impl MitoConfig { } // Sets write cache path if it is empty. - if self.experimental_write_cache_path.is_empty() { - self.experimental_write_cache_path = join_dir(data_home, "write_cache"); + if self.experimental_write_cache_path.trim().is_empty() { + let object_cache_path = join_dir(data_home, OBJECT_CACHE_DIR); + self.experimental_write_cache_path = join_dir(&object_cache_path, "write"); } self.index.sanitize(data_home, &self.inverted_index)?; diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 407c8c29e258..d5e47d213657 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -677,6 +677,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to create directory {}", dir))] + CreateDir { + dir: String, + #[snafu(source)] + error: std::io::Error, + }, + #[snafu(display("Failed to filter record batch"))] FilterRecordBatch { source: common_recordbatch::error::Error, @@ -955,6 +962,7 @@ impl ErrorExt for Error { | ComputeVector { .. } | SerializeField { .. } | EncodeMemtable { .. } + | CreateDir { .. } | ReadDataPart { .. } | CorruptedEntry { .. } | BuildEntry { .. } => StatusCode::Internal, diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index e883f1833809..33d26c8196df 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -25,8 +25,8 @@ mod handle_manifest; mod handle_open; mod handle_truncate; mod handle_write; - use std::collections::HashMap; +use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; @@ -50,7 +50,7 @@ use crate::cache::write_cache::{WriteCache, WriteCacheRef}; use crate::cache::{CacheManager, CacheManagerRef}; use crate::compaction::CompactionScheduler; use crate::config::MitoConfig; -use crate::error::{JoinSnafu, Result, WorkerStoppedSnafu}; +use crate::error::{CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu}; use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef}; use crate::memtable::MemtableBuilderProvider; use crate::metrics::{REGION_COUNT, WRITE_STALL_TOTAL}; @@ -373,6 +373,12 @@ async fn write_cache_from_config( // TODO(yingwen): Remove this and document the config once the write cache is ready. warn!("Write cache is an experimental feature"); + tokio::fs::create_dir_all(Path::new(&config.experimental_write_cache_path)) + .await + .context(CreateDirSnafu { + dir: &config.experimental_write_cache_path, + })?; + let cache = WriteCache::new_fs( &config.experimental_write_cache_path, object_store_manager, diff --git a/src/object-store/src/lib.rs b/src/object-store/src/lib.rs index 797e75f42d54..851484a0cffa 100644 --- a/src/object-store/src/lib.rs +++ b/src/object-store/src/lib.rs @@ -24,3 +24,5 @@ pub mod manager; mod metrics; pub mod test_util; pub mod util; +/// The default object cache directory name. 
+pub const OBJECT_CACHE_DIR: &str = "object_cache"; diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index b3a7269ae003..57ec7f6f86f9 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -233,6 +233,9 @@ pub fn get_test_store_config(store_type: &StorageType) -> (ObjectStoreConfig, Te if *store_type == StorageType::S3WithCache { s3_config.cache.cache_path = Some("/tmp/greptimedb_cache".to_string()); + } else { + // An empty string means disabling. + s3_config.cache.cache_path = Some("".to_string()); } let mut builder = S3::default() diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 106c906372b2..083a9daa1a26 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -917,7 +917,7 @@ compress_manifest = false auto_flush_interval = "30m" enable_experimental_write_cache = false experimental_write_cache_path = "" -experimental_write_cache_size = "1GiB" +experimental_write_cache_size = "5GiB" sst_write_buffer_size = "8MiB" parallel_scan_channel_size = 32 allow_stale_entries = false diff --git a/tests-integration/tests/main.rs b/tests-integration/tests/main.rs index 4fc19f24b284..b30820517f9b 100644 --- a/tests-integration/tests/main.rs +++ b/tests-integration/tests/main.rs @@ -22,8 +22,9 @@ mod sql; mod region_migration; grpc_tests!(File, S3, S3WithCache, Oss, Azblob, Gcs); + http_tests!(File, S3, S3WithCache, Oss, Azblob, Gcs); -// region_failover_tests!(File, S3, S3WithCache, Oss, Azblob); + sql_tests!(File); region_migration_tests!(File); From 7c69ca05026be5faa0c4868d6bdcfa70d03aee5c Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 10 Dec 2024 21:10:37 +0800 Subject: [PATCH 18/36] chore: bump main branch version to 0.12 (#5133) chore: bump version to v0.12.0 --- .github/workflows/release.yml | 2 +- Cargo.lock | 146 +++++++++++++++++----------------- Cargo.toml | 2 +- 3 files changed, 75 insertions(+), 75 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4f32298a8ba2..3f46ef1a7bda 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -91,7 +91,7 @@ env: # The scheduled version is '${{ env.NEXT_RELEASE_VERSION }}-nightly-YYYYMMDD', like v0.2.0-nigthly-20230313; NIGHTLY_RELEASE_PREFIX: nightly # Note: The NEXT_RELEASE_VERSION should be modified manually by every formal release. 
- NEXT_RELEASE_VERSION: v0.11.0 + NEXT_RELEASE_VERSION: v0.12.0 # Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs permissions: diff --git a/Cargo.lock b/Cargo.lock index 920393daa030..177625a65955 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -188,7 +188,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-base", "common-decimal", @@ -749,7 +749,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -1340,7 +1340,7 @@ dependencies = [ [[package]] name = "cache" -version = "0.11.0" +version = "0.12.0" dependencies = [ "catalog", "common-error", @@ -1348,7 +1348,7 @@ dependencies = [ "common-meta", "moka", "snafu 0.8.5", - "substrait 0.11.0", + "substrait 0.12.0", ] [[package]] @@ -1375,7 +1375,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arrow", @@ -1714,7 +1714,7 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "cli" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "auth", @@ -1758,7 +1758,7 @@ dependencies = [ "session", "snafu 0.8.5", "store-api", - "substrait 0.11.0", + "substrait 0.12.0", "table", "temp-env", "tempfile", @@ -1768,7 +1768,7 @@ dependencies = [ [[package]] name = "client" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arc-swap", @@ -1797,7 +1797,7 @@ dependencies = [ "rand", "serde_json", "snafu 0.8.5", - "substrait 0.11.0", + "substrait 0.12.0", "substrait 0.37.3", "tokio", "tokio-stream", @@ -1838,7 +1838,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "auth", @@ -1898,7 +1898,7 @@ dependencies = [ "similar-asserts", "snafu 0.8.5", "store-api", - "substrait 0.11.0", + "substrait 0.12.0", "table", "temp-env", "tempfile", @@ -1944,7 +1944,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.11.0" +version = "0.12.0" dependencies = [ "anymap2", "async-trait", @@ -1965,7 +1965,7 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.11.0" +version = "0.12.0" dependencies = [ "chrono", "common-error", @@ -1976,7 +1976,7 @@ dependencies = [ [[package]] name = "common-config" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-base", "common-error", @@ -1999,7 +1999,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arrow", "arrow-schema", @@ -2036,7 +2036,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "0.11.0" +version = "0.12.0" dependencies = [ "bigdecimal 0.4.5", "common-error", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.11.0" +version = "0.12.0" dependencies = [ "snafu 0.8.5", "strum 0.25.0", @@ -2058,7 +2058,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -2073,7 +2073,7 @@ dependencies = [ [[package]] name = "common-function" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "approx 0.5.1", @@ -2118,7 +2118,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.11.0" 
+version = "0.12.0" dependencies = [ "async-trait", "common-runtime", @@ -2135,7 +2135,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arrow-flight", @@ -2161,7 +2161,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "common-base", @@ -2180,7 +2180,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arc-swap", "common-query", @@ -2194,7 +2194,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-error", "common-macro", @@ -2207,7 +2207,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "0.11.0" +version = "0.12.0" dependencies = [ "anymap2", "api", @@ -2264,7 +2264,7 @@ dependencies = [ [[package]] name = "common-options" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-grpc", "humantime-serde", @@ -2273,11 +2273,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "0.11.0" +version = "0.12.0" [[package]] name = "common-pprof" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-error", "common-macro", @@ -2289,7 +2289,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-stream", "async-trait", @@ -2316,7 +2316,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "common-procedure", @@ -2324,7 +2324,7 @@ dependencies = [ [[package]] name = "common-query" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -2350,7 +2350,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arc-swap", "common-error", @@ -2369,7 +2369,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "clap 4.5.19", @@ -2399,7 +2399,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "0.11.0" +version = "0.12.0" dependencies = [ "atty", "backtrace", @@ -2427,7 +2427,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.11.0" +version = "0.12.0" dependencies = [ "client", "common-query", @@ -2439,7 +2439,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arrow", "chrono", @@ -2457,7 +2457,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.11.0" +version = "0.12.0" dependencies = [ "build-data", "const_format", @@ -2467,7 +2467,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "0.11.0" +version = "0.12.0" dependencies = [ "common-base", "common-error", @@ -3276,7 +3276,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arrow-flight", @@ -3327,7 +3327,7 @@ dependencies = [ "session", "snafu 0.8.5", "store-api", - "substrait 0.11.0", + "substrait 0.12.0", "table", "tokio", "toml 0.8.19", @@ -3336,7 +3336,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arrow", "arrow-array", @@ -3954,7 +3954,7 @@ dependencies = [ [[package]] name = "file-engine" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -4071,7 +4071,7 @@ checksum = 
"8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arrow", @@ -4128,7 +4128,7 @@ dependencies = [ "snafu 0.8.5", "store-api", "strum 0.25.0", - "substrait 0.11.0", + "substrait 0.12.0", "table", "tokio", "tonic 0.11.0", @@ -4175,7 +4175,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "frontend" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arc-swap", @@ -5315,7 +5315,7 @@ dependencies = [ [[package]] name = "index" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "asynchronous-codec", @@ -6150,7 +6150,7 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "log-query" -version = "0.11.0" +version = "0.12.0" dependencies = [ "chrono", "common-error", @@ -6161,7 +6161,7 @@ dependencies = [ [[package]] name = "log-store" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-stream", "async-trait", @@ -6482,7 +6482,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -6509,7 +6509,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -6588,7 +6588,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "aquamarine", @@ -6692,7 +6692,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "aquamarine", @@ -7404,7 +7404,7 @@ dependencies = [ [[package]] name = "object-store" -version = "0.11.0" +version = "0.12.0" dependencies = [ "anyhow", "bytes", @@ -7657,7 +7657,7 @@ dependencies = [ [[package]] name = "operator" -version = "0.11.0" +version = "0.12.0" dependencies = [ "ahash 0.8.11", "api", @@ -7705,7 +7705,7 @@ dependencies = [ "sql", "sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)", "store-api", - "substrait 0.11.0", + "substrait 0.12.0", "table", "tokio", "tokio-util", @@ -7955,7 +7955,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -8241,7 +8241,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "0.11.0" +version = "0.12.0" dependencies = [ "ahash 0.8.11", "api", @@ -8404,7 +8404,7 @@ dependencies = [ [[package]] name = "plugins" -version = "0.11.0" +version = "0.12.0" dependencies = [ "auth", "clap 4.5.19", @@ -8681,7 +8681,7 @@ dependencies = [ [[package]] name = "promql" -version = "0.11.0" +version = "0.12.0" dependencies = [ "ahash 0.8.11", "async-trait", @@ -8919,7 +8919,7 @@ dependencies = [ [[package]] name = "puffin" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-compression 0.4.13", "async-trait", @@ -9043,7 +9043,7 @@ dependencies = [ [[package]] name = "query" -version = "0.11.0" +version = "0.12.0" dependencies = [ "ahash 0.8.11", "api", @@ -9110,7 +9110,7 @@ dependencies = [ "stats-cli", "store-api", "streaming-stats", - "substrait 0.11.0", + "substrait 0.12.0", "table", "tokio", "tokio-stream", @@ -10572,7 +10572,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "script" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", 
"arc-swap", @@ -10866,7 +10866,7 @@ dependencies = [ [[package]] name = "servers" -version = "0.11.0" +version = "0.12.0" dependencies = [ "ahash 0.8.11", "api", @@ -10979,7 +10979,7 @@ dependencies = [ [[package]] name = "session" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arc-swap", @@ -11327,7 +11327,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "chrono", @@ -11390,7 +11390,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "clap 4.5.19", @@ -11610,7 +11610,7 @@ dependencies = [ [[package]] name = "store-api" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "aquamarine", @@ -11781,7 +11781,7 @@ dependencies = [ [[package]] name = "substrait" -version = "0.11.0" +version = "0.12.0" dependencies = [ "async-trait", "bytes", @@ -11980,7 +11980,7 @@ dependencies = [ [[package]] name = "table" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "async-trait", @@ -12246,7 +12246,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "tests-fuzz" -version = "0.11.0" +version = "0.12.0" dependencies = [ "arbitrary", "async-trait", @@ -12288,7 +12288,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.11.0" +version = "0.12.0" dependencies = [ "api", "arrow-flight", @@ -12352,7 +12352,7 @@ dependencies = [ "sql", "sqlx", "store-api", - "substrait 0.11.0", + "substrait 0.12.0", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index 4cc07cd89818..d1d360850e70 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.11.0" +version = "0.12.0" edition = "2021" license = "Apache-2.0" From 3d1b8c4fac9bf2252213a20eceb8c95104f22dd2 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:56:48 +0800 Subject: [PATCH 19/36] chore: add `/ready` api for health checking (#5124) * chore: add ready endpoint for health checking * chore: add test --- src/servers/src/http.rs | 13 +++++++++---- tests-integration/tests/http.rs | 33 +++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index c719e02cac35..d8d07ed31fa0 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -638,10 +638,15 @@ impl HttpServer { router.clone() }; - router = router.route( - "/health", - routing::get(handler::health).post(handler::health), - ); + router = router + .route( + "/health", + routing::get(handler::health).post(handler::health), + ) + .route( + "/ready", + routing::get(handler::health).post(handler::health), + ); router = router.route("/status", routing::get(handler::status)); diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 083a9daa1a26..9d7b81f3919b 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -757,21 +757,26 @@ pub async fn test_health_api(store_type: StorageType) { let (app, _guard) = setup_test_http_app_with_frontend(store_type, "health_api").await; let client = TestClient::new(app); - // we can call health api with both `GET` and `POST` method. 
- let res_post = client.post("/health").send().await; - assert_eq!(res_post.status(), StatusCode::OK); - let res_get = client.get("/health").send().await; - assert_eq!(res_get.status(), StatusCode::OK); - - // both `GET` and `POST` method return same result - let body_text = res_post.text().await; - assert_eq!(body_text, res_get.text().await); - - // currently health api simply returns an empty json `{}`, which can be deserialized to an empty `HealthResponse` - assert_eq!(body_text, "{}"); + async fn health_api(client: &TestClient, endpoint: &str) { + // we can call health api with both `GET` and `POST` method. + let res_post = client.post(endpoint).send().await; + assert_eq!(res_post.status(), StatusCode::OK); + let res_get = client.get(endpoint).send().await; + assert_eq!(res_get.status(), StatusCode::OK); + + // both `GET` and `POST` method return same result + let body_text = res_post.text().await; + assert_eq!(body_text, res_get.text().await); + + // currently health api simply returns an empty json `{}`, which can be deserialized to an empty `HealthResponse` + assert_eq!(body_text, "{}"); + + let body = serde_json::from_str::(&body_text).unwrap(); + assert_eq!(body, HealthResponse {}); + } - let body = serde_json::from_str::(&body_text).unwrap(); - assert_eq!(body, HealthResponse {}); + health_api(&client, "/health").await; + health_api(&client, "/ready").await; } pub async fn test_status_api(store_type: StorageType) { From d91517688ab4ad010f71cae01a50de883c7967bb Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:02:03 +0800 Subject: [PATCH 20/36] chore: fix aws_lc not in depend tree check in CI (#5121) * chore: fix aws_lc check in CI * chore: update lock file --- .github/cargo-blacklist.txt | 1 + .github/workflows/develop.yml | 7 ---- Cargo.lock | 61 ++--------------------------------- src/servers/Cargo.toml | 5 --- 4 files changed, 3 insertions(+), 71 deletions(-) diff --git a/.github/cargo-blacklist.txt b/.github/cargo-blacklist.txt index 32e7878a86db..d2f071130ee9 100644 --- a/.github/cargo-blacklist.txt +++ b/.github/cargo-blacklist.txt @@ -1,2 +1,3 @@ native-tls openssl +aws-lc-sys diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 80a4f042c0f7..6eccbe65b811 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -269,13 +269,6 @@ jobs: - name: Install cargo-gc-bin shell: bash run: cargo install cargo-gc-bin - - name: Check aws-lc-sys will not build - shell: bash - run: | - if cargo tree -i aws-lc-sys -e features | grep -q aws-lc-sys; then - echo "Found aws-lc-sys, which has compilation problems on older gcc versions. Please replace it with ring until its building experience improves." 
- exit 1 - fi - name: Build greptime bianry shell: bash # `cargo gc` will invoke `cargo build` with specified args diff --git a/Cargo.lock b/Cargo.lock index 177625a65955..628c6a582418 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -798,21 +798,6 @@ dependencies = [ "cc", ] -[[package]] -name = "aws-lc-sys" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" -dependencies = [ - "bindgen 0.69.4", - "cc", - "cmake", - "dunce", - "fs_extra", - "libc", - "paste", -] - [[package]] name = "axum" version = "0.6.20" @@ -993,29 +978,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.69.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" -dependencies = [ - "bitflags 2.6.0", - "cexpr", - "clang-sys", - "itertools 0.12.1", - "lazy_static", - "lazycell", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash 1.1.0", - "shlex", - "syn 2.0.90", - "which", -] - [[package]] name = "bindgen" version = "0.70.1" @@ -3719,12 +3681,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "duration-str" version = "0.11.2" @@ -4314,12 +4270,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "fs_extra" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -5955,12 +5905,6 @@ dependencies = [ "spin 0.9.8", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -6966,7 +6910,7 @@ checksum = "06f19e4cfa0ab5a76b627cec2d81331c49b034988eaf302c3bafeada684eadef" dependencies = [ "base64 0.21.7", "bigdecimal 0.4.5", - "bindgen 0.70.1", + "bindgen", "bitflags 2.6.0", "bitvec", "btoi", @@ -7005,7 +6949,7 @@ checksum = "478b0ff3f7d67b79da2b96f56f334431aef65e15ba4b29dd74a4236e29582bdc" dependencies = [ "base64 0.21.7", "bigdecimal 0.4.5", - "bindgen 0.70.1", + "bindgen", "bitflags 2.6.0", "bitvec", "btoi", @@ -10876,7 +10820,6 @@ dependencies = [ "arrow-schema", "async-trait", "auth", - "aws-lc-sys", "axum", "axum-macros", "base64 0.21.7", diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index c01560724931..ddfeaf27bd45 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -139,11 +139,6 @@ tokio-test = "0.4" [target.'cfg(unix)'.dev-dependencies] pprof = { version = "0.13", features = ["criterion", "flamegraph"] } -[target.'cfg(windows)'.dependencies] -aws-lc-sys = { version = "0.21.0", features = [ - "prebuilt-nasm", -] } # use prebuilt nasm on windows per https://github.com/aws/aws-lc-rs/blob/main/aws-lc-sys/README.md#use-of-prebuilt-nasm-objects - [build-dependencies] common-version.workspace = true From a6893aad421fcc242a67b859a7aa21624bbe8a17 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Wed, 11 Dec 2024 16:04:02 +0800 Subject: [PATCH 21/36] 
chore: set store_key_prefix for all kvbackend (#5132) --- src/meta-srv/src/bootstrap.rs | 44 +++++++++++++++++------------------ src/meta-srv/src/metasrv.rs | 4 ++++ 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 47afa0ab416b..85770e1f3d4d 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -206,43 +206,41 @@ pub async fn metasrv_builder( plugins: Plugins, kv_backend: Option, ) -> Result { - let (kv_backend, election) = match (kv_backend, &opts.backend) { + let (mut kv_backend, election) = match (kv_backend, &opts.backend) { (Some(kv_backend), _) => (kv_backend, None), (None, BackendImpl::MemoryStore) => (Arc::new(MemoryKvBackend::new()) as _, None), (None, BackendImpl::EtcdStore) => { let etcd_client = create_etcd_client(opts).await?; - let kv_backend = { - let etcd_backend = - EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops); - if !opts.store_key_prefix.is_empty() { - Arc::new(ChrootKvBackend::new( - opts.store_key_prefix.clone().into_bytes(), - etcd_backend, - )) - } else { - etcd_backend - } - }; - ( - kv_backend, - Some( - EtcdElection::with_etcd_client( - &opts.server_addr, - etcd_client.clone(), - opts.store_key_prefix.clone(), - ) - .await?, - ), + let kv_backend = EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops); + let election = EtcdElection::with_etcd_client( + &opts.server_addr, + etcd_client, + opts.store_key_prefix.clone(), ) + .await?; + + (kv_backend, Some(election)) } #[cfg(feature = "pg_kvbackend")] (None, BackendImpl::PostgresStore) => { let pg_client = create_postgres_client(opts).await?; let kv_backend = PgStore::with_pg_client(pg_client).await.unwrap(); + // TODO(jeremy, weny): implement election for postgres (kv_backend, None) } }; + if !opts.store_key_prefix.is_empty() { + info!( + "using chroot kv backend with prefix: {prefix}", + prefix = opts.store_key_prefix + ); + kv_backend = Arc::new(ChrootKvBackend::new( + opts.store_key_prefix.clone().into_bytes(), + kv_backend, + )) + } + let in_memory = Arc::new(MemoryKvBackend::new()) as ResettableKvBackendRef; let selector = match opts.selector { diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 716b85f83485..da614ac9b943 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -470,6 +470,10 @@ impl Metasrv { }); } } else { + warn!( + "Ensure only one instance of Metasrv is running, as there is no election service." 
+ ); + if let Err(e) = self.wal_options_allocator.start().await { error!(e; "Failed to start wal options allocator"); } From 2c4ac76754265762c884958cc0057170ed97081a Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Wed, 11 Dec 2024 16:08:05 +0800 Subject: [PATCH 22/36] feat: adjust WAL purge default configurations (#5107) * feat: adjust WAL purge default configurations * fix: config * feat: change raft engine file_size default to 128Mib --- config/config.md | 22 +++++++++++----------- config/datanode.example.toml | 11 +++++------ config/metasrv.example.toml | 16 ++++++++-------- config/standalone.example.toml | 12 ++++++------ src/common/wal/src/config/raft_engine.rs | 6 +++--- tests-integration/tests/http.rs | 6 +++--- 6 files changed, 36 insertions(+), 37 deletions(-) diff --git a/config/config.md b/config/config.md index 0f70a8cb126e..6a500a5b4a34 100644 --- a/config/config.md +++ b/config/config.md @@ -13,11 +13,11 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. | -| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. | | `default_timezone` | String | Unset | The default timezone of the server. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | | `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. | +| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. | @@ -61,9 +61,9 @@ | `wal` | -- | -- | The WAL options. | | `wal.provider` | String | `raft_engine` | The provider of the WAL.
- `raft_engine`: the wal is stored in the local file system by raft-engine.
- `kafka`: it's remote wal that data is stored in Kafka. | | `wal.dir` | String | Unset | The directory to store the WAL files.
**It's only used when the provider is `raft_engine`**. | -| `wal.file_size` | String | `256MB` | The size of the WAL segment file.
**It's only used when the provider is `raft_engine`**. | -| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.
**It's only used when the provider is `raft_engine`**. | -| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.
**It's only used when the provider is `raft_engine`**. | +| `wal.file_size` | String | `128MB` | The size of the WAL segment file.
**It's only used when the provider is `raft_engine`**. | +| `wal.purge_threshold` | String | `1GB` | The threshold of the WAL size to trigger a flush.
**It's only used when the provider is `raft_engine`**. | +| `wal.purge_interval` | String | `1m` | The interval to trigger a flush.
**It's only used when the provider is `raft_engine`**. | | `wal.read_batch_size` | Integer | `128` | The read batch size.
**It's only used when the provider is `raft_engine`**. | | `wal.sync_write` | Bool | `false` | Whether to use sync write.
**It's only used when the provider is `raft_engine`**. | | `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.
**It's only used when the provider is `raft_engine`**. | @@ -286,12 +286,12 @@ | `bind_addr` | String | `127.0.0.1:3002` | The bind address of metasrv. | | `server_addr` | String | `127.0.0.1:3002` | The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost. | | `store_addrs` | Array | -- | Store server address default to etcd store. | +| `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. | +| `backend` | String | `EtcdStore` | The datastore for meta server. | | `selector` | String | `round_robin` | Datanode selector type.
- `round_robin` (default value)
- `lease_based`
- `load_based`
For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". | | `use_memory_store` | Bool | `false` | Store data in memory. | -| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. | -| `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. | | `enable_region_failover` | Bool | `false` | Whether to enable region failover.
This feature is only available on GreptimeDB running on cluster mode and
- Using Remote WAL
- Using shared storage (e.g., s3). | -| `backend` | String | `EtcdStore` | The datastore for meta server. | +| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. | | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. | @@ -356,7 +356,6 @@ | `node_id` | Integer | Unset | The datanode identifier and should be unique in the cluster. | | `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.
It will block the datanode start if it can't receive leases in the heartbeat from metasrv. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | -| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | | `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. | | `rpc_addr` | String | Unset | Deprecated, use `grpc.addr` instead. | @@ -364,6 +363,7 @@ | `rpc_runtime_size` | Integer | Unset | Deprecated, use `grpc.runtime_size` instead. | | `rpc_max_recv_message_size` | String | Unset | Deprecated, use `grpc.rpc_max_recv_message_size` instead. | | `rpc_max_send_message_size` | String | Unset | Deprecated, use `grpc.rpc_max_send_message_size` instead. | +| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `http` | -- | -- | The HTTP server options. | | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. | | `http.timeout` | String | `30s` | HTTP request timeout. Set to 0 to disable timeout. | @@ -398,9 +398,9 @@ | `wal` | -- | -- | The WAL options. | | `wal.provider` | String | `raft_engine` | The provider of the WAL.
- `raft_engine`: the wal is stored in the local file system by raft-engine.
- `kafka`: it's remote wal that data is stored in Kafka. | | `wal.dir` | String | Unset | The directory to store the WAL files.
**It's only used when the provider is `raft_engine`**. | -| `wal.file_size` | String | `256MB` | The size of the WAL segment file.
**It's only used when the provider is `raft_engine`**. | -| `wal.purge_threshold` | String | `4GB` | The threshold of the WAL size to trigger a flush.
**It's only used when the provider is `raft_engine`**. | -| `wal.purge_interval` | String | `10m` | The interval to trigger a flush.
**It's only used when the provider is `raft_engine`**. | +| `wal.file_size` | String | `128MB` | The size of the WAL segment file.
**It's only used when the provider is `raft_engine`**. | +| `wal.purge_threshold` | String | `1GB` | The threshold of the WAL size to trigger a flush.
**It's only used when the provider is `raft_engine`**. | +| `wal.purge_interval` | String | `1m` | The interval to trigger a flush.
**It's only used when the provider is `raft_engine`**. | | `wal.read_batch_size` | Integer | `128` | The read batch size.
**It's only used when the provider is `raft_engine`**. | | `wal.sync_write` | Bool | `false` | Whether to use sync write.
**It's only used when the provider is `raft_engine`**. | | `wal.enable_log_recycle` | Bool | `true` | Whether to reuse logically truncated log files.
**It's only used when the provider is `raft_engine`**. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 8bfa8732cc41..0ba80a9f7d92 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -13,9 +13,6 @@ require_lease_before_startup = false ## By default, it provides services after all regions have been initialized. init_regions_in_background = false -## Enable telemetry to collect anonymous usage data. -enable_telemetry = true - ## Parallelism of initializing regions. init_regions_parallelism = 16 @@ -42,6 +39,8 @@ rpc_max_recv_message_size = "512MB" ## @toml2docs:none-default rpc_max_send_message_size = "512MB" +## Enable telemetry to collect anonymous usage data. Enabled by default. +#+ enable_telemetry = true ## The HTTP server options. [http] @@ -143,15 +142,15 @@ dir = "/tmp/greptimedb/wal" ## The size of the WAL segment file. ## **It's only used when the provider is `raft_engine`**. -file_size = "256MB" +file_size = "128MB" ## The threshold of the WAL size to trigger a flush. ## **It's only used when the provider is `raft_engine`**. -purge_threshold = "4GB" +purge_threshold = "1GB" ## The interval to trigger a flush. ## **It's only used when the provider is `raft_engine`**. -purge_interval = "10m" +purge_interval = "1m" ## The read batch size. ## **It's only used when the provider is `raft_engine`**. diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index bcd7ee41412b..27716b5aa37b 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -10,6 +10,12 @@ server_addr = "127.0.0.1:3002" ## Store server address default to etcd store. store_addrs = ["127.0.0.1:2379"] +## If it's not empty, the metasrv will store all data with this key prefix. +store_key_prefix = "" + +## The datastore for meta server. +backend = "EtcdStore" + ## Datanode selector type. ## - `round_robin` (default value) ## - `lease_based` @@ -20,20 +26,14 @@ selector = "round_robin" ## Store data in memory. use_memory_store = false -## Whether to enable greptimedb telemetry. -enable_telemetry = true - -## If it's not empty, the metasrv will store all data with this key prefix. -store_key_prefix = "" - ## Whether to enable region failover. ## This feature is only available on GreptimeDB running on cluster mode and ## - Using Remote WAL ## - Using shared storage (e.g., s3). enable_region_failover = false -## The datastore for meta server. -backend = "EtcdStore" +## Whether to enable greptimedb telemetry. Enabled by default. +#+ enable_telemetry = true ## The runtime options. #+ [runtime] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 56cbeaddb9d5..8eae532d6166 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -1,9 +1,6 @@ ## The running mode of the datanode. It can be `standalone` or `distributed`. mode = "standalone" -## Enable telemetry to collect anonymous usage data. -enable_telemetry = true - ## The default timezone of the server. ## @toml2docs:none-default default_timezone = "UTC" @@ -18,6 +15,9 @@ init_regions_parallelism = 16 ## The maximum current queries allowed to be executed. Zero means unlimited. max_concurrent_queries = 0 +## Enable telemetry to collect anonymous usage data. Enabled by default. +#+ enable_telemetry = true + ## The runtime options. #+ [runtime] ## The number of threads to execute the runtime for global read operations. @@ -147,15 +147,15 @@ dir = "/tmp/greptimedb/wal" ## The size of the WAL segment file. 
## **It's only used when the provider is `raft_engine`**. -file_size = "256MB" +file_size = "128MB" ## The threshold of the WAL size to trigger a flush. ## **It's only used when the provider is `raft_engine`**. -purge_threshold = "4GB" +purge_threshold = "1GB" ## The interval to trigger a flush. ## **It's only used when the provider is `raft_engine`**. -purge_interval = "10m" +purge_interval = "1m" ## The read batch size. ## **It's only used when the provider is `raft_engine`**. diff --git a/src/common/wal/src/config/raft_engine.rs b/src/common/wal/src/config/raft_engine.rs index af5daa9d386d..cfefd0c758b7 100644 --- a/src/common/wal/src/config/raft_engine.rs +++ b/src/common/wal/src/config/raft_engine.rs @@ -49,9 +49,9 @@ impl Default for RaftEngineConfig { fn default() -> Self { Self { dir: None, - file_size: ReadableSize::mb(256), - purge_threshold: ReadableSize::gb(4), - purge_interval: Duration::from_secs(600), + file_size: ReadableSize::mb(128), + purge_threshold: ReadableSize::gb(1), + purge_interval: Duration::from_secs(60), read_batch_size: 128, sync_write: false, enable_log_recycle: true, diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 9d7b81f3919b..4da65f0b21f5 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -886,9 +886,9 @@ with_metric_engine = true [wal] provider = "raft_engine" -file_size = "256MiB" -purge_threshold = "4GiB" -purge_interval = "10m" +file_size = "128MiB" +purge_threshold = "1GiB" +purge_interval = "1m" read_batch_size = 128 sync_write = false enable_log_recycle = true From a30d918df2bf4b57e65b59d3eef26765ad96f6b6 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:24:41 +0800 Subject: [PATCH 23/36] perf: avoid cache during compaction (#5135) * Revert "refactor: Avoid wrapping Option for CacheManagerRef (#4996)" This reverts commit 42bf7e99655bf842a08c657d1d601c0a8a9f41f2. 
* fix: memory usage during log ingestion * fix: fmt --- src/mito2/src/cache/write_cache.rs | 2 +- src/mito2/src/compaction.rs | 3 +- src/mito2/src/compaction/compactor.rs | 1 - src/mito2/src/engine.rs | 16 ++++++---- src/mito2/src/read/last_row.rs | 24 ++++++++++---- src/mito2/src/read/projection.rs | 24 +++++++------- src/mito2/src/read/range.rs | 9 +++--- src/mito2/src/read/scan_region.rs | 19 +++++++---- src/mito2/src/read/seq_scan.rs | 2 +- src/mito2/src/read/unordered_scan.rs | 2 +- src/mito2/src/sst/parquet.rs | 10 +++--- src/mito2/src/sst/parquet/reader.rs | 32 +++++++++---------- src/mito2/src/sst/parquet/row_group.rs | 44 +++++++++++++++----------- 13 files changed, 108 insertions(+), 80 deletions(-) diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index 4e2fe357fd09..8a431f22a63d 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -501,7 +501,7 @@ mod tests { // Read metadata from write cache let builder = ParquetReaderBuilder::new(data_home, handle.clone(), mock_store.clone()) - .cache(cache_manager.clone()); + .cache(Some(cache_manager.clone())); let reader = builder.build().await.unwrap(); // Check parquet metadata diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 5f462f33a111..2b70f455d815 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -570,7 +570,6 @@ pub struct SerializedCompactionOutput { struct CompactionSstReaderBuilder<'a> { metadata: RegionMetadataRef, sst_layer: AccessLayerRef, - cache: CacheManagerRef, inputs: &'a [FileHandle], append_mode: bool, filter_deleted: bool, @@ -584,7 +583,7 @@ impl<'a> CompactionSstReaderBuilder<'a> { let mut scan_input = ScanInput::new(self.sst_layer, ProjectionMapper::all(&self.metadata)?) .with_files(self.inputs.to_vec()) .with_append_mode(self.append_mode) - .with_cache(self.cache) + .with_cache(None) .with_filter_deleted(self.filter_deleted) // We ignore file not found error during compaction. .with_ignore_file_not_found(true) diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 792634b2e4a2..91ab34c961cf 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -296,7 +296,6 @@ impl Compactor for DefaultCompactor { let reader = CompactionSstReaderBuilder { metadata: region_metadata.clone(), sst_layer: sst_layer.clone(), - cache: cache_manager.clone(), inputs: &output.inputs, append_mode, filter_deleted: output.filter_deleted, diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index a518da32535d..9b912318e16b 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -424,12 +424,16 @@ impl EngineInner { // Get cache. 
let cache_manager = self.workers.cache_manager(); - let scan_region = - ScanRegion::new(version, region.access_layer.clone(), request, cache_manager) - .with_parallel_scan_channel_size(self.config.parallel_scan_channel_size) - .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) - .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) - .with_start_time(query_start); + let scan_region = ScanRegion::new( + version, + region.access_layer.clone(), + request, + Some(cache_manager), + ) + .with_parallel_scan_channel_size(self.config.parallel_scan_channel_size) + .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) + .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) + .with_start_time(query_start); Ok(scan_region) } diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index ee775a8ec2ba..79d035e03271 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -86,7 +86,7 @@ impl RowGroupLastRowCachedReader { pub(crate) fn new( file_id: FileId, row_group_idx: usize, - cache_manager: CacheManagerRef, + cache_manager: Option, row_group_reader: RowGroupReader, ) -> Self { let key = SelectorResultKey { @@ -95,6 +95,9 @@ impl RowGroupLastRowCachedReader { selector: TimeSeriesRowSelector::LastRow, }; + let Some(cache_manager) = cache_manager else { + return Self::new_miss(key, row_group_reader, None); + }; if let Some(value) = cache_manager.get_selector_result(&key) { let schema_matches = value.projection == row_group_reader @@ -105,10 +108,10 @@ impl RowGroupLastRowCachedReader { // Schema matches, use cache batches. Self::new_hit(value) } else { - Self::new_miss(key, row_group_reader, cache_manager) + Self::new_miss(key, row_group_reader, Some(cache_manager)) } } else { - Self::new_miss(key, row_group_reader, cache_manager) + Self::new_miss(key, row_group_reader, Some(cache_manager)) } } @@ -122,7 +125,7 @@ impl RowGroupLastRowCachedReader { fn new_miss( key: SelectorResultKey, row_group_reader: RowGroupReader, - cache_manager: CacheManagerRef, + cache_manager: Option, ) -> Self { selector_result_cache_miss(); Self::Miss(RowGroupLastRowReader::new( @@ -167,13 +170,17 @@ pub(crate) struct RowGroupLastRowReader { reader: RowGroupReader, selector: LastRowSelector, yielded_batches: Vec, - cache_manager: CacheManagerRef, + cache_manager: Option, /// Index buffer to take a new batch from the last row. take_index: UInt32Vector, } impl RowGroupLastRowReader { - fn new(key: SelectorResultKey, reader: RowGroupReader, cache_manager: CacheManagerRef) -> Self { + fn new( + key: SelectorResultKey, + reader: RowGroupReader, + cache_manager: Option, + ) -> Self { Self { key, reader, @@ -213,6 +220,9 @@ impl RowGroupLastRowReader { // we always expect that row groups yields batches. 
return; } + let Some(cache) = &self.cache_manager else { + return; + }; let value = Arc::new(SelectorResultValue { result: std::mem::take(&mut self.yielded_batches), projection: self @@ -222,7 +232,7 @@ impl RowGroupLastRowReader { .projection_indices() .to_vec(), }); - self.cache_manager.put_selector_result(self.key, value); + cache.put_selector_result(self.key, value); } } diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index 78866f0c1ba0..9ba5f6eccf1e 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -171,7 +171,7 @@ impl ProjectionMapper { pub(crate) fn convert( &self, batch: &Batch, - cache_manager: &CacheManager, + cache_manager: Option<&CacheManager>, ) -> common_recordbatch::error::Result { debug_assert_eq!(self.batch_fields.len(), batch.fields().len()); debug_assert!(self @@ -204,12 +204,15 @@ impl ProjectionMapper { match index { BatchIndex::Tag(idx) => { let value = &pk_values[*idx]; - let vector = repeated_vector_with_cache( - &column_schema.data_type, - value, - num_rows, - cache_manager, - )?; + let vector = match cache_manager { + Some(cache) => repeated_vector_with_cache( + &column_schema.data_type, + value, + num_rows, + cache, + )?, + None => new_repeated_vector(&column_schema.data_type, value, num_rows)?, + }; columns.push(vector); } BatchIndex::Timestamp => { @@ -357,7 +360,7 @@ mod tests { // With vector cache. let cache = CacheManager::builder().vector_cache_size(1024).build(); let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3); - let record_batch = mapper.convert(&batch, &cache).unwrap(); + let record_batch = mapper.convert(&batch, Some(&cache)).unwrap(); let expect = "\ +---------------------+----+----+----+----+ | ts | k0 | k1 | v0 | v1 | @@ -377,7 +380,7 @@ mod tests { assert!(cache .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(3)) .is_none()); - let record_batch = mapper.convert(&batch, &cache).unwrap(); + let record_batch = mapper.convert(&batch, Some(&cache)).unwrap(); assert_eq!(expect, print_record_batch(record_batch)); } @@ -398,8 +401,7 @@ mod tests { ); let batch = new_batch(0, &[1, 2], &[(4, 4)], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let record_batch = mapper.convert(&batch, &cache).unwrap(); + let record_batch = mapper.convert(&batch, None).unwrap(); let expect = "\ +----+----+ | v1 | k0 | diff --git a/src/mito2/src/read/range.rs b/src/mito2/src/read/range.rs index 554751830ffc..bdad5f8fef0c 100644 --- a/src/mito2/src/read/range.rs +++ b/src/mito2/src/read/range.rs @@ -112,7 +112,7 @@ impl RangeMeta { Self::push_unordered_file_ranges( input.memtables.len(), &input.files, - &input.cache_manager, + input.cache_manager.as_deref(), &mut ranges, ); @@ -203,15 +203,16 @@ impl RangeMeta { fn push_unordered_file_ranges( num_memtables: usize, files: &[FileHandle], - cache: &CacheManager, + cache: Option<&CacheManager>, ranges: &mut Vec, ) { // For append mode, we can parallelize reading row groups. for (i, file) in files.iter().enumerate() { let file_index = num_memtables + i; // Get parquet meta from the cache. - let parquet_meta = - cache.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id()); + let parquet_meta = cache.and_then(|c| { + c.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id()) + }); if let Some(parquet_meta) = parquet_meta { // Scans each row group. 
for row_group_index in 0..file.meta_ref().num_row_groups { diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 471cc1a8e5d4..19324f119f3e 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -167,7 +167,7 @@ pub(crate) struct ScanRegion { /// Scan request. request: ScanRequest, /// Cache. - cache_manager: CacheManagerRef, + cache_manager: Option, /// Capacity of the channel to send data from parallel scan tasks to the main task. parallel_scan_channel_size: usize, /// Whether to ignore inverted index. @@ -184,7 +184,7 @@ impl ScanRegion { version: VersionRef, access_layer: AccessLayerRef, request: ScanRequest, - cache_manager: CacheManagerRef, + cache_manager: Option, ) -> ScanRegion { ScanRegion { version, @@ -401,12 +401,17 @@ impl ScanRegion { } let file_cache = || -> Option { - let write_cache = self.cache_manager.write_cache()?; + let cache_manager = self.cache_manager.as_ref()?; + let write_cache = cache_manager.write_cache()?; let file_cache = write_cache.file_cache(); Some(file_cache) }(); - let index_cache = self.cache_manager.index_cache().cloned(); + let index_cache = self + .cache_manager + .as_ref() + .and_then(|c| c.index_cache()) + .cloned(); InvertedIndexApplierBuilder::new( self.access_layer.region_dir().to_string(), @@ -477,7 +482,7 @@ pub(crate) struct ScanInput { /// Handles to SST files to scan. pub(crate) files: Vec, /// Cache. - pub(crate) cache_manager: CacheManagerRef, + pub(crate) cache_manager: Option, /// Ignores file not found error. ignore_file_not_found: bool, /// Capacity of the channel to send data from parallel scan tasks to the main task. @@ -508,7 +513,7 @@ impl ScanInput { predicate: None, memtables: Vec::new(), files: Vec::new(), - cache_manager: CacheManagerRef::default(), + cache_manager: None, ignore_file_not_found: false, parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, inverted_index_applier: None, @@ -551,7 +556,7 @@ impl ScanInput { /// Sets cache for this query. #[must_use] - pub(crate) fn with_cache(mut self, cache: CacheManagerRef) -> Self { + pub(crate) fn with_cache(mut self, cache: Option) -> Self { self.cache_manager = cache; self } diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index d8732cb93df2..bdf3a7d6b8bb 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -257,7 +257,7 @@ impl SeqScan { .await .map_err(BoxedError::new) .context(ExternalSnafu)?; - let cache = &stream_ctx.input.cache_manager; + let cache = stream_ctx.input.cache_manager.as_deref(); let mut metrics = ScannerMetrics::default(); let mut fetch_start = Instant::now(); #[cfg(debug_assertions)] diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 97db9b86592c..60e5ca5c7cdb 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -149,7 +149,7 @@ impl UnorderedScan { let stream = try_stream! { part_metrics.on_first_poll(); - let cache = &stream_ctx.input.cache_manager; + let cache = stream_ctx.input.cache_manager.as_deref(); let range_builder_list = Arc::new(RangeBuilderList::new( stream_ctx.input.num_memtables(), stream_ctx.input.num_files(), diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index c94ae600735f..ae51a0d37c29 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -195,11 +195,11 @@ mod tests { .unwrap(); // Enable page cache. 
- let cache = Arc::new( + let cache = Some(Arc::new( CacheManager::builder() .page_cache_size(64 * 1024 * 1024) .build(), - ); + )); let builder = ParquetReaderBuilder::new(FILE_DIR.to_string(), handle.clone(), object_store) .cache(cache.clone()); for _ in 0..3 { @@ -219,15 +219,15 @@ mod tests { // Doesn't have compressed page cached. let page_key = PageKey::new_compressed(metadata.region_id, handle.file_id(), 0, 0); - assert!(cache.get_pages(&page_key).is_none()); + assert!(cache.as_ref().unwrap().get_pages(&page_key).is_none()); // Cache 4 row groups. for i in 0..4 { let page_key = PageKey::new_uncompressed(metadata.region_id, handle.file_id(), i, 0); - assert!(cache.get_pages(&page_key).is_some()); + assert!(cache.as_ref().unwrap().get_pages(&page_key).is_some()); } let page_key = PageKey::new_uncompressed(metadata.region_id, handle.file_id(), 5, 0); - assert!(cache.get_pages(&page_key).is_none()); + assert!(cache.as_ref().unwrap().get_pages(&page_key).is_none()); } #[tokio::test] diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index cd219f47ccd6..b73026a7a6e3 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -82,7 +82,7 @@ pub struct ParquetReaderBuilder { /// can contain columns not in the parquet file. projection: Option>, /// Manager that caches SST data. - cache_manager: CacheManagerRef, + cache_manager: Option, /// Index appliers. inverted_index_applier: Option, fulltext_index_applier: Option, @@ -106,7 +106,7 @@ impl ParquetReaderBuilder { predicate: None, time_range: None, projection: None, - cache_manager: CacheManagerRef::default(), + cache_manager: None, inverted_index_applier: None, fulltext_index_applier: None, expected_metadata: None, @@ -138,7 +138,7 @@ impl ParquetReaderBuilder { /// Attaches the cache to the builder. #[must_use] - pub fn cache(mut self, cache: CacheManagerRef) -> ParquetReaderBuilder { + pub fn cache(mut self, cache: Option) -> ParquetReaderBuilder { self.cache_manager = cache; self } @@ -313,12 +313,10 @@ impl ParquetReaderBuilder { let region_id = self.file_handle.region_id(); let file_id = self.file_handle.file_id(); // Tries to get from global cache. - if let Some(metadata) = self - .cache_manager - .get_parquet_meta_data(region_id, file_id) - .await - { - return Ok(metadata); + if let Some(manager) = &self.cache_manager { + if let Some(metadata) = manager.get_parquet_meta_data(region_id, file_id).await { + return Ok(metadata); + } } // Cache miss, load metadata directly. @@ -326,11 +324,13 @@ impl ParquetReaderBuilder { let metadata = metadata_loader.load().await?; let metadata = Arc::new(metadata); // Cache the metadata. - self.cache_manager.put_parquet_meta_data( - self.file_handle.region_id(), - self.file_handle.file_id(), - metadata.clone(), - ); + if let Some(cache) = &self.cache_manager { + cache.put_parquet_meta_data( + self.file_handle.region_id(), + self.file_handle.file_id(), + metadata.clone(), + ); + } Ok(metadata) } @@ -846,7 +846,7 @@ pub(crate) struct RowGroupReaderBuilder { /// Field levels to read. field_levels: FieldLevels, /// Cache. 
- cache_manager: CacheManagerRef, + cache_manager: Option, } impl RowGroupReaderBuilder { @@ -864,7 +864,7 @@ impl RowGroupReaderBuilder { &self.parquet_meta } - pub(crate) fn cache_manager(&self) -> &CacheManagerRef { + pub(crate) fn cache_manager(&self) -> &Option { &self.cache_manager } diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index dd572d8863f8..73382c06d9b3 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -48,7 +48,7 @@ pub struct InMemoryRowGroup<'a> { region_id: RegionId, file_id: FileId, row_group_idx: usize, - cache_manager: CacheManagerRef, + cache_manager: Option, /// Row group level cached pages for each column. /// /// These pages are uncompressed pages of a row group. @@ -69,7 +69,7 @@ impl<'a> InMemoryRowGroup<'a> { file_id: FileId, parquet_meta: &'a ParquetMetaData, row_group_idx: usize, - cache_manager: CacheManagerRef, + cache_manager: Option, file_path: &'a str, object_store: ObjectStore, ) -> Self { @@ -208,18 +208,19 @@ impl<'a> InMemoryRowGroup<'a> { }; let column = self.metadata.column(idx); - - if !cache_uncompressed_pages(column) { - // For columns that have multiple uncompressed pages, we only cache the compressed page - // to save memory. - let page_key = PageKey::new_compressed( - self.region_id, - self.file_id, - self.row_group_idx, - idx, - ); - self.cache_manager - .put_pages(page_key, Arc::new(PageValue::new_compressed(data.clone()))); + if let Some(cache) = &self.cache_manager { + if !cache_uncompressed_pages(column) { + // For columns that have multiple uncompressed pages, we only cache the compressed page + // to save memory. + let page_key = PageKey::new_compressed( + self.region_id, + self.file_id, + self.row_group_idx, + idx, + ); + cache + .put_pages(page_key, Arc::new(PageValue::new_compressed(data.clone()))); + } } *chunk = Some(Arc::new(ColumnChunkData::Dense { @@ -241,6 +242,9 @@ impl<'a> InMemoryRowGroup<'a> { .enumerate() .filter(|(idx, chunk)| chunk.is_none() && projection.leaf_included(*idx)) .for_each(|(idx, chunk)| { + let Some(cache) = &self.cache_manager else { + return; + }; let column = self.metadata.column(idx); if cache_uncompressed_pages(column) { // Fetches uncompressed pages for the row group. @@ -250,7 +254,7 @@ impl<'a> InMemoryRowGroup<'a> { self.row_group_idx, idx, ); - self.column_uncompressed_pages[idx] = self.cache_manager.get_pages(&page_key); + self.column_uncompressed_pages[idx] = cache.get_pages(&page_key); } else { // Fetches the compressed page from the cache. let page_key = PageKey::new_compressed( @@ -260,7 +264,7 @@ impl<'a> InMemoryRowGroup<'a> { idx, ); - *chunk = self.cache_manager.get_pages(&page_key).map(|page_value| { + *chunk = cache.get_pages(&page_key).map(|page_value| { Arc::new(ColumnChunkData::Dense { offset: column.byte_range().0 as usize, data: page_value.compressed.clone(), @@ -296,7 +300,7 @@ impl<'a> InMemoryRowGroup<'a> { key: IndexKey, ranges: &[Range], ) -> Option> { - if let Some(cache) = self.cache_manager.write_cache() { + if let Some(cache) = self.cache_manager.as_ref()?.write_cache() { return cache.file_cache().read_ranges(key, ranges).await; } None @@ -327,6 +331,10 @@ impl<'a> InMemoryRowGroup<'a> { } }; + let Some(cache) = &self.cache_manager else { + return Ok(Box::new(page_reader)); + }; + let column = self.metadata.column(i); if cache_uncompressed_pages(column) { // This column use row group level page cache. 
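The recurring shape of this change is that every read-path component now holds an `Option<CacheManagerRef>` and the compaction path simply passes `None`. A minimal standalone sketch of that call-site pattern, using a stub cache type rather than the real mito2 definitions:

```rust
use std::sync::Arc;

// Stub standing in for mito2's CacheManager; only the call shape matters here.
struct CacheManager;
impl CacheManager {
    fn put_pages(&self, _key: &str, _pages: Vec<u8>) {}
}
type CacheManagerRef = Arc<CacheManager>;

struct Reader {
    cache_manager: Option<CacheManagerRef>,
}

impl Reader {
    fn maybe_cache(&self, key: &str, pages: Vec<u8>) {
        // Compaction constructs the reader with `cache_manager: None`,
        // so this early-returns and nothing is admitted to the cache.
        let Some(cache) = &self.cache_manager else {
            return;
        };
        cache.put_pages(key, pages);
    }
}

fn main() {
    let query_reader = Reader { cache_manager: Some(Arc::new(CacheManager)) };
    query_reader.maybe_cache("region-1/file-1", vec![0u8; 4]);

    let compaction_reader = Reader { cache_manager: None }; // skips the cache entirely
    compaction_reader.maybe_cache("region-1/file-1", vec![0u8; 4]);
}
```

Keeping the option at the struct level, rather than adding a boolean flag, lets each put/get site degrade to a no-op with a single let-else, which is how the hunks above and below handle it.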
@@ -335,7 +343,7 @@ impl<'a> InMemoryRowGroup<'a> { let page_value = Arc::new(PageValue::new_row_group(pages)); let page_key = PageKey::new_uncompressed(self.region_id, self.file_id, self.row_group_idx, i); - self.cache_manager.put_pages(page_key, page_value.clone()); + cache.put_pages(page_key, page_value.clone()); return Ok(Box::new(RowGroupCachedReader::new(&page_value.row_group))); } From e1e39993f7847821da113b0102357ec6b07ec0f0 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Wed, 11 Dec 2024 17:25:56 +0800 Subject: [PATCH 24/36] feat(vector): add scalar add function (#5119) * refactor: extract implicit conversion helper functions of vector Signed-off-by: Zhenchi * feat(vector): add scalar add function Signed-off-by: Zhenchi * fix fmt Signed-off-by: Zhenchi --------- Signed-off-by: Zhenchi --- src/common/function/src/scalars/vector.rs | 4 + .../function/src/scalars/vector/impl_conv.rs | 1 - .../function/src/scalars/vector/scalar_add.rs | 173 ++++++++++++++++++ .../function/vector/vector_scalar.result | 48 +++++ .../common/function/vector/vector_scalar.sql | 11 ++ 5 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 src/common/function/src/scalars/vector/scalar_add.rs create mode 100644 tests/cases/standalone/common/function/vector/vector_scalar.result create mode 100644 tests/cases/standalone/common/function/vector/vector_scalar.sql diff --git a/src/common/function/src/scalars/vector.rs b/src/common/function/src/scalars/vector.rs index 7c8cf5550e25..0c0428ce9a45 100644 --- a/src/common/function/src/scalars/vector.rs +++ b/src/common/function/src/scalars/vector.rs @@ -15,6 +15,7 @@ mod convert; mod distance; pub(crate) mod impl_conv; +mod scalar_add; use std::sync::Arc; @@ -32,5 +33,8 @@ impl VectorFunction { registry.register(Arc::new(distance::CosDistanceFunction)); registry.register(Arc::new(distance::DotProductFunction)); registry.register(Arc::new(distance::L2SqDistanceFunction)); + + // scalar calculation + registry.register(Arc::new(scalar_add::ScalarAddFunction)); } } diff --git a/src/common/function/src/scalars/vector/impl_conv.rs b/src/common/function/src/scalars/vector/impl_conv.rs index 903bfb2a0336..70a142c2906b 100644 --- a/src/common/function/src/scalars/vector/impl_conv.rs +++ b/src/common/function/src/scalars/vector/impl_conv.rs @@ -109,7 +109,6 @@ pub fn parse_veclit_from_strlit(s: &str) -> Result> { }) } -#[allow(unused)] /// Convert a vector literal to a binary literal. pub fn veclit_to_binlit(vec: &[f32]) -> Vec { if cfg!(target_endian = "little") { diff --git a/src/common/function/src/scalars/vector/scalar_add.rs b/src/common/function/src/scalars/vector/scalar_add.rs new file mode 100644 index 000000000000..ef016eff4b47 --- /dev/null +++ b/src/common/function/src/scalars/vector/scalar_add.rs @@ -0,0 +1,173 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::borrow::Cow; +use std::fmt::Display; + +use common_query::error::{InvalidFuncArgsSnafu, Result}; +use common_query::prelude::Signature; +use datatypes::prelude::ConcreteDataType; +use datatypes::scalars::ScalarVectorBuilder; +use datatypes::vectors::{BinaryVectorBuilder, MutableVector, VectorRef}; +use nalgebra::DVectorView; +use snafu::ensure; + +use crate::function::{Function, FunctionContext}; +use crate::helper; +use crate::scalars::vector::impl_conv::{as_veclit, as_veclit_if_const, veclit_to_binlit}; + +const NAME: &str = "vec_scalar_add"; + +/// Adds a scalar to each element of a vector. +/// +/// # Example +/// +/// ```sql +/// SELECT vec_to_string(vec_scalar_add(1, "[1, 2, 3]")) as result; +/// +/// +---------+ +/// | result | +/// +---------+ +/// | [2,3,4] | +/// +---------+ +/// +/// -- Negative scalar to simulate subtraction +/// SELECT vec_to_string(vec_scalar_add(-1, "[1, 2, 3]")) as result; +/// +/// +---------+ +/// | result | +/// +---------+ +/// | [0,1,2] | +/// +---------+ +/// ``` +#[derive(Debug, Clone, Default)] +pub struct ScalarAddFunction; + +impl Function for ScalarAddFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::binary_datatype()) + } + + fn signature(&self) -> Signature { + helper::one_of_sigs2( + vec![ConcreteDataType::float64_datatype()], + vec![ + ConcreteDataType::string_datatype(), + ConcreteDataType::binary_datatype(), + ], + ) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly two, have: {}", + columns.len() + ), + } + ); + let arg0 = &columns[0]; + let arg1 = &columns[1]; + + let len = arg0.len(); + let mut result = BinaryVectorBuilder::with_capacity(len); + if len == 0 { + return Ok(result.to_vector()); + } + + let arg1_const = as_veclit_if_const(arg1)?; + + for i in 0..len { + let arg0 = arg0.get(i).as_f64_lossy(); + let Some(arg0) = arg0 else { + result.push_null(); + continue; + }; + + let arg1 = match arg1_const.as_ref() { + Some(arg1) => Some(Cow::Borrowed(arg1.as_ref())), + None => as_veclit(arg1.get_ref(i))?, + }; + let Some(arg1) = arg1 else { + result.push_null(); + continue; + }; + + let vec = DVectorView::from_slice(&arg1, arg1.len()); + let vec_res = vec.add_scalar(arg0 as _); + + let veclit = vec_res.as_slice(); + let binlit = veclit_to_binlit(veclit); + result.push(Some(&binlit)); + } + + Ok(result.to_vector()) + } +} + +impl Display for ScalarAddFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datatypes::vectors::{Float32Vector, StringVector}; + + use super::*; + + #[test] + fn test_scalar_add() { + let func = ScalarAddFunction; + + let input0 = Arc::new(Float32Vector::from(vec![ + Some(1.0), + Some(-1.0), + None, + Some(3.0), + ])); + let input1 = Arc::new(StringVector::from(vec![ + Some("[1.0,2.0,3.0]".to_string()), + Some("[4.0,5.0,6.0]".to_string()), + Some("[7.0,8.0,9.0]".to_string()), + None, + ])); + + let result = func + .eval(FunctionContext::default(), &[input0, input1]) + .unwrap(); + + let result = result.as_ref(); + assert_eq!(result.len(), 4); + assert_eq!( + result.get_ref(0).as_binary().unwrap(), + Some(veclit_to_binlit(&[2.0, 3.0, 4.0]).as_slice()) + ); + assert_eq!( + 
result.get_ref(1).as_binary().unwrap(), + Some(veclit_to_binlit(&[3.0, 4.0, 5.0]).as_slice()) + ); + assert!(result.get_ref(2).is_null()); + assert!(result.get_ref(3).is_null()); + } +} diff --git a/tests/cases/standalone/common/function/vector/vector_scalar.result b/tests/cases/standalone/common/function/vector/vector_scalar.result new file mode 100644 index 000000000000..5750a0adfdb8 --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_scalar.result @@ -0,0 +1,48 @@ +SELECT vec_to_string(vec_scalar_add(1.0, '[1.0, 2.0]')); + ++--------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Float64(1),Utf8("[1.0, 2.0]"))) | ++--------------------------------------------------------------+ +| [2,3] | ++--------------------------------------------------------------+ + +SELECT vec_to_string(vec_scalar_add(-1.0, '[1.0, 2.0]')); + ++---------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Float64(-1),Utf8("[1.0, 2.0]"))) | ++---------------------------------------------------------------+ +| [0,1] | ++---------------------------------------------------------------+ + +SELECT vec_to_string(vec_scalar_add(1.0, parse_vec('[1.0, 2.0]'))); + ++-------------------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Float64(1),parse_vec(Utf8("[1.0, 2.0]")))) | ++-------------------------------------------------------------------------+ +| [2,3] | ++-------------------------------------------------------------------------+ + +SELECT vec_to_string(vec_scalar_add(-1.0, parse_vec('[1.0, 2.0]'))); + ++--------------------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Float64(-1),parse_vec(Utf8("[1.0, 2.0]")))) | ++--------------------------------------------------------------------------+ +| [0,1] | ++--------------------------------------------------------------------------+ + +SELECT vec_to_string(vec_scalar_add(1, '[1.0, 2.0]')); + ++------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Int64(1),Utf8("[1.0, 2.0]"))) | ++------------------------------------------------------------+ +| [2,3] | ++------------------------------------------------------------+ + +SELECT vec_to_string(vec_scalar_add(-1, '[1.0, 2.0]')); + ++-------------------------------------------------------------+ +| vec_to_string(vec_scalar_add(Int64(-1),Utf8("[1.0, 2.0]"))) | ++-------------------------------------------------------------+ +| [0,1] | ++-------------------------------------------------------------+ + diff --git a/tests/cases/standalone/common/function/vector/vector_scalar.sql b/tests/cases/standalone/common/function/vector/vector_scalar.sql new file mode 100644 index 000000000000..e438ac6a40ba --- /dev/null +++ b/tests/cases/standalone/common/function/vector/vector_scalar.sql @@ -0,0 +1,11 @@ +SELECT vec_to_string(vec_scalar_add(1.0, '[1.0, 2.0]')); + +SELECT vec_to_string(vec_scalar_add(-1.0, '[1.0, 2.0]')); + +SELECT vec_to_string(vec_scalar_add(1.0, parse_vec('[1.0, 2.0]'))); + +SELECT vec_to_string(vec_scalar_add(-1.0, parse_vec('[1.0, 2.0]'))); + +SELECT vec_to_string(vec_scalar_add(1, '[1.0, 2.0]')); + +SELECT vec_to_string(vec_scalar_add(-1, '[1.0, 2.0]')); From 1a8e77a480cdd0b4d625c919b3594b27ddf76207 Mon Sep 17 00:00:00 2001 From: Yohan Wal Date: Wed, 11 Dec 2024 17:28:13 +0800 Subject: [PATCH 25/36] test: part of parser test migrated from duckdb (#5125) * test: update test * fix: fix test --- 
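These cases exercise GreptimeDB's SQL dialect end to end. The same statements can be pushed through the parser entry point that the `/v1/sql/parse` handler added later in this series uses; a minimal sketch of that call, assuming a dev-dependency on the workspace `sql` crate and the `ParserContext` API shown in that later patch:

```rust
use sql::dialect::GreptimeDbDialect;
use sql::parser::{ParseOptions, ParserContext};

#[test]
fn parse_migrated_statement() {
    // Deeply nested arithmetic from parser.result; parsing it should yield
    // exactly one statement.
    let sql = "SELECT (1+(1+(1+(1+(1+(1+(1+1)))))))";
    let stmts =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .expect("statement should parse");
    assert_eq!(stmts.len(), 1);
}
```

Note that the `COLUMNS(*)` case in these tests parses successfully and only fails afterwards, which is why the recorded error is a `PlanQuery` planning error rather than a parser error.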
.../standalone/common/parser/parser.result | 50 +++++++++++++++++++ .../cases/standalone/common/parser/parser.sql | 35 +++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 tests/cases/standalone/common/parser/parser.result create mode 100644 tests/cases/standalone/common/parser/parser.sql diff --git a/tests/cases/standalone/common/parser/parser.result b/tests/cases/standalone/common/parser/parser.result new file mode 100644 index 000000000000..7e6dce85b79b --- /dev/null +++ b/tests/cases/standalone/common/parser/parser.result @@ -0,0 +1,50 @@ +-- columns aliases, from: +-- https://github.com/duckdb/duckdb/blob/9196dd9b0a163e6c8aada26218803d04be30c562/test/sql/parser/columns_aliases.test +CREATE TABLE integers (ts TIMESTAMP TIME INDEX, i INT, j INT); + +Affected Rows: 0 + +INSERT INTO integers SELECT 0::TIMESTAMP ts, 42 i, 84 j UNION ALL SELECT 1::TIMESTAMP, 13, 14; + +Affected Rows: 2 + +SELECT i, j FROM (SELECT COLUMNS(*)::VARCHAR FROM integers); + +Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Invalid function 'columns'. +Did you mean 'COUNT'? + +SELECT i, j FROM (SELECT * FROM integers); + ++----+----+ +| i | j | ++----+----+ +| 42 | 84 | +| 13 | 14 | ++----+----+ + +SELECT min_i, min_j, max_i, max_j FROM (SELECT MIN(i) AS "min_i", MAX(i) AS "max_i", MIN(j) AS "min_j", MAX(j) AS "max_j" FROM integers); + ++-------+-------+-------+-------+ +| min_i | min_j | max_i | max_j | ++-------+-------+-------+-------+ +| 13 | 14 | 42 | 84 | ++-------+-------+-------+-------+ + +DROP TABLE integers; + +Affected Rows: 0 + +-- skipped, unsupported feature: digit separators +-- SELECT 1_000_000; +-- skipped, unsupported feature: division operator precedence +-- SELECT 6 + 1 // 2; +-- expression depth, from: +-- https://github.com/duckdb/duckdb/blob/9196dd9b0a163e6c8aada26218803d04be30c562/test/sql/parser/expression_depth_limit.test +SELECT (1+(1+(1+(1+(1+(1+(1+1))))))); + ++---------------------------------------------------------------------------------------+ +| Int64(1) + Int64(1) + Int64(1) + Int64(1) + Int64(1) + Int64(1) + Int64(1) + Int64(1) | ++---------------------------------------------------------------------------------------+ +| 8 | ++---------------------------------------------------------------------------------------+ + diff --git a/tests/cases/standalone/common/parser/parser.sql b/tests/cases/standalone/common/parser/parser.sql new file mode 100644 index 000000000000..bd7dcbf400c3 --- /dev/null +++ b/tests/cases/standalone/common/parser/parser.sql @@ -0,0 +1,35 @@ + +-- columns aliases, from: +-- https://github.com/duckdb/duckdb/blob/9196dd9b0a163e6c8aada26218803d04be30c562/test/sql/parser/columns_aliases.test + +CREATE TABLE integers (ts TIMESTAMP TIME INDEX, i INT, j INT); + +INSERT INTO integers SELECT 0::TIMESTAMP ts, 42 i, 84 j UNION ALL SELECT 1::TIMESTAMP, 13, 14; + +SELECT i, j FROM (SELECT COLUMNS(*)::VARCHAR FROM integers); + +SELECT i, j FROM (SELECT * FROM integers); + +SELECT min_i, min_j, max_i, max_j FROM (SELECT MIN(i) AS "min_i", MAX(i) AS "max_i", MIN(j) AS "min_j", MAX(j) AS "max_j" FROM integers); + +DROP TABLE integers; + +-- skipped, unsupported feature: digit separators +-- SELECT 1_000_000; + +-- skipped, unsupported feature: division operator precedence +-- SELECT 6 + 1 // 2; + +-- expression depth, from: +-- https://github.com/duckdb/duckdb/blob/9196dd9b0a163e6c8aada26218803d04be30c562/test/sql/parser/expression_depth_limit.test +SELECT (1+(1+(1+(1+(1+(1+(1+1))))))); + +-- skipped, unsupported feature: dollar quotes +-- 
SELECT $$$$ = ''; + +-- skipped, unsupported feature: from_first, see also: +-- https://github.com/GreptimeTeam/greptimedb/issues/5012 +-- FROM integers; + +-- skipped, unsupported feature: function chaining +-- SELECT "abcd".upper().lower(); From 9da2e17d0e0a6302e243f8fefe1c636b0497d45d Mon Sep 17 00:00:00 2001 From: ZonaHe Date: Wed, 11 Dec 2024 20:47:59 +0800 Subject: [PATCH 26/36] feat: update dashboard to v0.7.2 (#5141) Co-authored-by: sunchanglong --- src/servers/dashboard/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 63f2359f6421..2c0a9c7b7754 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.7.1 +v0.7.2 From 60f8dbf7f01dc08e43b1145f7444ff467d741e38 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 11 Dec 2024 21:33:54 +0800 Subject: [PATCH 27/36] feat: implement `v1/sql/parse` endpoint to parse GreptimeDB's SQL dialect (#5144) * derive ser/de Signed-off-by: Ruihang Xia * impl method Signed-off-by: Ruihang Xia * fix typo Signed-off-by: Ruihang Xia * remove deserialize Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- Cargo.lock | 2 ++ Cargo.toml | 1 + src/servers/src/error.rs | 10 ++++++++- src/servers/src/http.rs | 4 ++++ src/servers/src/http/handler.rs | 28 +++++++++++++++++++++++- src/sql/Cargo.toml | 1 + src/sql/src/statements/admin.rs | 3 ++- src/sql/src/statements/alter.rs | 11 +++++----- src/sql/src/statements/copy.rs | 11 +++++----- src/sql/src/statements/create.rs | 21 +++++++++--------- src/sql/src/statements/cursor.rs | 7 +++--- src/sql/src/statements/delete.rs | 3 ++- src/sql/src/statements/describe.rs | 3 ++- src/sql/src/statements/drop.rs | 9 ++++---- src/sql/src/statements/explain.rs | 3 ++- src/sql/src/statements/insert.rs | 3 ++- src/sql/src/statements/option_map.rs | 4 +++- src/sql/src/statements/query.rs | 3 ++- src/sql/src/statements/set_variables.rs | 3 ++- src/sql/src/statements/show.rs | 29 +++++++++++++------------ src/sql/src/statements/statement.rs | 7 ++++-- src/sql/src/statements/tql.rs | 9 ++++---- src/sql/src/statements/truncate.rs | 3 ++- tests-integration/tests/http.rs | 8 +++++++ 24 files changed, 128 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 628c6a582418..311caafcb2fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11295,6 +11295,7 @@ dependencies = [ "jsonb", "lazy_static", "regex", + "serde", "serde_json", "snafu 0.8.5", "sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)", @@ -11371,6 +11372,7 @@ dependencies = [ "lazy_static", "log", "regex", + "serde", "sqlparser 0.45.0 (registry+https://github.com/rust-lang/crates.io-index)", "sqlparser_derive 0.2.2 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)", ] diff --git a/Cargo.toml b/Cargo.toml index d1d360850e70..990bc71a907b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,6 +180,7 @@ sysinfo = "0.30" # on branch v0.44.x sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "54a267ac89c09b11c0c88934690530807185d3e7", features = [ "visitor", + "serde", ] } strum = { version = "0.25", features = ["derive"] } tempfile = "3" diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 6682a1c78967..071de93683cc 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -189,6 +189,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed 
to parse query"))] + FailedToParseQuery { + #[snafu(implicit)] + location: Location, + source: sql::error::Error, + }, + #[snafu(display("Failed to parse InfluxDB line protocol"))] InfluxdbLineProtocol { #[snafu(implicit)] @@ -651,7 +658,8 @@ impl ErrorExt for Error { | OpenTelemetryLog { .. } | UnsupportedJsonDataTypeForTag { .. } | InvalidTableName { .. } - | PrepareStatementNotFound { .. } => StatusCode::InvalidArguments, + | PrepareStatementNotFound { .. } + | FailedToParseQuery { .. } => StatusCode::InvalidArguments, Catalog { source, .. } => source.status_code(), RowWriter { source, .. } => source.status_code(), diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index d8d07ed31fa0..1107870c9a25 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -755,6 +755,10 @@ impl HttpServer { fn route_sql(api_state: ApiState) -> Router { Router::new() .route("/sql", routing::get(handler::sql).post(handler::sql)) + .route( + "/sql/parse", + routing::get(handler::sql_parse).post(handler::sql_parse), + ) .route( "/promql", routing::get(handler::promql).post(handler::promql), diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index 15a1a0e16c73..153b824d6ef1 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -30,8 +30,13 @@ use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING}; use serde::{Deserialize, Serialize}; use serde_json::Value; use session::context::{Channel, QueryContext, QueryContextRef}; +use snafu::ResultExt; +use sql::dialect::GreptimeDbDialect; +use sql::parser::{ParseOptions, ParserContext}; +use sql::statements::statement::Statement; use super::header::collect_plan_metrics; +use crate::error::{FailedToParseQuerySnafu, InvalidQuerySnafu, Result}; use crate::http::result::arrow_result::ArrowResponse; use crate::http::result::csv_result::CsvResponse; use crate::http::result::error_result::ErrorResponse; @@ -146,10 +151,31 @@ pub async fn sql( resp.with_execution_time(start.elapsed().as_millis() as u64) } +/// Handler to parse sql +#[axum_macros::debug_handler] +#[tracing::instrument(skip_all, fields(protocol = "http", request_type = "sql"))] +pub async fn sql_parse( + Query(query_params): Query, + Form(form_params): Form, +) -> Result>> { + let Some(sql) = query_params.sql.or(form_params.sql) else { + return InvalidQuerySnafu { + reason: "sql parameter is required.", + } + .fail(); + }; + + let stmts = + ParserContext::create_with_dialect(&sql, &GreptimeDbDialect {}, ParseOptions::default()) + .context(FailedToParseQuerySnafu)?; + + Ok(stmts.into()) +} + /// Create a response from query result pub async fn from_output( outputs: Vec>, -) -> Result<(Vec, HashMap), ErrorResponse> { +) -> std::result::Result<(Vec, HashMap), ErrorResponse> { // TODO(sunng87): this api response structure cannot represent error well. 
// It hides successful execution results from error response let mut results = Vec::with_capacity(outputs.len()); diff --git a/src/sql/Cargo.toml b/src/sql/Cargo.toml index e3340a8f6c90..3cb81d6dd494 100644 --- a/src/sql/Cargo.toml +++ b/src/sql/Cargo.toml @@ -30,6 +30,7 @@ itertools.workspace = true jsonb.workspace = true lazy_static.workspace = true regex.workspace = true +serde.workspace = true serde_json.workspace = true snafu.workspace = true sqlparser.workspace = true diff --git a/src/sql/src/statements/admin.rs b/src/sql/src/statements/admin.rs index bbe805a4c163..ed068ea47510 100644 --- a/src/sql/src/statements/admin.rs +++ b/src/sql/src/statements/admin.rs @@ -14,12 +14,13 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser_derive::{Visit, VisitMut}; use crate::ast::Function; /// `ADMIN` statement to execute some administration commands. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum Admin { /// Run a admin function. Func(Function), diff --git a/src/sql/src/statements/alter.rs b/src/sql/src/statements/alter.rs index cf59257e8931..174bdbbdc310 100644 --- a/src/sql/src/statements/alter.rs +++ b/src/sql/src/statements/alter.rs @@ -18,10 +18,11 @@ use api::v1; use common_query::AddColumnLocation; use datatypes::schema::FulltextOptions; use itertools::Itertools; +use serde::Serialize; use sqlparser::ast::{ColumnDef, DataType, Ident, ObjectName, TableConstraint}; use sqlparser_derive::{Visit, VisitMut}; -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct AlterTable { pub table_name: ObjectName, pub alter_operation: AlterTableOperation, @@ -56,7 +57,7 @@ impl Display for AlterTable { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum AlterTableOperation { /// `ADD ` AddConstraint(TableConstraint), @@ -151,7 +152,7 @@ impl Display for AlterTableOperation { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct KeyValueOption { pub key: String, pub value: String, @@ -166,7 +167,7 @@ impl From for v1::Option { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct AlterDatabase { pub database_name: ObjectName, pub alter_operation: AlterDatabaseOperation, @@ -197,7 +198,7 @@ impl Display for AlterDatabase { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum AlterDatabaseOperation { SetDatabaseOption { options: Vec }, UnsetDatabaseOption { keys: Vec }, diff --git a/src/sql/src/statements/copy.rs b/src/sql/src/statements/copy.rs index c68b9d8c0321..436d86d3abaf 100644 --- a/src/sql/src/statements/copy.rs +++ b/src/sql/src/statements/copy.rs @@ -14,12 +14,13 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::ObjectName; use sqlparser_derive::{Visit, VisitMut}; use crate::statements::OptionMap; -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum Copy { CopyTable(CopyTable), CopyDatabase(CopyDatabase), @@ -34,7 +35,7 @@ impl Display for Copy { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum 
CopyTable { To(CopyTableArgument), From(CopyTableArgument), @@ -65,7 +66,7 @@ impl Display for CopyTable { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum CopyDatabase { To(CopyDatabaseArgument), From(CopyDatabaseArgument), @@ -96,7 +97,7 @@ impl Display for CopyDatabase { } } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct CopyDatabaseArgument { pub database_name: ObjectName, pub with: OptionMap, @@ -104,7 +105,7 @@ pub struct CopyDatabaseArgument { pub location: String, } -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct CopyTableArgument { pub table_name: ObjectName, pub with: OptionMap, diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 20ed7b555965..e4ea46572e5f 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -18,6 +18,7 @@ use std::fmt::{Display, Formatter}; use common_catalog::consts::FILE_ENGINE; use datatypes::schema::FulltextOptions; use itertools::Itertools; +use serde::Serialize; use snafu::ResultExt; use sqlparser::ast::{ColumnOptionDef, DataType, Expr, Query}; use sqlparser_derive::{Visit, VisitMut}; @@ -58,7 +59,7 @@ fn format_table_constraint(constraints: &[TableConstraint]) -> String { } /// Table constraint for create table statement. -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub enum TableConstraint { /// Primary key constraint. PrimaryKey { columns: Vec }, @@ -84,7 +85,7 @@ impl Display for TableConstraint { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateTable { /// Create if not exists pub if_not_exists: bool, @@ -100,7 +101,7 @@ pub struct CreateTable { } /// Column definition in `CREATE TABLE` statement. -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct Column { /// `ColumnDef` from `sqlparser::ast` pub column_def: ColumnDef, @@ -109,7 +110,7 @@ pub struct Column { } /// Column extensions for greptimedb dialect. -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Default)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Default, Serialize)] pub struct ColumnExtensions { /// Fulltext options. 
pub fulltext_options: Option, @@ -172,7 +173,7 @@ impl ColumnExtensions { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct Partitions { pub column_list: Vec, pub exprs: Vec, @@ -244,7 +245,7 @@ impl Display for CreateTable { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateDatabase { pub name: ObjectName, /// Create if not exists @@ -278,7 +279,7 @@ impl Display for CreateDatabase { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateExternalTable { /// Table name pub name: ObjectName, @@ -309,7 +310,7 @@ impl Display for CreateExternalTable { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateTableLike { /// Table name pub table_name: ObjectName, @@ -325,7 +326,7 @@ impl Display for CreateTableLike { } } -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateFlow { /// Flow name pub flow_name: ObjectName, @@ -367,7 +368,7 @@ impl Display for CreateFlow { } /// Create SQL view statement. -#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut)] +#[derive(Debug, PartialEq, Eq, Clone, Visit, VisitMut, Serialize)] pub struct CreateView { /// View name pub name: ObjectName, diff --git a/src/sql/src/statements/cursor.rs b/src/sql/src/statements/cursor.rs index 72ef4cdcae98..4381cc5e7be5 100644 --- a/src/sql/src/statements/cursor.rs +++ b/src/sql/src/statements/cursor.rs @@ -14,6 +14,7 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::ObjectName; use sqlparser_derive::{Visit, VisitMut}; @@ -22,7 +23,7 @@ use super::query::Query; /// Represents a DECLARE CURSOR statement /// /// This statement will carry a SQL query -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DeclareCursor { pub cursor_name: ObjectName, pub query: Box, @@ -35,7 +36,7 @@ impl Display for DeclareCursor { } /// Represents a FETCH FROM cursor statement -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct FetchCursor { pub cursor_name: ObjectName, pub fetch_size: u64, @@ -48,7 +49,7 @@ impl Display for FetchCursor { } /// Represents a CLOSE cursor statement -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct CloseCursor { pub cursor_name: ObjectName, } diff --git a/src/sql/src/statements/delete.rs b/src/sql/src/statements/delete.rs index 4346610b7d19..dc8f5d69014e 100644 --- a/src/sql/src/statements/delete.rs +++ b/src/sql/src/statements/delete.rs @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use serde::Serialize; use sqlparser::ast::Statement; use sqlparser_derive::{Visit, VisitMut}; -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct Delete { pub inner: Statement, } diff --git a/src/sql/src/statements/describe.rs b/src/sql/src/statements/describe.rs index 743f2b0123c2..1a7bba24e5d3 100644 --- a/src/sql/src/statements/describe.rs +++ b/src/sql/src/statements/describe.rs @@ -14,11 +14,12 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::ObjectName; use sqlparser_derive::{Visit, VisitMut}; /// SQL structure for `DESCRIBE TABLE`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DescribeTable { name: ObjectName, } diff --git a/src/sql/src/statements/drop.rs b/src/sql/src/statements/drop.rs index a46450db78f7..799722904dab 100644 --- a/src/sql/src/statements/drop.rs +++ b/src/sql/src/statements/drop.rs @@ -14,11 +14,12 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::ObjectName; use sqlparser_derive::{Visit, VisitMut}; /// DROP TABLE statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DropTable { table_names: Vec, @@ -62,7 +63,7 @@ impl Display for DropTable { } /// DROP DATABASE statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DropDatabase { name: ObjectName, /// drop table if exists @@ -99,7 +100,7 @@ impl Display for DropDatabase { } /// DROP FLOW statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DropFlow { flow_name: ObjectName, /// drop flow if exists @@ -138,7 +139,7 @@ impl Display for DropFlow { } /// `DROP VIEW` statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct DropView { // The view name pub view_name: ObjectName, diff --git a/src/sql/src/statements/explain.rs b/src/sql/src/statements/explain.rs index 5b3a2671f939..96a12c7a41c6 100644 --- a/src/sql/src/statements/explain.rs +++ b/src/sql/src/statements/explain.rs @@ -14,13 +14,14 @@ use std::fmt::{Display, Formatter}; +use serde::Serialize; use sqlparser::ast::Statement as SpStatement; use sqlparser_derive::{Visit, VisitMut}; use crate::error::Error; /// Explain statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct Explain { pub inner: SpStatement, } diff --git a/src/sql/src/statements/insert.rs b/src/sql/src/statements/insert.rs index 4eae7f1e1874..f1c0b7144441 100644 --- a/src/sql/src/statements/insert.rs +++ b/src/sql/src/statements/insert.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use serde::Serialize; use sqlparser::ast::{ObjectName, Query, SetExpr, Statement, UnaryOperator, Values}; use sqlparser::parser::ParserError; use sqlparser_derive::{Visit, VisitMut}; @@ -20,7 +21,7 @@ use crate::ast::{Expr, Value}; use crate::error::Result; use crate::statements::query::Query as GtQuery; -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct Insert { // Can only be sqlparser::ast::Statement::Insert variant pub inner: Statement, diff --git a/src/sql/src/statements/option_map.rs b/src/sql/src/statements/option_map.rs index 9ff8d94312fd..d66cadf16461 100644 --- a/src/sql/src/statements/option_map.rs +++ b/src/sql/src/statements/option_map.rs @@ -16,14 +16,16 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::ControlFlow; use common_base::secrets::{ExposeSecret, ExposeSecretMut, SecretString}; +use serde::Serialize; use sqlparser::ast::{Visit, VisitMut, Visitor, VisitorMut}; const REDACTED_OPTIONS: [&str; 2] = ["access_key_id", "secret_access_key"]; /// Options hashmap. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize)] pub struct OptionMap { options: BTreeMap, + #[serde(skip_serializing)] secrets: BTreeMap, } diff --git a/src/sql/src/statements/query.rs b/src/sql/src/statements/query.rs index 3b571a1a0ba1..b5221a226356 100644 --- a/src/sql/src/statements/query.rs +++ b/src/sql/src/statements/query.rs @@ -14,13 +14,14 @@ use std::fmt; +use serde::Serialize; use sqlparser::ast::Query as SpQuery; use sqlparser_derive::{Visit, VisitMut}; use crate::error::Error; /// Query statement instance. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct Query { pub inner: SpQuery, } diff --git a/src/sql/src/statements/set_variables.rs b/src/sql/src/statements/set_variables.rs index 7a2a94a531df..748d077d84ce 100644 --- a/src/sql/src/statements/set_variables.rs +++ b/src/sql/src/statements/set_variables.rs @@ -14,11 +14,12 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::{Expr, ObjectName}; use sqlparser_derive::{Visit, VisitMut}; /// SET variables statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct SetVariables { pub variable: ObjectName, pub value: Vec, diff --git a/src/sql/src/statements/show.rs b/src/sql/src/statements/show.rs index f6a8dab72897..055cd7768f02 100644 --- a/src/sql/src/statements/show.rs +++ b/src/sql/src/statements/show.rs @@ -14,12 +14,13 @@ use std::fmt::{self, Display}; +use serde::Serialize; use sqlparser_derive::{Visit, VisitMut}; use crate::ast::{Expr, Ident, ObjectName}; /// Show kind for SQL expressions like `SHOW DATABASE` or `SHOW TABLE` -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum ShowKind { All, Like(Ident), @@ -46,14 +47,14 @@ macro_rules! format_kind { } /// SQL structure for `SHOW DATABASES`. 
-#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowDatabases { pub kind: ShowKind, pub full: bool, } /// The SQL `SHOW COLUMNS` statement -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowColumns { pub kind: ShowKind, pub table: String, @@ -77,7 +78,7 @@ impl Display for ShowColumns { } /// The SQL `SHOW INDEX` statement -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowIndex { pub kind: ShowKind, pub table: String, @@ -118,7 +119,7 @@ impl Display for ShowDatabases { } /// SQL structure for `SHOW TABLES`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowTables { pub kind: ShowKind, pub database: Option, @@ -142,7 +143,7 @@ impl Display for ShowTables { } /// SQL structure for `SHOW TABLE STATUS`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowTableStatus { pub kind: ShowKind, pub database: Option, @@ -162,7 +163,7 @@ impl Display for ShowTableStatus { } /// SQL structure for `SHOW CREATE DATABASE`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowCreateDatabase { pub database_name: ObjectName, } @@ -175,7 +176,7 @@ impl Display for ShowCreateDatabase { } /// SQL structure for `SHOW CREATE TABLE`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowCreateTable { pub table_name: ObjectName, } @@ -188,7 +189,7 @@ impl Display for ShowCreateTable { } /// SQL structure for `SHOW CREATE FLOW`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowCreateFlow { pub flow_name: ObjectName, } @@ -201,7 +202,7 @@ impl Display for ShowCreateFlow { } /// SQL structure for `SHOW FLOWS`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowFlows { pub kind: ShowKind, pub database: Option, @@ -220,7 +221,7 @@ impl Display for ShowFlows { } /// SQL structure for `SHOW CREATE VIEW`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowCreateView { pub view_name: ObjectName, } @@ -233,7 +234,7 @@ impl Display for ShowCreateView { } /// SQL structure for `SHOW VIEWS`. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowViews { pub kind: ShowKind, pub database: Option, @@ -252,7 +253,7 @@ impl Display for ShowViews { } /// SQL structure for `SHOW VARIABLES xxx`. 
-#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowVariables { pub variable: ObjectName, } @@ -265,7 +266,7 @@ impl Display for ShowVariables { } /// SQL structure for "SHOW STATUS" -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct ShowStatus {} impl Display for ShowStatus { diff --git a/src/sql/src/statements/statement.rs b/src/sql/src/statements/statement.rs index 8ad391a00dd2..2870f2b64a6a 100644 --- a/src/sql/src/statements/statement.rs +++ b/src/sql/src/statements/statement.rs @@ -15,12 +15,14 @@ use std::fmt::Display; use datafusion_sql::parser::Statement as DfStatement; +use serde::Serialize; use sqlparser::ast::Statement as SpStatement; use sqlparser_derive::{Visit, VisitMut}; use crate::error::{ConvertToDfStatementSnafu, Error}; use crate::statements::admin::Admin; use crate::statements::alter::{AlterDatabase, AlterTable}; +use crate::statements::copy::Copy; use crate::statements::create::{ CreateDatabase, CreateExternalTable, CreateFlow, CreateTable, CreateTableLike, CreateView, }; @@ -42,7 +44,7 @@ use crate::statements::truncate::TruncateTable; /// Tokens parsed by `DFParser` are converted into these values. #[allow(clippy::large_enum_variant)] -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum Statement { // Query Query(Box), @@ -107,7 +109,8 @@ pub enum Statement { // EXPLAIN QUERY Explain(Explain), // COPY - Copy(crate::statements::copy::Copy), + Copy(Copy), + // Telemetry Query Language Tql(Tql), // TRUNCATE TABLE TruncateTable(TruncateTable), diff --git a/src/sql/src/statements/tql.rs b/src/sql/src/statements/tql.rs index 0f7a85f95ab8..7980103431ef 100644 --- a/src/sql/src/statements/tql.rs +++ b/src/sql/src/statements/tql.rs @@ -14,9 +14,10 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser_derive::{Visit, VisitMut}; -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub enum Tql { Eval(TqlEval), Explain(TqlExplain), @@ -49,7 +50,7 @@ fn format_tql( } /// TQL EVAL (, , , [lookback]) -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct TqlEval { pub start: String, pub end: String, @@ -74,7 +75,7 @@ impl Display for TqlEval { /// TQL EXPLAIN [VERBOSE] [, , , [lookback]] /// doesn't execute the query but tells how the query would be executed (similar to SQL EXPLAIN). -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct TqlExplain { pub start: String, pub end: String, @@ -103,7 +104,7 @@ impl Display for TqlExplain { /// TQL ANALYZE [VERBOSE] (, , , [lookback]) /// executes the plan and tells the detailed per-step execution time (similar to SQL ANALYZE). 
-#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct TqlAnalyze { pub start: String, pub end: String, diff --git a/src/sql/src/statements/truncate.rs b/src/sql/src/statements/truncate.rs index c1a063f959ce..710b5f72df3c 100644 --- a/src/sql/src/statements/truncate.rs +++ b/src/sql/src/statements/truncate.rs @@ -14,11 +14,12 @@ use std::fmt::Display; +use serde::Serialize; use sqlparser::ast::ObjectName; use sqlparser_derive::{Visit, VisitMut}; /// TRUNCATE TABLE statement. -#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut)] +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] pub struct TruncateTable { table_name: ObjectName, } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 4da65f0b21f5..5a48fef39e43 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -361,6 +361,14 @@ pub async fn test_sql_api(store_type: StorageType) { let body = serde_json::from_str::(&res.text().await).unwrap(); assert_eq!(body.code(), ErrorCode::DatabaseNotFound as u32); + // test parse method + let res = client.get("/v1/sql/parse?sql=desc table t").send().await; + assert_eq!(res.status(), StatusCode::OK); + assert_eq!( + res.text().await, + "[{\"DescribeTable\":{\"name\":[{\"value\":\"t\",\"quote_style\":null}]}}]" + ); + // test timezone header let res = client .get("/v1/sql?&sql=show variables system_time_zone") From a8012147ab52f43513580f17ae210a2dbb439318 Mon Sep 17 00:00:00 2001 From: Niwaka <61189782+NiwakaDev@users.noreply.github.com> Date: Wed, 11 Dec 2024 22:46:23 +0900 Subject: [PATCH 28/36] feat: support push down IN filter (#5129) * feat: support push down IN filter * chore: move tests to prune.sql --- src/query/src/dist_plan/commutativity.rs | 2 +- .../standalone/common/select/prune.result | 26 +++++++++++++++++++ .../cases/standalone/common/select/prune.sql | 10 +++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/query/src/dist_plan/commutativity.rs b/src/query/src/dist_plan/commutativity.rs index 8166400b8fbd..45378e532c0c 100644 --- a/src/query/src/dist_plan/commutativity.rs +++ b/src/query/src/dist_plan/commutativity.rs @@ -146,6 +146,7 @@ impl Categorizer { | Expr::Between(_) | Expr::Sort(_) | Expr::Exists(_) + | Expr::InList(_) | Expr::ScalarFunction(_) => Commutativity::Commutative, Expr::Like(_) @@ -157,7 +158,6 @@ impl Categorizer { | Expr::TryCast(_) | Expr::AggregateFunction(_) | Expr::WindowFunction(_) - | Expr::InList(_) | Expr::InSubquery(_) | Expr::ScalarSubquery(_) | Expr::Wildcard { .. 
} => Commutativity::Unimplemented, diff --git a/tests/cases/standalone/common/select/prune.result b/tests/cases/standalone/common/select/prune.result index 13ddee5510d2..04282b6035a0 100644 --- a/tests/cases/standalone/common/select/prune.result +++ b/tests/cases/standalone/common/select/prune.result @@ -94,6 +94,32 @@ explain analyze select * from demo where idc='idc1'; |_|_| Total rows: 2_| +-+-+-+ +SELECT * FROM demo where host in ('test1'); + ++-------------------------+-------+-------+------+-----------+ +| ts | value | host | idc | collector | ++-------------------------+-------+-------+------+-----------+ +| 1970-01-01T00:00:00.001 | 2.0 | test1 | idc1 | disk | ++-------------------------+-------+-------+------+-----------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze SELECT * FROM demo where host in ('test1'); + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_SeqScan: region=REDACTED, partition_count=1 (1 memtable ranges, 0 file 0 ranges) REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + drop table demo; Affected Rows: 0 diff --git a/tests/cases/standalone/common/select/prune.sql b/tests/cases/standalone/common/select/prune.sql index e7fd643537a1..4b976cdb1c7d 100644 --- a/tests/cases/standalone/common/select/prune.sql +++ b/tests/cases/standalone/common/select/prune.sql @@ -27,4 +27,14 @@ select * from demo where collector='disk' order by ts; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED explain analyze select * from demo where idc='idc1'; +SELECT * FROM demo where host in ('test1'); + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze SELECT * FROM demo where host in ('test1'); + drop table demo; From e2a41ccaec9976641dbaeeb4b1e6cec6f3d37783 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 12 Dec 2024 11:13:36 +0800 Subject: [PATCH 29/36] feat: add prefetch support to `PuffinFileFooterReader` for reduced I/O time (#5145) * feat: introduce `PuffinFileFooterReader` * refactor: remove `SyncReader` trait and impl * refactor: replace `FooterParser` with `PuffinFileFooterReader` * chore: remove unused errors --- src/index/src/inverted_index/error.rs | 11 +- src/puffin/src/error.rs | 52 +--- src/puffin/src/file_format/reader.rs | 14 +- src/puffin/src/file_format/reader/file.rs | 73 +---- src/puffin/src/file_format/reader/footer.rs | 323 +++++--------------- src/puffin/src/tests.rs | 180 ++--------- 6 files changed, 130 insertions(+), 523 deletions(-) diff --git a/src/index/src/inverted_index/error.rs b/src/index/src/inverted_index/error.rs index 07a42b8b8767..49816e63c463 100644 --- a/src/index/src/inverted_index/error.rs +++ b/src/index/src/inverted_index/error.rs @@ -26,14 +26,6 @@ use crate::inverted_index::search::predicate::Predicate; #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { - #[snafu(display("Failed to seek"))] - Seek { - #[snafu(source)] - error: IoError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to read"))] Read { #[snafu(source)] @@ -215,8 +207,7 @@ impl ErrorExt for Error { fn status_code(&self) -> StatusCode { use Error::*; match 
self { - Seek { .. } - | Read { .. } + Read { .. } | Write { .. } | Flush { .. } | Close { .. } diff --git a/src/puffin/src/error.rs b/src/puffin/src/error.rs index 57aec44d1fb8..634ede5b1364 100644 --- a/src/puffin/src/error.rs +++ b/src/puffin/src/error.rs @@ -25,14 +25,6 @@ use snafu::{Location, Snafu}; #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { - #[snafu(display("Failed to seek"))] - Seek { - #[snafu(source)] - error: IoError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to read"))] Read { #[snafu(source)] @@ -119,14 +111,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to convert bytes to integer"))] - BytesToInteger { - #[snafu(source)] - error: std::array::TryFromSliceError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Unsupported decompression: {}", decompression))] UnsupportedDecompression { decompression: String, @@ -150,17 +134,15 @@ pub enum Error { location: Location, }, - #[snafu(display("Parse stage not match, expected: {}, actual: {}", expected, actual))] - ParseStageNotMatch { - expected: String, - actual: String, + #[snafu(display("Unexpected footer payload size: {}", size))] + UnexpectedFooterPayloadSize { + size: i32, #[snafu(implicit)] location: Location, }, - #[snafu(display("Unexpected footer payload size: {}", size))] - UnexpectedFooterPayloadSize { - size: i32, + #[snafu(display("Invalid puffin footer"))] + InvalidPuffinFooter { #[snafu(implicit)] location: Location, }, @@ -177,20 +159,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Invalid blob offset: {}, location: {:?}", offset, location))] - InvalidBlobOffset { - offset: i64, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Invalid blob area end: {}, location: {:?}", offset, location))] - InvalidBlobAreaEnd { - offset: u64, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to compress lz4"))] Lz4Compression { #[snafu(source)] @@ -262,8 +230,7 @@ impl ErrorExt for Error { fn status_code(&self) -> StatusCode { use Error::*; match self { - Seek { .. } - | Read { .. } + Read { .. } | MagicNotMatched { .. } | DeserializeJson { .. } | Write { .. } @@ -275,18 +242,15 @@ impl ErrorExt for Error { | Remove { .. } | Rename { .. } | SerializeJson { .. } - | BytesToInteger { .. } - | ParseStageNotMatch { .. } | UnexpectedFooterPayloadSize { .. } | UnexpectedPuffinFileSize { .. } - | InvalidBlobOffset { .. } - | InvalidBlobAreaEnd { .. } | Lz4Compression { .. } | Lz4Decompression { .. } | BlobNotFound { .. } | BlobIndexOutOfBound { .. } | FileKeyNotMatch { .. } - | WalkDir { .. } => StatusCode::Unexpected, + | WalkDir { .. } + | InvalidPuffinFooter { .. } => StatusCode::Unexpected, UnsupportedCompression { .. } | UnsupportedDecompression { .. } => { StatusCode::Unsupported diff --git a/src/puffin/src/file_format/reader.rs b/src/puffin/src/file_format/reader.rs index 3f48bf4b105e..162d7116a578 100644 --- a/src/puffin/src/file_format/reader.rs +++ b/src/puffin/src/file_format/reader.rs @@ -21,21 +21,9 @@ use common_base::range_read::RangeReader; use crate::blob_metadata::BlobMetadata; use crate::error::Result; pub use crate::file_format::reader::file::PuffinFileReader; +pub use crate::file_format::reader::footer::PuffinFileFooterReader; use crate::file_metadata::FileMetadata; -/// `SyncReader` defines a synchronous reader for puffin data. -pub trait SyncReader<'a> { - type Reader: std::io::Read + std::io::Seek; - - /// Fetches the FileMetadata. 
- fn metadata(&'a mut self) -> Result; - - /// Reads particular blob data based on given metadata. - /// - /// Data read from the reader is compressed leaving the caller to decompress the data. - fn blob_reader(&'a mut self, blob_metadata: &BlobMetadata) -> Result; -} - /// `AsyncReader` defines an asynchronous reader for puffin data. #[async_trait] pub trait AsyncReader<'a> { diff --git a/src/puffin/src/file_format/reader/file.rs b/src/puffin/src/file_format/reader/file.rs index 3736ed5d2d8d..31e8e10bc4d5 100644 --- a/src/puffin/src/file_format/reader/file.rs +++ b/src/puffin/src/file_format/reader/file.rs @@ -12,20 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::io::{self, SeekFrom}; - use async_trait::async_trait; use common_base::range_read::RangeReader; use snafu::{ensure, ResultExt}; use crate::blob_metadata::BlobMetadata; -use crate::error::{ - MagicNotMatchedSnafu, ReadSnafu, Result, SeekSnafu, UnexpectedPuffinFileSizeSnafu, - UnsupportedDecompressionSnafu, -}; -use crate::file_format::reader::footer::FooterParser; -use crate::file_format::reader::{AsyncReader, SyncReader}; -use crate::file_format::{MAGIC, MAGIC_SIZE, MIN_FILE_SIZE}; +use crate::error::{ReadSnafu, Result, UnexpectedPuffinFileSizeSnafu}; +use crate::file_format::reader::footer::DEFAULT_PREFETCH_SIZE; +use crate::file_format::reader::{AsyncReader, PuffinFileFooterReader}; +use crate::file_format::MIN_FILE_SIZE; use crate::file_metadata::FileMetadata; use crate::partial_reader::PartialReader; @@ -72,45 +67,6 @@ impl PuffinFileReader { } } -impl<'a, R: io::Read + io::Seek + 'a> SyncReader<'a> for PuffinFileReader { - type Reader = PartialReader<&'a mut R>; - - fn metadata(&mut self) -> Result { - if let Some(metadata) = &self.metadata { - return Ok(metadata.clone()); - } - - // check the magic - let mut magic = [0; MAGIC_SIZE as usize]; - self.source.read_exact(&mut magic).context(ReadSnafu)?; - ensure!(magic == MAGIC, MagicNotMatchedSnafu); - - let file_size = self.get_file_size_sync()?; - - // parse the footer - let metadata = FooterParser::new(&mut self.source, file_size).parse_sync()?; - self.metadata = Some(metadata.clone()); - Ok(metadata) - } - - fn blob_reader(&'a mut self, blob_metadata: &BlobMetadata) -> Result { - // TODO(zhongzc): support decompression - let compression = blob_metadata.compression_codec.as_ref(); - ensure!( - compression.is_none(), - UnsupportedDecompressionSnafu { - decompression: compression.unwrap().to_string() - } - ); - - Ok(PartialReader::new( - &mut self.source, - blob_metadata.offset as _, - blob_metadata.length as _, - )) - } -} - #[async_trait] impl<'a, R: RangeReader + 'a> AsyncReader<'a> for PuffinFileReader { type Reader = PartialReader<&'a mut R>; @@ -119,17 +75,10 @@ impl<'a, R: RangeReader + 'a> AsyncReader<'a> for PuffinFileReader { if let Some(metadata) = &self.metadata { return Ok(metadata.clone()); } - - // check the magic - let magic = self.source.read(0..MAGIC_SIZE).await.context(ReadSnafu)?; - ensure!(*magic == MAGIC, MagicNotMatchedSnafu); - let file_size = self.get_file_size_async().await?; - - // parse the footer - let metadata = FooterParser::new(&mut self.source, file_size) - .parse_async() - .await?; + let mut reader = PuffinFileFooterReader::new(&mut self.source, file_size) + .with_prefetch_size(DEFAULT_PREFETCH_SIZE); + let metadata = reader.metadata().await?; self.metadata = Some(metadata.clone()); Ok(metadata) } @@ -143,14 +92,6 @@ impl<'a, R: RangeReader + 'a> AsyncReader<'a> for 
PuffinFileReader { } } -impl PuffinFileReader { - fn get_file_size_sync(&mut self) -> Result { - let file_size = self.source.seek(SeekFrom::End(0)).context(SeekSnafu)?; - Self::validate_file_size(file_size)?; - Ok(file_size) - } -} - impl PuffinFileReader { async fn get_file_size_async(&mut self) -> Result { let file_size = self diff --git a/src/puffin/src/file_format/reader/footer.rs b/src/puffin/src/file_format/reader/footer.rs index aa764fd32a21..d0cd1e8ed4f0 100644 --- a/src/puffin/src/file_format/reader/footer.rs +++ b/src/puffin/src/file_format/reader/footer.rs @@ -12,240 +12,98 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::io::{self, Cursor, SeekFrom}; +use std::io::Cursor; use common_base::range_read::RangeReader; use snafu::{ensure, ResultExt}; use crate::error::{ - BytesToIntegerSnafu, DeserializeJsonSnafu, InvalidBlobAreaEndSnafu, InvalidBlobOffsetSnafu, - Lz4DecompressionSnafu, MagicNotMatchedSnafu, ParseStageNotMatchSnafu, ReadSnafu, Result, - SeekSnafu, UnexpectedFooterPayloadSizeSnafu, + DeserializeJsonSnafu, InvalidPuffinFooterSnafu, Lz4DecompressionSnafu, MagicNotMatchedSnafu, + ReadSnafu, Result, UnexpectedFooterPayloadSizeSnafu, }; use crate::file_format::{Flags, FLAGS_SIZE, MAGIC, MAGIC_SIZE, MIN_FILE_SIZE, PAYLOAD_SIZE_SIZE}; use crate::file_metadata::FileMetadata; -/// Parser for the footer of a Puffin data file +/// The default prefetch size for the footer reader. +pub const DEFAULT_PREFETCH_SIZE: u64 = 1024; // 1KiB + +/// Reader for the footer of a Puffin data file /// /// The footer has a specific layout that needs to be read and parsed to /// extract metadata about the file, which is encapsulated in the [`FileMetadata`] type. /// +/// This reader supports prefetching, allowing for more efficient reading +/// of the footer by fetching additional data ahead of time. +/// /// ```text /// Footer layout: HeadMagic Payload PayloadSize Flags FootMagic /// [4] [?] [4] [4] [4] /// ``` -pub struct FooterParser { - // The underlying IO source +pub struct PuffinFileFooterReader { + /// The source of the puffin file source: R, - - // The size of the file, used for calculating offsets to read from - file_size: u64, -} - -impl FooterParser { - pub fn new(source: R, file_size: u64) -> Self { - Self { source, file_size } - } -} - -impl FooterParser { - /// Parses the footer from the IO source in a synchronous manner. - pub fn parse_sync(&mut self) -> Result { - let mut parser = StageParser::new(self.file_size); - - let mut buf = vec![]; - while let Some(byte_to_read) = parser.next_to_read() { - self.source - .seek(SeekFrom::Start(byte_to_read.offset)) - .context(SeekSnafu)?; - let size = byte_to_read.size as usize; - - buf.resize(size, 0); - let buf = &mut buf[..size]; - - self.source.read_exact(buf).context(ReadSnafu)?; - - parser.consume_bytes(buf)?; - } - - parser.finish() - } -} - -impl FooterParser { - /// Parses the footer from the IO source in a asynchronous manner. - pub async fn parse_async(&mut self) -> Result { - let mut parser = StageParser::new(self.file_size); - - let mut buf = vec![]; - while let Some(byte_to_read) = parser.next_to_read() { - buf.clear(); - let range = byte_to_read.offset..byte_to_read.offset + byte_to_read.size; - self.source - .read_into(range, &mut buf) - .await - .context(ReadSnafu)?; - parser.consume_bytes(&buf)?; - } - - parser.finish() - } -} - -/// The internal stages of parsing the footer. 
-/// This enum allows the StageParser to keep track of which part -/// of the footer needs to be parsed next. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum ParseStage { - FootMagic, - Flags, - PayloadSize, - Payload, - HeadMagic, - Done, -} - -/// Manages the parsing process of the file's footer. -struct StageParser { - /// Current stage in the parsing sequence of the footer. - stage: ParseStage, - - /// Total file size; used for calculating offsets to read from. + /// The content length of the puffin file file_size: u64, - - /// Flags from the footer, set when the `Flags` field is parsed. - flags: Flags, - - /// Size of the footer's payload, set when the `PayloadSize` is parsed. - payload_size: u64, - - /// Metadata from the footer's payload, set when the `Payload` is parsed. - metadata: Option, -} - -/// Represents a read operation that needs to be performed, including the -/// offset from the start of the file and the number of bytes to read. -struct BytesToRead { - offset: u64, - size: u64, + /// The prefetch footer size + prefetch_size: Option, } -impl StageParser { - fn new(file_size: u64) -> Self { +impl<'a, R: RangeReader + 'a> PuffinFileFooterReader { + pub fn new(source: R, content_len: u64) -> Self { Self { - stage: ParseStage::FootMagic, - file_size, - payload_size: 0, - flags: Flags::empty(), - metadata: None, + source, + file_size: content_len, + prefetch_size: None, } } - /// Determines the next segment of bytes to read based on the current parsing stage. - /// This method returns information like the offset and size of the next read, - /// or None if parsing is complete. - fn next_to_read(&self) -> Option { - if self.stage == ParseStage::Done { - return None; - } - - let btr = match self.stage { - ParseStage::FootMagic => BytesToRead { - offset: self.foot_magic_offset(), - size: MAGIC_SIZE, - }, - ParseStage::Flags => BytesToRead { - offset: self.flags_offset(), - size: FLAGS_SIZE, - }, - ParseStage::PayloadSize => BytesToRead { - offset: self.payload_size_offset(), - size: PAYLOAD_SIZE_SIZE, - }, - ParseStage::Payload => BytesToRead { - offset: self.payload_offset(), - size: self.payload_size, - }, - ParseStage::HeadMagic => BytesToRead { - offset: self.head_magic_offset(), - size: MAGIC_SIZE, - }, - ParseStage::Done => unreachable!(), - }; - - Some(btr) + fn prefetch_size(&self) -> u64 { + self.prefetch_size.unwrap_or(MIN_FILE_SIZE) } - /// Processes the bytes that have been read according to the current parsing stage - /// and advances the parsing stage. It ensures the correct sequence of bytes is - /// encountered and stores the necessary information in the `StageParser`. 
- fn consume_bytes(&mut self, bytes: &[u8]) -> Result<()> { - match self.stage { - ParseStage::FootMagic => { - ensure!(bytes == MAGIC, MagicNotMatchedSnafu); - self.stage = ParseStage::Flags; - } - ParseStage::Flags => { - self.flags = Self::parse_flags(bytes)?; - self.stage = ParseStage::PayloadSize; - } - ParseStage::PayloadSize => { - self.payload_size = Self::parse_payload_size(bytes)?; - self.validate_payload_size()?; - self.stage = ParseStage::Payload; - } - ParseStage::Payload => { - self.metadata = Some(self.parse_payload(bytes)?); - self.validate_metadata()?; - self.stage = ParseStage::HeadMagic; - } - ParseStage::HeadMagic => { - ensure!(bytes == MAGIC, MagicNotMatchedSnafu); - self.stage = ParseStage::Done; - } - ParseStage::Done => unreachable!(), - } - - Ok(()) + pub fn with_prefetch_size(mut self, prefetch_size: u64) -> Self { + self.prefetch_size = Some(prefetch_size.max(MIN_FILE_SIZE)); + self } - /// Finalizes the parsing process, ensuring all stages are complete, and returns - /// the parsed `FileMetadata`. It converts the raw footer payload into structured data. - fn finish(self) -> Result { - ensure!( - self.stage == ParseStage::Done, - ParseStageNotMatchSnafu { - expected: format!("{:?}", ParseStage::Done), - actual: format!("{:?}", self.stage), - } - ); + pub async fn metadata(&'a mut self) -> Result { + // Note: prefetch > content_len is allowed, since we're using saturating_sub. + let footer_start = self.file_size.saturating_sub(self.prefetch_size()); + let suffix = self + .source + .read(footer_start..self.file_size) + .await + .context(ReadSnafu)?; + let suffix_len = suffix.len(); - Ok(self.metadata.unwrap()) - } + // check the magic + let magic = Self::read_tailing_four_bytes(&suffix)?; + ensure!(magic == MAGIC, MagicNotMatchedSnafu); - fn parse_flags(bytes: &[u8]) -> Result { - let n = u32::from_le_bytes(bytes.try_into().context(BytesToIntegerSnafu)?); - Ok(Flags::from_bits_truncate(n)) - } - - fn parse_payload_size(bytes: &[u8]) -> Result { - let n = i32::from_le_bytes(bytes.try_into().context(BytesToIntegerSnafu)?); - ensure!(n >= 0, UnexpectedFooterPayloadSizeSnafu { size: n }); - Ok(n as u64) - } + let flags = self.decode_flags(&suffix[..suffix_len - MAGIC_SIZE as usize])?; + let length = self.decode_payload_size( + &suffix[..suffix_len - MAGIC_SIZE as usize - FLAGS_SIZE as usize], + )?; + let footer_size = PAYLOAD_SIZE_SIZE + FLAGS_SIZE + MAGIC_SIZE; - fn validate_payload_size(&self) -> Result<()> { - ensure!( - self.payload_size <= self.file_size - MIN_FILE_SIZE, - UnexpectedFooterPayloadSizeSnafu { - size: self.payload_size as i32 - } - ); - Ok(()) + // Did not fetch the entire file metadata in the initial read, need to make a second request. 
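+        // Worked example (illustrative numbers): with the default 1 KiB prefetch,
+        // a 4 KiB footer payload does not fit in the prefetched suffix, so exactly
+        // one extra ranged read of `metadata_start..self.file_size - footer_size`
+        // fetches the payload; a 200-byte payload is already inside the suffix and
+        // is sliced out locally without any additional I/O.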
+ if length > suffix_len as u64 - footer_size { + let metadata_start = self.file_size - length - footer_size; + let meta = self + .source + .read(metadata_start..self.file_size - footer_size) + .await + .context(ReadSnafu)?; + self.parse_payload(&flags, &meta) + } else { + let metadata_start = self.file_size - length - footer_size - footer_start; + let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize]; + self.parse_payload(&flags, meta) + } } - fn parse_payload(&self, bytes: &[u8]) -> Result { - if self.flags.contains(Flags::FOOTER_PAYLOAD_COMPRESSED_LZ4) { + fn parse_payload(&self, flags: &Flags, bytes: &[u8]) -> Result { + if flags.contains(Flags::FOOTER_PAYLOAD_COMPRESSED_LZ4) { let decoder = lz4_flex::frame::FrameDecoder::new(Cursor::new(bytes)); let res = serde_json::from_reader(decoder).context(Lz4DecompressionSnafu)?; Ok(res) @@ -254,54 +112,35 @@ impl StageParser { } } - fn validate_metadata(&self) -> Result<()> { - let metadata = self.metadata.as_ref().expect("metadata is not set"); - - let mut next_blob_offset = MAGIC_SIZE; - // check blob offsets - for blob in &metadata.blobs { - ensure!( - blob.offset as u64 == next_blob_offset, - InvalidBlobOffsetSnafu { - offset: blob.offset - } - ); - next_blob_offset += blob.length as u64; - } - - let blob_area_end = metadata - .blobs - .last() - .map_or(MAGIC_SIZE, |b| (b.offset + b.length) as u64); - ensure!( - blob_area_end == self.head_magic_offset(), - InvalidBlobAreaEndSnafu { - offset: blob_area_end - } - ); - - Ok(()) - } + fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> { + let suffix_len = suffix.len(); + ensure!(suffix_len >= 4, InvalidPuffinFooterSnafu); + let mut bytes = [0; 4]; + bytes.copy_from_slice(&suffix[suffix_len - 4..suffix_len]); - fn foot_magic_offset(&self) -> u64 { - self.file_size - MAGIC_SIZE + Ok(bytes) } - fn flags_offset(&self) -> u64 { - self.file_size - MAGIC_SIZE - FLAGS_SIZE + fn decode_flags(&self, suffix: &[u8]) -> Result { + let flags = u32::from_le_bytes(Self::read_tailing_four_bytes(suffix)?); + Ok(Flags::from_bits_truncate(flags)) } - fn payload_size_offset(&self) -> u64 { - self.file_size - MAGIC_SIZE - FLAGS_SIZE - PAYLOAD_SIZE_SIZE - } + fn decode_payload_size(&self, suffix: &[u8]) -> Result { + let payload_size = i32::from_le_bytes(Self::read_tailing_four_bytes(suffix)?); - fn payload_offset(&self) -> u64 { - // `validate_payload_size` ensures that this subtraction will not overflow - self.file_size - MAGIC_SIZE - FLAGS_SIZE - PAYLOAD_SIZE_SIZE - self.payload_size - } + ensure!( + payload_size >= 0, + UnexpectedFooterPayloadSizeSnafu { size: payload_size } + ); + let payload_size = payload_size as u64; + ensure!( + payload_size <= self.file_size - MIN_FILE_SIZE, + UnexpectedFooterPayloadSizeSnafu { + size: self.file_size as i32 + } + ); - fn head_magic_offset(&self) -> u64 { - // `validate_payload_size` ensures that this subtraction will not overflow - self.file_size - MAGIC_SIZE * 2 - FLAGS_SIZE - PAYLOAD_SIZE_SIZE - self.payload_size + Ok(payload_size) } } diff --git a/src/puffin/src/tests.rs b/src/puffin/src/tests.rs index a152d4124bd6..a3bb48587924 100644 --- a/src/puffin/src/tests.rs +++ b/src/puffin/src/tests.rs @@ -13,26 +13,14 @@ // limitations under the License. 
use std::collections::HashMap; -use std::fs::File; -use std::io::{Cursor, Read}; use std::vec; use common_base::range_read::{FileReader, RangeReader}; use futures::io::Cursor as AsyncCursor; -use crate::file_format::reader::{AsyncReader, PuffinFileReader, SyncReader}; -use crate::file_format::writer::{AsyncWriter, Blob, PuffinFileWriter, SyncWriter}; - -#[test] -fn test_read_empty_puffin_sync() { - let path = "src/tests/resources/empty-puffin-uncompressed.puffin"; - - let file = File::open(path).unwrap(); - let mut reader = PuffinFileReader::new(file); - let metadata = reader.metadata().unwrap(); - assert_eq!(metadata.properties.len(), 0); - assert_eq!(metadata.blobs.len(), 0); -} +use crate::file_format::reader::{AsyncReader, PuffinFileFooterReader, PuffinFileReader}; +use crate::file_format::writer::{AsyncWriter, Blob, PuffinFileWriter}; +use crate::file_metadata::FileMetadata; #[tokio::test] async fn test_read_empty_puffin_async() { @@ -45,39 +33,37 @@ async fn test_read_empty_puffin_async() { assert_eq!(metadata.blobs.len(), 0); } -#[test] -fn test_sample_metric_data_puffin_sync() { - let path = "src/tests/resources/sample-metric-data-uncompressed.puffin"; - - let file = File::open(path).unwrap(); - let mut reader = PuffinFileReader::new(file); - let metadata = reader.metadata().unwrap(); - - assert_eq!(metadata.properties.len(), 1); - assert_eq!( - metadata.properties.get("created-by"), - Some(&"Test 1234".to_string()) - ); - - assert_eq!(metadata.blobs.len(), 2); - assert_eq!(metadata.blobs[0].blob_type, "some-blob"); - assert_eq!(metadata.blobs[0].offset, 4); - assert_eq!(metadata.blobs[0].length, 9); - - assert_eq!(metadata.blobs[1].blob_type, "some-other-blob"); - assert_eq!(metadata.blobs[1].offset, 13); - assert_eq!(metadata.blobs[1].length, 83); +async fn test_read_puffin_file_metadata( + path: &str, + file_size: u64, + expeccted_metadata: FileMetadata, +) { + for prefetch_size in [0, file_size / 2, file_size, file_size + 10] { + let reader = FileReader::new(path).await.unwrap(); + let mut footer_reader = PuffinFileFooterReader::new(reader, file_size); + if prefetch_size > 0 { + footer_reader = footer_reader.with_prefetch_size(prefetch_size); + } + let metadata = footer_reader.metadata().await.unwrap(); + assert_eq!(metadata.properties, expeccted_metadata.properties,); + assert_eq!(metadata.blobs, expeccted_metadata.blobs); + } +} - let mut some_blob = reader.blob_reader(&metadata.blobs[0]).unwrap(); - let mut buf = String::new(); - some_blob.read_to_string(&mut buf).unwrap(); - assert_eq!(buf, "abcdefghi"); +#[tokio::test] +async fn test_read_puffin_file_metadata_async() { + let paths = vec![ + "src/tests/resources/empty-puffin-uncompressed.puffin", + "src/tests/resources/sample-metric-data-uncompressed.puffin", + ]; + for path in paths { + let mut reader = FileReader::new(path).await.unwrap(); + let file_size = reader.metadata().await.unwrap().content_length; + let mut reader = PuffinFileReader::new(reader); + let metadata = reader.metadata().await.unwrap(); - let mut some_other_blob = reader.blob_reader(&metadata.blobs[1]).unwrap(); - let mut buf = Vec::new(); - some_other_blob.read_to_end(&mut buf).unwrap(); - let expected = include_bytes!("tests/resources/sample-metric-data.blob"); - assert_eq!(buf, expected); + test_read_puffin_file_metadata(path, file_size, metadata).await; + } } #[tokio::test] @@ -113,38 +99,6 @@ async fn test_sample_metric_data_puffin_async() { assert_eq!(buf, expected); } -#[test] -fn test_writer_reader_with_empty_sync() { - fn 
test_writer_reader_with_empty_sync(footer_compressed: bool) { - let mut buf = Cursor::new(vec![]); - - let mut writer = PuffinFileWriter::new(&mut buf); - writer.set_properties(HashMap::from([( - "created-by".to_string(), - "Test 1234".to_string(), - )])); - - writer.set_footer_lz4_compressed(footer_compressed); - let written_bytes = writer.finish().unwrap(); - assert!(written_bytes > 0); - - let mut buf = Cursor::new(buf.into_inner()); - let mut reader = PuffinFileReader::new(&mut buf); - let metadata = reader.metadata().unwrap(); - - assert_eq!(metadata.properties.len(), 1); - assert_eq!( - metadata.properties.get("created-by"), - Some(&"Test 1234".to_string()) - ); - - assert_eq!(metadata.blobs.len(), 0); - } - - test_writer_reader_with_empty_sync(false); - test_writer_reader_with_empty_sync(true); -} - #[tokio::test] async fn test_writer_reader_empty_async() { async fn test_writer_reader_empty_async(footer_compressed: bool) { @@ -176,76 +130,6 @@ async fn test_writer_reader_empty_async() { test_writer_reader_empty_async(true).await; } -#[test] -fn test_writer_reader_sync() { - fn test_writer_reader_sync(footer_compressed: bool) { - let mut buf = Cursor::new(vec![]); - - let mut writer = PuffinFileWriter::new(&mut buf); - - let blob1 = "abcdefghi"; - writer - .add_blob(Blob { - compressed_data: Cursor::new(&blob1), - blob_type: "some-blob".to_string(), - properties: Default::default(), - compression_codec: None, - }) - .unwrap(); - - let blob2 = include_bytes!("tests/resources/sample-metric-data.blob"); - writer - .add_blob(Blob { - compressed_data: Cursor::new(&blob2), - blob_type: "some-other-blob".to_string(), - properties: Default::default(), - compression_codec: None, - }) - .unwrap(); - - writer.set_properties(HashMap::from([( - "created-by".to_string(), - "Test 1234".to_string(), - )])); - - writer.set_footer_lz4_compressed(footer_compressed); - let written_bytes = writer.finish().unwrap(); - assert!(written_bytes > 0); - - let mut buf = Cursor::new(buf.into_inner()); - let mut reader = PuffinFileReader::new(&mut buf); - let metadata = reader.metadata().unwrap(); - - assert_eq!(metadata.properties.len(), 1); - assert_eq!( - metadata.properties.get("created-by"), - Some(&"Test 1234".to_string()) - ); - - assert_eq!(metadata.blobs.len(), 2); - assert_eq!(metadata.blobs[0].blob_type, "some-blob"); - assert_eq!(metadata.blobs[0].offset, 4); - assert_eq!(metadata.blobs[0].length, 9); - - assert_eq!(metadata.blobs[1].blob_type, "some-other-blob"); - assert_eq!(metadata.blobs[1].offset, 13); - assert_eq!(metadata.blobs[1].length, 83); - - let mut some_blob = reader.blob_reader(&metadata.blobs[0]).unwrap(); - let mut buf = String::new(); - some_blob.read_to_string(&mut buf).unwrap(); - assert_eq!(buf, blob1); - - let mut some_other_blob = reader.blob_reader(&metadata.blobs[1]).unwrap(); - let mut buf = Vec::new(); - some_other_blob.read_to_end(&mut buf).unwrap(); - assert_eq!(buf, blob2); - } - - test_writer_reader_sync(false); - test_writer_reader_sync(true); -} - #[tokio::test] async fn test_writer_reader_async() { async fn test_writer_reader_async(footer_compressed: bool) { From 8c1959c580fdb3c5ecafdb6bc4fb6395a80ebedf Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 12 Dec 2024 11:49:54 +0800 Subject: [PATCH 30/36] feat: add prefetch support to `InvertedIndexFooterReader` for reduced I/O time (#5146) * feat: add prefetch support to `InvertedIndeFooterReader` * chore: correct struct name * chore: apply suggestions from CR --- src/index/src/inverted_index/error.rs | 16 ++- 
.../src/inverted_index/format/reader/blob.rs | 6 +- .../inverted_index/format/reader/footer.rs | 135 ++++++++++++------ src/index/src/lib.rs | 1 + 4 files changed, 114 insertions(+), 44 deletions(-) diff --git a/src/index/src/inverted_index/error.rs b/src/index/src/inverted_index/error.rs index 49816e63c463..7e861beda6d1 100644 --- a/src/index/src/inverted_index/error.rs +++ b/src/index/src/inverted_index/error.rs @@ -68,6 +68,18 @@ pub enum Error { location: Location, }, + #[snafu(display("Blob size too small"))] + BlobSizeTooSmall { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid footer payload size"))] + InvalidFooterPayloadSize { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Unexpected inverted index footer payload size, max: {max_payload_size}, actual: {actual_payload_size}"))] UnexpectedFooterPayloadSize { max_payload_size: u64, @@ -220,7 +232,9 @@ impl ErrorExt for Error { | KeysApplierUnexpectedPredicates { .. } | CommonIo { .. } | UnknownIntermediateCodecMagic { .. } - | FstCompile { .. } => StatusCode::Unexpected, + | FstCompile { .. } + | InvalidFooterPayloadSize { .. } + | BlobSizeTooSmall { .. } => StatusCode::Unexpected, ParseRegex { .. } | ParseDFA { .. } diff --git a/src/index/src/inverted_index/format/reader/blob.rs b/src/index/src/inverted_index/format/reader/blob.rs index ace0e5c48536..de34cd36f849 100644 --- a/src/index/src/inverted_index/format/reader/blob.rs +++ b/src/index/src/inverted_index/format/reader/blob.rs @@ -19,8 +19,9 @@ use common_base::range_read::RangeReader; use greptime_proto::v1::index::InvertedIndexMetas; use snafu::{ensure, ResultExt}; +use super::footer::DEFAULT_PREFETCH_SIZE; use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu}; -use crate::inverted_index::format::reader::footer::InvertedIndeFooterReader; +use crate::inverted_index::format::reader::footer::InvertedIndexFooterReader; use crate::inverted_index::format::reader::InvertedIndexReader; use crate::inverted_index::format::MIN_BLOB_SIZE; @@ -72,7 +73,8 @@ impl InvertedIndexReader for InvertedIndexBlobReader { let blob_size = metadata.content_length; Self::validate_blob_size(blob_size)?; - let mut footer_reader = InvertedIndeFooterReader::new(&mut self.source, blob_size); + let mut footer_reader = InvertedIndexFooterReader::new(&mut self.source, blob_size) + .with_prefetch_size(DEFAULT_PREFETCH_SIZE); footer_reader.metadata().await.map(Arc::new) } } diff --git a/src/index/src/inverted_index/format/reader/footer.rs b/src/index/src/inverted_index/format/reader/footer.rs index 1f35237711ce..c025ecf52ecd 100644 --- a/src/index/src/inverted_index/format/reader/footer.rs +++ b/src/index/src/inverted_index/format/reader/footer.rs @@ -18,53 +18,88 @@ use prost::Message; use snafu::{ensure, ResultExt}; use crate::inverted_index::error::{ - CommonIoSnafu, DecodeProtoSnafu, Result, UnexpectedFooterPayloadSizeSnafu, - UnexpectedOffsetSizeSnafu, UnexpectedZeroSegmentRowCountSnafu, + BlobSizeTooSmallSnafu, CommonIoSnafu, DecodeProtoSnafu, InvalidFooterPayloadSizeSnafu, Result, + UnexpectedFooterPayloadSizeSnafu, UnexpectedOffsetSizeSnafu, + UnexpectedZeroSegmentRowCountSnafu, }; use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE; -/// InvertedIndeFooterReader is for reading the footer section of the blob. -pub struct InvertedIndeFooterReader { +pub const DEFAULT_PREFETCH_SIZE: u64 = 1024; // 1KiB + +/// InvertedIndexFooterReader is for reading the footer section of the blob. 
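+/// Like `PuffinFileFooterReader`, it fetches the trailing `prefetch_size` bytes
+/// (never less than `FOOTER_PAYLOAD_SIZE_SIZE`) in a single ranged read and only
+/// issues a second read when the footer payload does not fit in that suffix.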
+pub struct InvertedIndexFooterReader { source: R, blob_size: u64, + prefetch_size: Option, } -impl InvertedIndeFooterReader { +impl InvertedIndexFooterReader { pub fn new(source: R, blob_size: u64) -> Self { - Self { source, blob_size } + Self { + source, + blob_size, + prefetch_size: None, + } + } + + /// Set the prefetch size for the footer reader. + pub fn with_prefetch_size(mut self, prefetch_size: u64) -> Self { + self.prefetch_size = Some(prefetch_size.max(FOOTER_PAYLOAD_SIZE_SIZE)); + self + } + + pub fn prefetch_size(&self) -> u64 { + self.prefetch_size.unwrap_or(FOOTER_PAYLOAD_SIZE_SIZE) } } -impl InvertedIndeFooterReader { +impl InvertedIndexFooterReader { pub async fn metadata(&mut self) -> Result { - let payload_size = self.read_payload_size().await?; - let metas = self.read_payload(payload_size).await?; - Ok(metas) - } + ensure!( + self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE, + BlobSizeTooSmallSnafu + ); - async fn read_payload_size(&mut self) -> Result { - let mut size_buf = [0u8; FOOTER_PAYLOAD_SIZE_SIZE as usize]; - let end = self.blob_size; - let start = end - FOOTER_PAYLOAD_SIZE_SIZE; - self.source - .read_into(start..end, &mut &mut size_buf[..]) + let footer_start = self.blob_size.saturating_sub(self.prefetch_size()); + let suffix = self + .source + .read(footer_start..self.blob_size) .await .context(CommonIoSnafu)?; + let suffix_len = suffix.len(); + let length = u32::from_le_bytes(Self::read_tailing_four_bytes(&suffix)?) as u64; + self.validate_payload_size(length)?; + + let footer_size = FOOTER_PAYLOAD_SIZE_SIZE; + + // Did not fetch the entire file metadata in the initial read, need to make a second request. + if length > suffix_len as u64 - footer_size { + let metadata_start = self.blob_size - length - footer_size; + let meta = self + .source + .read(metadata_start..self.blob_size - footer_size) + .await + .context(CommonIoSnafu)?; + self.parse_payload(&meta, length) + } else { + let metadata_start = self.blob_size - length - footer_size - footer_start; + let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize]; + self.parse_payload(meta, length) + } + } - let payload_size = u32::from_le_bytes(size_buf) as u64; - self.validate_payload_size(payload_size)?; + fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> { + let suffix_len = suffix.len(); + ensure!(suffix_len >= 4, InvalidFooterPayloadSizeSnafu); + let mut bytes = [0; 4]; + bytes.copy_from_slice(&suffix[suffix_len - 4..suffix_len]); - Ok(payload_size) + Ok(bytes) } - async fn read_payload(&mut self, payload_size: u64) -> Result { - let end = self.blob_size - FOOTER_PAYLOAD_SIZE_SIZE; - let start = end - payload_size; - let bytes = self.source.read(start..end).await.context(CommonIoSnafu)?; - - let metas = InvertedIndexMetas::decode(&*bytes).context(DecodeProtoSnafu)?; + fn parse_payload(&mut self, bytes: &[u8], payload_size: u64) -> Result { + let metas = InvertedIndexMetas::decode(bytes).context(DecodeProtoSnafu)?; self.validate_metas(&metas, payload_size)?; - Ok(metas) } @@ -113,9 +148,12 @@ impl InvertedIndeFooterReader { #[cfg(test)] mod tests { + use std::assert_matches::assert_matches; + use prost::Message; use super::*; + use crate::inverted_index::error::Error; fn create_test_payload(meta: InvertedIndexMeta) -> Vec { let mut metas = InvertedIndexMetas { @@ -141,14 +179,18 @@ mod tests { let mut payload_buf = create_test_payload(meta); let blob_size = payload_buf.len() as u64; - let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size); - let payload_size = 
reader.read_payload_size().await.unwrap(); - let metas = reader.read_payload(payload_size).await.unwrap(); + for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] { + let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size); + if prefetch > 0 { + reader = reader.with_prefetch_size(prefetch); + } - assert_eq!(metas.metas.len(), 1); - let index_meta = &metas.metas.get("test").unwrap(); - assert_eq!(index_meta.name, "test"); + let metas = reader.metadata().await.unwrap(); + assert_eq!(metas.metas.len(), 1); + let index_meta = &metas.metas.get("test").unwrap(); + assert_eq!(index_meta.name, "test"); + } } #[tokio::test] @@ -157,14 +199,20 @@ mod tests { name: "test".to_string(), ..Default::default() }; - let mut payload_buf = create_test_payload(meta); payload_buf.push(0xff); // Add an extra byte to corrupt the footer let blob_size = payload_buf.len() as u64; - let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size); - let payload_size_result = reader.read_payload_size().await; - assert!(payload_size_result.is_err()); + for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] { + let blob_size = payload_buf.len() as u64; + let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size); + if prefetch > 0 { + reader = reader.with_prefetch_size(prefetch); + } + + let result = reader.metadata().await; + assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. })); + } } #[tokio::test] @@ -178,10 +226,15 @@ mod tests { let mut payload_buf = create_test_payload(meta); let blob_size = payload_buf.len() as u64; - let mut reader = InvertedIndeFooterReader::new(&mut payload_buf, blob_size); - let payload_size = reader.read_payload_size().await.unwrap(); - let payload_result = reader.read_payload(payload_size).await; - assert!(payload_result.is_err()); + for prefetch in [0, blob_size / 2, blob_size, blob_size + 10] { + let mut reader = InvertedIndexFooterReader::new(&mut payload_buf, blob_size); + if prefetch > 0 { + reader = reader.with_prefetch_size(prefetch); + } + + let result = reader.metadata().await; + assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. })); + } } } diff --git a/src/index/src/lib.rs b/src/index/src/lib.rs index 197fc01818c0..5e2e41166863 100644 --- a/src/index/src/lib.rs +++ b/src/index/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. 
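+// The nightly `assert_matches` feature is enabled for the new footer-reader tests,
+// which assert on specific `Error` variants.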
#![feature(iter_partition_in_place)] +#![feature(assert_matches)] pub mod fulltext_index; pub mod inverted_index; From d53fbcb9362892623da9a8d6475c82a4ac250faa Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 12 Dec 2024 12:09:36 +0800 Subject: [PATCH 31/36] feat: introduce `PuffinMetadataCache` (#5148) * feat: introduce `PuffinMetadataCache` * refactor: remove too_many_arguments * chore: fmt toml --- Cargo.lock | 1 + src/mito2/src/cache.rs | 17 ++++++ src/mito2/src/config.rs | 4 ++ src/mito2/src/read/scan_region.rs | 11 +++- src/mito2/src/sst/file.rs | 1 + .../src/sst/index/inverted_index/applier.rs | 42 ++++++++++--- .../index/inverted_index/applier/builder.rs | 55 ++++++++++++----- .../inverted_index/applier/builder/between.rs | 10 ---- .../applier/builder/comparison.rs | 8 --- .../inverted_index/applier/builder/eq_list.rs | 14 ----- .../inverted_index/applier/builder/in_list.rs | 10 ---- .../applier/builder/regex_match.rs | 8 --- .../src/sst/index/inverted_index/creator.rs | 7 ++- src/mito2/src/worker.rs | 1 + src/puffin/Cargo.toml | 1 + src/puffin/src/blob_metadata.rs | 14 +++++ src/puffin/src/file_format/reader/file.rs | 5 ++ src/puffin/src/file_metadata.rs | 16 +++++ src/puffin/src/puffin_manager.rs | 1 + src/puffin/src/puffin_manager/cache.rs | 60 +++++++++++++++++++ .../src/puffin_manager/fs_puffin_manager.rs | 17 +++++- .../fs_puffin_manager/reader.rs | 39 ++++++++++-- 22 files changed, 258 insertions(+), 84 deletions(-) create mode 100644 src/puffin/src/puffin_manager/cache.rs diff --git a/Cargo.lock b/Cargo.lock index 311caafcb2fe..e57a6542afbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8883,6 +8883,7 @@ dependencies = [ "lz4_flex 0.11.3", "moka", "pin-project", + "prometheus", "serde", "serde_json", "sha2", diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index 7d977a328ca1..7018b039d62e 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -32,6 +32,7 @@ use moka::notification::RemovalCause; use moka::sync::Cache; use parquet::column::page::Page; use parquet::file::metadata::ParquetMetaData; +use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef}; use store_api::storage::{ConcreteDataType, RegionId, TimeSeriesRowSelector}; use crate::cache::cache_size::parquet_meta_size; @@ -68,6 +69,8 @@ pub struct CacheManager { write_cache: Option, /// Cache for inverted index. index_cache: Option, + /// Puffin metadata cache. + puffin_metadata_cache: Option, /// Cache for time series selectors. selector_result_cache: Option, } @@ -217,6 +220,10 @@ impl CacheManager { pub(crate) fn index_cache(&self) -> Option<&InvertedIndexCacheRef> { self.index_cache.as_ref() } + + pub(crate) fn puffin_metadata_cache(&self) -> Option<&PuffinMetadataCacheRef> { + self.puffin_metadata_cache.as_ref() + } } /// Increases selector cache miss metrics. @@ -237,6 +244,7 @@ pub struct CacheManagerBuilder { page_cache_size: u64, index_metadata_size: u64, index_content_size: u64, + puffin_metadata_size: u64, write_cache: Option, selector_result_cache_size: u64, } @@ -278,6 +286,12 @@ impl CacheManagerBuilder { self } + /// Sets cache size for puffin metadata. + pub fn puffin_metadata_size(mut self, bytes: u64) -> Self { + self.puffin_metadata_size = bytes; + self + } + /// Sets selector result cache size. 
pub fn selector_result_cache_size(mut self, bytes: u64) -> Self { self.selector_result_cache_size = bytes; @@ -340,6 +354,8 @@ impl CacheManagerBuilder { }); let inverted_index_cache = InvertedIndexCache::new(self.index_metadata_size, self.index_content_size); + let puffin_metadata_cache = + PuffinMetadataCache::new(self.puffin_metadata_size, &CACHE_BYTES); let selector_result_cache = (self.selector_result_cache_size != 0).then(|| { Cache::builder() .max_capacity(self.selector_result_cache_size) @@ -361,6 +377,7 @@ impl CacheManagerBuilder { page_cache, write_cache: self.write_cache, index_cache: Some(Arc::new(inverted_index_cache)), + puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)), selector_result_cache, } } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 9b113027a41b..dda3f4271059 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -304,6 +304,9 @@ pub struct IndexConfig { /// Write buffer size for creating the index. pub write_buffer_size: ReadableSize, + + /// Cache size for metadata of puffin files. Setting it to 0 to disable the cache. + pub metadata_cache_size: ReadableSize, } impl Default for IndexConfig { @@ -312,6 +315,7 @@ impl Default for IndexConfig { aux_path: String::new(), staging_size: ReadableSize::gb(2), write_buffer_size: ReadableSize::mb(8), + metadata_cache_size: ReadableSize::mb(64), } } } diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 19324f119f3e..32b8c90cda02 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -413,11 +413,15 @@ impl ScanRegion { .and_then(|c| c.index_cache()) .cloned(); + let puffin_metadata_cache = self + .cache_manager + .as_ref() + .and_then(|c| c.puffin_metadata_cache()) + .cloned(); + InvertedIndexApplierBuilder::new( self.access_layer.region_dir().to_string(), self.access_layer.object_store().clone(), - file_cache, - index_cache, self.version.metadata.as_ref(), self.version.metadata.inverted_indexed_column_ids( self.version @@ -429,6 +433,9 @@ impl ScanRegion { ), self.access_layer.puffin_manager_factory().clone(), ) + .with_file_cache(file_cache) + .with_index_cache(index_cache) + .with_puffin_metadata_cache(puffin_metadata_cache) .build(&self.request.filters) .inspect_err(|err| warn!(err; "Failed to build invereted index applier")) .ok() diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index 451ec44f1cd2..4353ae55e3e9 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -149,6 +149,7 @@ impl FileMeta { pub fn inverted_index_available(&self) -> bool { self.available_indexes.contains(&IndexType::InvertedIndex) } + pub fn fulltext_index_available(&self) -> bool { self.available_indexes.contains(&IndexType::FulltextIndex) } diff --git a/src/mito2/src/sst/index/inverted_index/applier.rs b/src/mito2/src/sst/index/inverted_index/applier.rs index cac3ffedd74c..bf5206ef44be 100644 --- a/src/mito2/src/sst/index/inverted_index/applier.rs +++ b/src/mito2/src/sst/index/inverted_index/applier.rs @@ -22,6 +22,7 @@ use index::inverted_index::search::index_apply::{ ApplyOutput, IndexApplier, IndexNotFoundStrategy, SearchContext, }; use object_store::ObjectStore; +use puffin::puffin_manager::cache::PuffinMetadataCacheRef; use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader}; use snafu::ResultExt; use store_api::storage::RegionId; @@ -60,6 +61,9 @@ pub(crate) struct InvertedIndexApplier { /// In-memory cache for inverted index. 
inverted_index_cache: Option, + + /// Puffin metadata cache. + puffin_metadata_cache: Option, } pub(crate) type InvertedIndexApplierRef = Arc; @@ -70,8 +74,6 @@ impl InvertedIndexApplier { region_dir: String, region_id: RegionId, store: ObjectStore, - file_cache: Option, - index_cache: Option, index_applier: Box, puffin_manager_factory: PuffinManagerFactory, ) -> Self { @@ -81,13 +83,35 @@ impl InvertedIndexApplier { region_dir, region_id, store, - file_cache, + file_cache: None, index_applier, puffin_manager_factory, - inverted_index_cache: index_cache, + inverted_index_cache: None, + puffin_metadata_cache: None, } } + /// Sets the file cache. + pub fn with_file_cache(mut self, file_cache: Option) -> Self { + self.file_cache = file_cache; + self + } + + /// Sets the index cache. + pub fn with_index_cache(mut self, index_cache: Option) -> Self { + self.inverted_index_cache = index_cache; + self + } + + /// Sets the puffin metadata cache. + pub fn with_puffin_metadata_cache( + mut self, + puffin_metadata_cache: Option, + ) -> Self { + self.puffin_metadata_cache = puffin_metadata_cache; + self + } + /// Applies predicates to the provided SST file id and returns the relevant row group ids pub async fn apply(&self, file_id: FileId) -> Result { let _timer = INDEX_APPLY_ELAPSED @@ -105,6 +129,7 @@ impl InvertedIndexApplier { if let Err(err) = other { warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.") } + self.remote_blob_reader(file_id).await? } }; @@ -157,7 +182,10 @@ impl InvertedIndexApplier { /// Creates a blob reader from the remote index file. async fn remote_blob_reader(&self, file_id: FileId) -> Result { - let puffin_manager = self.puffin_manager_factory.build(self.store.clone()); + let puffin_manager = self + .puffin_manager_factory + .build(self.store.clone()) + .with_puffin_metadata_cache(self.puffin_metadata_cache.clone()); let file_path = location::index_file_path(&self.region_dir, file_id); puffin_manager .reader(&file_path) @@ -219,8 +247,6 @@ mod tests { region_dir.clone(), RegionId::new(0, 0), object_store, - None, - None, Box::new(mock_index_applier), puffin_manager_factory, ); @@ -261,8 +287,6 @@ mod tests { region_dir.clone(), RegionId::new(0, 0), object_store, - None, - None, Box::new(mock_index_applier), puffin_manager_factory, ); diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder.rs b/src/mito2/src/sst/index/inverted_index/applier/builder.rs index 603cf5aa23fd..653679b9fca8 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder.rs @@ -28,6 +28,7 @@ use datatypes::value::Value; use index::inverted_index::search::index_apply::PredicatesIndexApplier; use index::inverted_index::search::predicate::Predicate; use object_store::ObjectStore; +use puffin::puffin_manager::cache::PuffinMetadataCacheRef; use snafu::{OptionExt, ResultExt}; use store_api::metadata::RegionMetadata; use store_api::storage::ColumnId; @@ -65,6 +66,9 @@ pub(crate) struct InvertedIndexApplierBuilder<'a> { /// Cache for inverted index. index_cache: Option, + + /// Cache for puffin metadata. 
+ puffin_metadata_cache: Option, } impl<'a> InvertedIndexApplierBuilder<'a> { @@ -72,8 +76,6 @@ impl<'a> InvertedIndexApplierBuilder<'a> { pub fn new( region_dir: String, object_store: ObjectStore, - file_cache: Option, - index_cache: Option, metadata: &'a RegionMetadata, indexed_column_ids: HashSet, puffin_manager_factory: PuffinManagerFactory, @@ -81,15 +83,37 @@ impl<'a> InvertedIndexApplierBuilder<'a> { Self { region_dir, object_store, - file_cache, metadata, indexed_column_ids, output: HashMap::default(), - index_cache, puffin_manager_factory, + file_cache: None, + index_cache: None, + puffin_metadata_cache: None, } } + /// Sets the file cache. + pub fn with_file_cache(mut self, file_cache: Option) -> Self { + self.file_cache = file_cache; + self + } + + /// Sets the puffin metadata cache. + pub fn with_puffin_metadata_cache( + mut self, + puffin_metadata_cache: Option, + ) -> Self { + self.puffin_metadata_cache = puffin_metadata_cache; + self + } + + /// Sets the index cache. + pub fn with_index_cache(mut self, index_cache: Option) -> Self { + self.index_cache = index_cache; + self + } + /// Consumes the builder to construct an [`InvertedIndexApplier`], optionally returned based on /// the expressions provided. If no predicates match, returns `None`. pub fn build(mut self, exprs: &[Expr]) -> Result> { @@ -108,15 +132,18 @@ impl<'a> InvertedIndexApplierBuilder<'a> { .collect(); let applier = PredicatesIndexApplier::try_from(predicates); - Ok(Some(InvertedIndexApplier::new( - self.region_dir, - self.metadata.region_id, - self.object_store, - self.file_cache, - self.index_cache, - Box::new(applier.context(BuildIndexApplierSnafu)?), - self.puffin_manager_factory, - ))) + Ok(Some( + InvertedIndexApplier::new( + self.region_dir, + self.metadata.region_id, + self.object_store, + Box::new(applier.context(BuildIndexApplierSnafu)?), + self.puffin_manager_factory, + ) + .with_file_cache(self.file_cache) + .with_puffin_metadata_cache(self.puffin_metadata_cache) + .with_index_cache(self.index_cache), + )) } /// Recursively traverses expressions to collect predicates. 
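To make the new builder-style wiring concrete, a minimal sketch of how a caller might now assemble the applier (bindings such as `region_dir`, `store`, `metadata`, `column_ids`, `factory`, the three optional caches, and `exprs` are assumed to be in scope; this mirrors the call sites updated elsewhere in this patch rather than adding new API):

    let maybe_applier = InvertedIndexApplierBuilder::new(
        region_dir,
        store,
        &metadata,
        column_ids,
        factory,
    )
    .with_file_cache(file_cache)
    .with_index_cache(index_cache)
    .with_puffin_metadata_cache(puffin_metadata_cache)
    // `build` returns `Ok(None)` when no predicate touches an indexed column.
    .build(&exprs)?;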
@@ -322,8 +349,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs index 0a196e6f1ac6..51f7f001e25b 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs @@ -75,8 +75,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -118,8 +116,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -144,8 +140,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -187,8 +181,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -214,8 +206,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs index cdaec9f94e95..138b15b82eb9 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs @@ -231,8 +231,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -260,8 +258,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -280,8 +276,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -315,8 +309,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs index 1d07cca48724..35a5caad56a6 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs @@ -137,8 +137,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -175,8 +173,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -204,8 +200,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -224,8 +218,6 @@ mod tests { let mut builder = 
InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -244,8 +236,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -303,8 +293,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -341,8 +329,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs index 6a520ba401d3..224e10c452ff 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs @@ -68,8 +68,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -101,8 +99,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -126,8 +122,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -159,8 +153,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -186,8 +178,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs index 7fdf7f3de55c..7148986e6d11 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs @@ -62,8 +62,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -91,8 +89,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -120,8 +116,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, @@ -142,8 +136,6 @@ mod tests { let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), test_object_store(), - None, - None, &metadata, HashSet::from_iter([1, 2, 3]), facotry, diff --git a/src/mito2/src/sst/index/inverted_index/creator.rs b/src/mito2/src/sst/index/inverted_index/creator.rs index 6db1ef6e0b7b..029a0da8484f 100644 --- a/src/mito2/src/sst/index/inverted_index/creator.rs +++ b/src/mito2/src/sst/index/inverted_index/creator.rs @@ -310,12 +310,14 @@ mod tests { use futures::future::BoxFuture; use object_store::services::Memory; use object_store::ObjectStore; + use 
puffin::puffin_manager::cache::PuffinMetadataCache; use puffin::puffin_manager::PuffinManager; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; use store_api::storage::RegionId; use super::*; use crate::cache::index::InvertedIndexCache; + use crate::metrics::CACHE_BYTES; use crate::read::BatchColumn; use crate::row_converter::{McmpRowCodec, RowCodec, SortField}; use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; @@ -447,15 +449,16 @@ mod tests { move |expr| { let _d = &d; let cache = Arc::new(InvertedIndexCache::new(10, 10)); + let puffin_metadata_cache = Arc::new(PuffinMetadataCache::new(10, &CACHE_BYTES)); let applier = InvertedIndexApplierBuilder::new( region_dir.clone(), object_store.clone(), - None, - Some(cache), ®ion_metadata, indexed_column_ids.clone(), factory.clone(), ) + .with_index_cache(Some(cache)) + .with_puffin_metadata_cache(Some(puffin_metadata_cache)) .build(&[expr]) .unwrap() .unwrap(); diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 33d26c8196df..f8ab9c3f4edb 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -170,6 +170,7 @@ impl WorkerGroup { .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) .index_metadata_size(config.inverted_index.metadata_cache_size.as_bytes()) .index_content_size(config.inverted_index.content_cache_size.as_bytes()) + .puffin_metadata_size(config.index.metadata_cache_size.as_bytes()) .write_cache(write_cache) .build(), ); diff --git a/src/puffin/Cargo.toml b/src/puffin/Cargo.toml index e4e6c74a5c9b..31c92ba4f972 100644 --- a/src/puffin/Cargo.toml +++ b/src/puffin/Cargo.toml @@ -25,6 +25,7 @@ futures.workspace = true lz4_flex = "0.11" moka = { workspace = true, features = ["future", "sync"] } pin-project.workspace = true +prometheus.workspace = true serde.workspace = true serde_json.workspace = true sha2 = "0.10.8" diff --git a/src/puffin/src/blob_metadata.rs b/src/puffin/src/blob_metadata.rs index bb2475bfa336..67eb62c5ff1b 100644 --- a/src/puffin/src/blob_metadata.rs +++ b/src/puffin/src/blob_metadata.rs @@ -68,6 +68,20 @@ pub struct BlobMetadata { pub properties: HashMap, } +impl BlobMetadata { + /// Calculates the memory usage of the blob metadata in bytes. + pub fn memory_usage(&self) -> usize { + self.blob_type.len() + + self.input_fields.len() * std::mem::size_of::() + + self + .properties + .iter() + .map(|(k, v)| k.len() + v.len()) + .sum::() + + std::mem::size_of::() + } +} + /// Compression codec used to compress the blob #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] diff --git a/src/puffin/src/file_format/reader/file.rs b/src/puffin/src/file_format/reader/file.rs index 31e8e10bc4d5..9ed40a7f181e 100644 --- a/src/puffin/src/file_format/reader/file.rs +++ b/src/puffin/src/file_format/reader/file.rs @@ -46,6 +46,11 @@ impl PuffinFileReader { } } + pub fn with_metadata(mut self, metadata: Option) -> Self { + self.metadata = metadata; + self + } + fn validate_file_size(file_size: u64) -> Result<()> { ensure!( file_size >= MIN_FILE_SIZE, diff --git a/src/puffin/src/file_metadata.rs b/src/puffin/src/file_metadata.rs index 74eea3aa08f3..4804c65be495 100644 --- a/src/puffin/src/file_metadata.rs +++ b/src/puffin/src/file_metadata.rs @@ -33,6 +33,22 @@ pub struct FileMetadata { pub properties: HashMap, } +impl FileMetadata { + /// Calculates the memory usage of the file metadata in bytes. 
+ pub fn memory_usage(&self) -> usize { + self.blobs + .iter() + .map(|blob| blob.memory_usage()) + .sum::() + + self + .properties + .iter() + .map(|(k, v)| k.len() + v.len()) + .sum::() + + std::mem::size_of::() + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/src/puffin/src/puffin_manager.rs b/src/puffin/src/puffin_manager.rs index 7bd5e9039d03..17101b1662e8 100644 --- a/src/puffin/src/puffin_manager.rs +++ b/src/puffin/src/puffin_manager.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod cache; pub mod file_accessor; pub mod fs_puffin_manager; pub mod stager; diff --git a/src/puffin/src/puffin_manager/cache.rs b/src/puffin/src/puffin_manager/cache.rs new file mode 100644 index 000000000000..66fcb36bf9c2 --- /dev/null +++ b/src/puffin/src/puffin_manager/cache.rs @@ -0,0 +1,60 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use prometheus::IntGaugeVec; + +use crate::file_metadata::FileMetadata; +/// Metrics for index metadata. +const PUFFIN_METADATA_TYPE: &str = "puffin_metadata"; + +pub type PuffinMetadataCacheRef = Arc; + +/// A cache for storing the metadata of the index files. +pub struct PuffinMetadataCache { + cache: moka::sync::Cache>, +} + +fn puffin_metadata_weight(k: &String, v: &Arc) -> u32 { + (k.as_bytes().len() + v.memory_usage()) as u32 +} + +impl PuffinMetadataCache { + pub fn new(capacity: u64, cache_bytes: &'static IntGaugeVec) -> Self { + common_telemetry::debug!("Building PuffinMetadataCache with capacity: {capacity}"); + Self { + cache: moka::sync::CacheBuilder::new(capacity) + .name("puffin_metadata") + .weigher(puffin_metadata_weight) + .eviction_listener(|k, v, _cause| { + let size = puffin_metadata_weight(&k, &v); + cache_bytes + .with_label_values(&[PUFFIN_METADATA_TYPE]) + .sub(size.into()); + }) + .build(), + } + } + + /// Gets the metadata from the cache. + pub fn get_metadata(&self, file_id: &str) -> Option> { + self.cache.get(file_id) + } + + /// Puts the metadata into the cache. + pub fn put_metadata(&self, file_id: String, metadata: Arc) { + self.cache.insert(file_id, metadata); + } +} diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager.rs b/src/puffin/src/puffin_manager/fs_puffin_manager.rs index 976eb239979a..52190f92fb28 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager.rs @@ -21,6 +21,7 @@ pub use reader::FsPuffinReader; pub use writer::FsPuffinWriter; use crate::error::Result; +use crate::puffin_manager::cache::PuffinMetadataCacheRef; use crate::puffin_manager::file_accessor::PuffinFileAccessor; use crate::puffin_manager::stager::Stager; use crate::puffin_manager::PuffinManager; @@ -31,16 +32,29 @@ pub struct FsPuffinManager { stager: S, /// The puffin file accessor. puffin_file_accessor: F, + /// The puffin metadata cache. 
+ puffin_metadata_cache: Option, } impl FsPuffinManager { - /// Creates a new `FsPuffinManager` with the specified `stager` and `puffin_file_accessor`. + /// Creates a new `FsPuffinManager` with the specified `stager` and `puffin_file_accessor`, + /// and optionally with a `puffin_metadata_cache`. pub fn new(stager: S, puffin_file_accessor: F) -> Self { Self { stager, puffin_file_accessor, + puffin_metadata_cache: None, } } + + /// Sets the puffin metadata cache. + pub fn with_puffin_metadata_cache( + mut self, + puffin_metadata_cache: Option, + ) -> Self { + self.puffin_metadata_cache = puffin_metadata_cache; + self + } } #[async_trait] @@ -57,6 +71,7 @@ where puffin_file_name.to_string(), self.stager.clone(), self.puffin_file_accessor.clone(), + self.puffin_metadata_cache.clone(), )) } diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs index 3de27fdb77b0..2e1ae594adc6 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs @@ -14,6 +14,7 @@ use std::io; use std::ops::Range; +use std::sync::Arc; use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; @@ -23,12 +24,14 @@ use futures::io::BufReader; use futures::{AsyncRead, AsyncWrite}; use snafu::{ensure, OptionExt, ResultExt}; +use super::PuffinMetadataCacheRef; use crate::blob_metadata::{BlobMetadata, CompressionCodec}; use crate::error::{ BlobIndexOutOfBoundSnafu, BlobNotFoundSnafu, DeserializeJsonSnafu, FileKeyNotMatchSnafu, MetadataSnafu, ReadSnafu, Result, UnsupportedDecompressionSnafu, WriteSnafu, }; use crate::file_format::reader::{AsyncReader, PuffinFileReader}; +use crate::file_metadata::FileMetadata; use crate::partial_reader::PartialReader; use crate::puffin_manager::file_accessor::PuffinFileAccessor; use crate::puffin_manager::fs_puffin_manager::dir_meta::DirMetadata; @@ -45,14 +48,23 @@ pub struct FsPuffinReader { /// The puffin file accessor. puffin_file_accessor: F, + + /// The puffin file metadata cache. + puffin_file_metadata_cache: Option, } impl FsPuffinReader { - pub(crate) fn new(puffin_file_name: String, stager: S, puffin_file_accessor: F) -> Self { + pub(crate) fn new( + puffin_file_name: String, + stager: S, + puffin_file_accessor: F, + puffin_file_metadata_cache: Option, + ) -> Self { Self { puffin_file_name, stager, puffin_file_accessor, + puffin_file_metadata_cache, } } } @@ -73,13 +85,13 @@ where .await?; let mut file = PuffinFileReader::new(reader); - // TODO(zhongzc): cache the metadata. - let metadata = file.metadata().await?; + let metadata = self.get_puffin_file_metadata(&mut file).await?; let blob_metadata = metadata .blobs - .into_iter() + .iter() .find(|m| m.blob_type == key) - .context(BlobNotFoundSnafu { blob: key })?; + .context(BlobNotFoundSnafu { blob: key })? + .clone(); let blob = if blob_metadata.compression_codec.is_none() { // If the blob is not compressed, we can directly read it from the puffin file. 
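The new `puffin_manager/cache.rs` module, the `FsPuffinManager` setter, and the reader plumbing above combine as follows. A minimal wiring sketch (not from the patch), with the byte gauge passed in rather than mito2's `CACHE_BYTES` static; type paths are assumed from this series.

use std::sync::Arc;

use prometheus::IntGaugeVec;
use puffin::puffin_manager::cache::PuffinMetadataCache;
use puffin::puffin_manager::fs_puffin_manager::FsPuffinManager;

/// Builds a puffin manager that caches parsed file metadata, so repeated blob
/// reads of the same puffin file skip re-reading the footer from storage.
fn puffin_manager_with_metadata_cache<S, F>(
    stager: S,
    accessor: F,
    capacity_bytes: u64,
    cache_bytes: &'static IntGaugeVec,
) -> FsPuffinManager<S, F> {
    // Entries are weighted by key length plus `FileMetadata::memory_usage`; the
    // eviction listener subtracts evicted entries from `cache_bytes` under the
    // "puffin_metadata" label, as defined in `puffin_manager/cache.rs` above.
    let metadata_cache = Arc::new(PuffinMetadataCache::new(capacity_bytes, cache_bytes));
    FsPuffinManager::new(stager, accessor).with_puffin_metadata_cache(Some(metadata_cache))
}

In mito2 the capacity comes from `config.index.metadata_cache_size` (see the `worker.rs` hunk above) and the gauge is the crate's `CACHE_BYTES`, as the updated `creator.rs` test shows.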
@@ -133,6 +145,23 @@ where S: Stager, F: PuffinFileAccessor + Clone, { + async fn get_puffin_file_metadata( + &self, + reader: &mut PuffinFileReader, + ) -> Result> { + if let Some(cache) = self.puffin_file_metadata_cache.as_ref() { + if let Some(metadata) = cache.get_metadata(&self.puffin_file_name) { + return Ok(metadata); + } + } + + let metadata = Arc::new(reader.metadata().await?); + if let Some(cache) = self.puffin_file_metadata_cache.as_ref() { + cache.put_metadata(self.puffin_file_name.to_string(), metadata.clone()); + } + Ok(metadata) + } + async fn init_blob_to_stager( reader: PuffinFileReader, blob_metadata: BlobMetadata, From 03ad6e2a8dd8cc5632e433b94bb935fdd286c94c Mon Sep 17 00:00:00 2001 From: Yohan Wal Date: Thu, 12 Dec 2024 12:21:38 +0800 Subject: [PATCH 32/36] feat(fuzz): add alter table options for alter fuzzer (#5074) * feat(fuzz): add set table options to alter fuzzer * chore: clippy is happy, I'm sad * chore: happy ci happy * fix: unit test * feat(fuzz): add unset table options to alter fuzzer * fix: unit test * feat(fuzz): add table option validator * fix: make clippy happy * chore: add comments * chore: apply review comments * fix: unit test * feat(fuzz): add more ttl options * fix: #5108 * chore: add comments * chore: add comments --- Cargo.lock | 1 + src/common/base/src/readable_size.rs | 2 +- src/sql/src/statements/alter.rs | 21 +- tests-fuzz/Cargo.toml | 11 +- tests-fuzz/src/context.rs | 59 ++++- tests-fuzz/src/generator/alter_expr.rs | 143 +++++++++++- tests-fuzz/src/ir.rs | 2 +- tests-fuzz/src/ir/alter_expr.rs | 206 +++++++++++++++++- tests-fuzz/src/test_utils.rs | 1 + tests-fuzz/src/translator.rs | 1 + tests-fuzz/src/translator/common.rs | 67 ++++++ tests-fuzz/src/translator/mysql/alter_expr.rs | 67 +++++- .../src/translator/postgres/alter_expr.rs | 67 +++++- tests-fuzz/src/validator.rs | 1 + tests-fuzz/src/validator/table.rs | 103 +++++++++ .../{ => ddl}/fuzz_alter_logical_table.rs | 0 .../targets/{ => ddl}/fuzz_alter_table.rs | 58 ++++- .../targets/{ => ddl}/fuzz_create_database.rs | 0 .../{ => ddl}/fuzz_create_logical_table.rs | 0 .../targets/{ => ddl}/fuzz_create_table.rs | 0 20 files changed, 742 insertions(+), 68 deletions(-) create mode 100644 tests-fuzz/src/translator/common.rs create mode 100644 tests-fuzz/src/validator/table.rs rename tests-fuzz/targets/{ => ddl}/fuzz_alter_logical_table.rs (100%) rename tests-fuzz/targets/{ => ddl}/fuzz_alter_table.rs (72%) rename tests-fuzz/targets/{ => ddl}/fuzz_create_database.rs (100%) rename tests-fuzz/targets/{ => ddl}/fuzz_create_logical_table.rs (100%) rename tests-fuzz/targets/{ => ddl}/fuzz_create_table.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index e57a6542afbb..534b8c465ae6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12197,6 +12197,7 @@ dependencies = [ "arbitrary", "async-trait", "chrono", + "common-base", "common-error", "common-macro", "common-query", diff --git a/src/common/base/src/readable_size.rs b/src/common/base/src/readable_size.rs index 21908526c72a..4298989291b8 100644 --- a/src/common/base/src/readable_size.rs +++ b/src/common/base/src/readable_size.rs @@ -19,7 +19,7 @@ pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE; pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE; pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE; -#[derive(Clone, Copy, PartialEq, Eq, Ord, PartialOrd)] +#[derive(Clone, Copy, PartialEq, Eq, Ord, PartialOrd, Default)] pub struct ReadableSize(pub u64); impl ReadableSize { diff --git a/src/sql/src/statements/alter.rs b/src/sql/src/statements/alter.rs index 
174bdbbdc310..df148ae5b63d 100644 --- a/src/sql/src/statements/alter.rs +++ b/src/sql/src/statements/alter.rs @@ -72,29 +72,20 @@ pub enum AlterTableOperation { target_type: DataType, }, /// `SET <table attr key> = <table attr value>`
- SetTableOptions { - options: Vec, - }, - UnsetTableOptions { - keys: Vec, - }, + SetTableOptions { options: Vec }, + /// `UNSET <table attr key>
` + UnsetTableOptions { keys: Vec }, /// `DROP COLUMN ` - DropColumn { - name: Ident, - }, + DropColumn { name: Ident }, /// `RENAME ` - RenameTable { - new_table_name: String, - }, + RenameTable { new_table_name: String }, /// `MODIFY COLUMN SET FULLTEXT [WITH ]` SetColumnFulltext { column_name: Ident, options: FulltextOptions, }, /// `MODIFY COLUMN UNSET FULLTEXT` - UnsetColumnFulltext { - column_name: Ident, - }, + UnsetColumnFulltext { column_name: Ident }, } impl Display for AlterTableOperation { diff --git a/tests-fuzz/Cargo.toml b/tests-fuzz/Cargo.toml index cbac9df7133a..c408992bd508 100644 --- a/tests-fuzz/Cargo.toml +++ b/tests-fuzz/Cargo.toml @@ -18,6 +18,7 @@ unstable = ["nix"] arbitrary = { version = "1.3.0", features = ["derive"] } async-trait = { workspace = true } chrono = { workspace = true } +common-base = { workspace = true } common-error = { workspace = true } common-macro = { workspace = true } common-query = { workspace = true } @@ -67,14 +68,14 @@ dotenv.workspace = true [[bin]] name = "fuzz_create_table" -path = "targets/fuzz_create_table.rs" +path = "targets/ddl/fuzz_create_table.rs" test = false bench = false doc = false [[bin]] name = "fuzz_create_logical_table" -path = "targets/fuzz_create_logical_table.rs" +path = "targets/ddl/fuzz_create_logical_table.rs" test = false bench = false doc = false @@ -95,21 +96,21 @@ doc = false [[bin]] name = "fuzz_alter_table" -path = "targets/fuzz_alter_table.rs" +path = "targets/ddl/fuzz_alter_table.rs" test = false bench = false doc = false [[bin]] name = "fuzz_alter_logical_table" -path = "targets/fuzz_alter_logical_table.rs" +path = "targets/ddl/fuzz_alter_logical_table.rs" test = false bench = false doc = false [[bin]] name = "fuzz_create_database" -path = "targets/fuzz_create_database.rs" +path = "targets/ddl/fuzz_create_database.rs" test = false bench = false doc = false diff --git a/tests-fuzz/src/context.rs b/tests-fuzz/src/context.rs index 8cfd0ca9fa43..d0d5dee72dd7 100644 --- a/tests-fuzz/src/context.rs +++ b/tests-fuzz/src/context.rs @@ -21,7 +21,7 @@ use snafu::{ensure, OptionExt}; use crate::error::{self, Result}; use crate::generator::Random; -use crate::ir::alter_expr::AlterTableOperation; +use crate::ir::alter_expr::{AlterTableOperation, AlterTableOption}; use crate::ir::{AlterTableExpr, Column, CreateTableExpr, Ident}; pub type TableContextRef = Arc; @@ -35,6 +35,7 @@ pub struct TableContext { // GreptimeDB specific options pub partition: Option, pub primary_keys: Vec, + pub table_options: Vec, } impl From<&CreateTableExpr> for TableContext { @@ -52,6 +53,7 @@ impl From<&CreateTableExpr> for TableContext { columns: columns.clone(), partition: partition.clone(), primary_keys: primary_keys.clone(), + table_options: vec![], } } } @@ -64,7 +66,7 @@ impl TableContext { /// Applies the [AlterTableExpr]. 
pub fn alter(mut self, expr: AlterTableExpr) -> Result { - match expr.alter_options { + match expr.alter_kinds { AlterTableOperation::AddColumn { column, location } => { ensure!( !self.columns.iter().any(|col| col.name == column.name), @@ -140,6 +142,25 @@ impl TableContext { } Ok(self) } + AlterTableOperation::SetTableOptions { options } => { + for option in options { + if let Some(idx) = self + .table_options + .iter() + .position(|opt| opt.key() == option.key()) + { + self.table_options[idx] = option; + } else { + self.table_options.push(option); + } + } + Ok(self) + } + AlterTableOperation::UnsetTableOptions { keys } => { + self.table_options + .retain(|opt| !keys.contains(&opt.key().to_string())); + Ok(self) + } } } @@ -171,10 +192,11 @@ impl TableContext { #[cfg(test)] mod tests { use common_query::AddColumnLocation; + use common_time::Duration; use datatypes::data_type::ConcreteDataType; use super::TableContext; - use crate::ir::alter_expr::AlterTableOperation; + use crate::ir::alter_expr::{AlterTableOperation, AlterTableOption, Ttl}; use crate::ir::create_expr::ColumnOption; use crate::ir::{AlterTableExpr, Column, Ident}; @@ -185,11 +207,12 @@ mod tests { columns: vec![], partition: None, primary_keys: vec![], + table_options: vec![], }; // Add a column let expr = AlterTableExpr { table_name: "foo".into(), - alter_options: AlterTableOperation::AddColumn { + alter_kinds: AlterTableOperation::AddColumn { column: Column { name: "a".into(), column_type: ConcreteDataType::timestamp_microsecond_datatype(), @@ -205,7 +228,7 @@ mod tests { // Add a column at first let expr = AlterTableExpr { table_name: "foo".into(), - alter_options: AlterTableOperation::AddColumn { + alter_kinds: AlterTableOperation::AddColumn { column: Column { name: "b".into(), column_type: ConcreteDataType::timestamp_microsecond_datatype(), @@ -221,7 +244,7 @@ mod tests { // Add a column after "b" let expr = AlterTableExpr { table_name: "foo".into(), - alter_options: AlterTableOperation::AddColumn { + alter_kinds: AlterTableOperation::AddColumn { column: Column { name: "c".into(), column_type: ConcreteDataType::timestamp_microsecond_datatype(), @@ -239,10 +262,32 @@ mod tests { // Drop the column "b" let expr = AlterTableExpr { table_name: "foo".into(), - alter_options: AlterTableOperation::DropColumn { name: "b".into() }, + alter_kinds: AlterTableOperation::DropColumn { name: "b".into() }, }; let table_ctx = table_ctx.alter(expr).unwrap(); assert_eq!(table_ctx.columns[1].name, Ident::new("a")); assert_eq!(table_ctx.primary_keys, vec![0, 1]); + + // Set table options + let ttl_option = AlterTableOption::Ttl(Ttl::Duration(Duration::new_second(60))); + let expr = AlterTableExpr { + table_name: "foo".into(), + alter_kinds: AlterTableOperation::SetTableOptions { + options: vec![ttl_option.clone()], + }, + }; + let table_ctx = table_ctx.alter(expr).unwrap(); + assert_eq!(table_ctx.table_options.len(), 1); + assert_eq!(table_ctx.table_options[0], ttl_option); + + // Unset table options + let expr = AlterTableExpr { + table_name: "foo".into(), + alter_kinds: AlterTableOperation::UnsetTableOptions { + keys: vec![ttl_option.key().to_string()], + }, + }; + let table_ctx = table_ctx.alter(expr).unwrap(); + assert_eq!(table_ctx.table_options.len(), 0); } } diff --git a/tests-fuzz/src/generator/alter_expr.rs b/tests-fuzz/src/generator/alter_expr.rs index 03aed702fbad..0c5a62899953 100644 --- a/tests-fuzz/src/generator/alter_expr.rs +++ b/tests-fuzz/src/generator/alter_expr.rs @@ -14,17 +14,19 @@ use std::marker::PhantomData; +use 
common_base::readable_size::ReadableSize; use common_query::AddColumnLocation; use datatypes::data_type::ConcreteDataType; use derive_builder::Builder; use rand::Rng; use snafu::ensure; +use strum::IntoEnumIterator; use crate::context::TableContextRef; use crate::error::{self, Error, Result}; use crate::fake::WordGenerator; use crate::generator::{ColumnOptionGenerator, ConcreteDataTypeGenerator, Generator, Random}; -use crate::ir::alter_expr::{AlterTableExpr, AlterTableOperation}; +use crate::ir::alter_expr::{AlterTableExpr, AlterTableOperation, AlterTableOption, Ttl}; use crate::ir::create_expr::ColumnOption; use crate::ir::{ droppable_columns, generate_columns, generate_random_value, modifiable_columns, Column, @@ -107,7 +109,7 @@ impl Generator for AlterExprAddColumnGenera .remove(0); Ok(AlterTableExpr { table_name: self.table_ctx.name.clone(), - alter_options: AlterTableOperation::AddColumn { column, location }, + alter_kinds: AlterTableOperation::AddColumn { column, location }, }) } } @@ -130,7 +132,7 @@ impl Generator for AlterExprDropColumnGenerator { let name = droppable[rng.gen_range(0..droppable.len())].name.clone(); Ok(AlterTableExpr { table_name: self.table_ctx.name.clone(), - alter_options: AlterTableOperation::DropColumn { name }, + alter_kinds: AlterTableOperation::DropColumn { name }, }) } } @@ -153,7 +155,7 @@ impl Generator for AlterExprRenameGenerator { .generate_unique_table_name(rng, self.name_generator.as_ref()); Ok(AlterTableExpr { table_name: self.table_ctx.name.clone(), - alter_options: AlterTableOperation::RenameTable { new_table_name }, + alter_kinds: AlterTableOperation::RenameTable { new_table_name }, }) } } @@ -180,7 +182,7 @@ impl Generator for AlterExprModifyDataTypeGenerator Generator for AlterExprModifyDataTypeGenerator { + table_ctx: TableContextRef, + #[builder(default)] + _phantom: PhantomData, +} + +impl Generator for AlterExprSetTableOptionsGenerator { + type Error = Error; + + fn generate(&self, rng: &mut R) -> Result { + let all_options = AlterTableOption::iter().collect::>(); + // Generate random distinct options + let mut option_templates_idx = vec![]; + for _ in 1..rng.gen_range(2..=all_options.len()) { + let option = rng.gen_range(0..all_options.len()); + if !option_templates_idx.contains(&option) { + option_templates_idx.push(option); + } + } + let options = option_templates_idx + .iter() + .map(|idx| match all_options[*idx] { + AlterTableOption::Ttl(_) => { + let ttl_type = rng.gen_range(0..3); + match ttl_type { + 0 => { + let duration: u32 = rng.gen(); + AlterTableOption::Ttl(Ttl::Duration((duration as i64).into())) + } + 1 => AlterTableOption::Ttl(Ttl::Instant), + 2 => AlterTableOption::Ttl(Ttl::Forever), + _ => unreachable!(), + } + } + AlterTableOption::TwcsTimeWindow(_) => { + let time_window: u32 = rng.gen(); + AlterTableOption::TwcsTimeWindow((time_window as i64).into()) + } + AlterTableOption::TwcsMaxOutputFileSize(_) => { + let max_output_file_size: u64 = rng.gen(); + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize(max_output_file_size)) + } + AlterTableOption::TwcsMaxInactiveWindowRuns(_) => { + let max_inactive_window_runs: u64 = rng.gen(); + AlterTableOption::TwcsMaxInactiveWindowRuns(max_inactive_window_runs) + } + AlterTableOption::TwcsMaxActiveWindowFiles(_) => { + let max_active_window_files: u64 = rng.gen(); + AlterTableOption::TwcsMaxActiveWindowFiles(max_active_window_files) + } + AlterTableOption::TwcsMaxActiveWindowRuns(_) => { + let max_active_window_runs: u64 = rng.gen(); + 
AlterTableOption::TwcsMaxActiveWindowRuns(max_active_window_runs) + } + AlterTableOption::TwcsMaxInactiveWindowFiles(_) => { + let max_inactive_window_files: u64 = rng.gen(); + AlterTableOption::TwcsMaxInactiveWindowFiles(max_inactive_window_files) + } + }) + .collect(); + Ok(AlterTableExpr { + table_name: self.table_ctx.name.clone(), + alter_kinds: AlterTableOperation::SetTableOptions { options }, + }) + } +} + +/// Generates the [AlterTableOperation::UnsetTableOptions] of [AlterTableExpr]. +#[derive(Builder)] +#[builder(pattern = "owned")] +pub struct AlterExprUnsetTableOptionsGenerator { + table_ctx: TableContextRef, + #[builder(default)] + _phantom: PhantomData, +} + +impl Generator for AlterExprUnsetTableOptionsGenerator { + type Error = Error; + + fn generate(&self, rng: &mut R) -> Result { + let all_options = AlterTableOption::iter().collect::>(); + // Generate random distinct options + let mut option_templates_idx = vec![]; + for _ in 1..rng.gen_range(2..=all_options.len()) { + let option = rng.gen_range(0..all_options.len()); + if !option_templates_idx.contains(&option) { + option_templates_idx.push(option); + } + } + let options = option_templates_idx + .iter() + .map(|idx| all_options[*idx].key().to_string()) + .collect(); + Ok(AlterTableExpr { + table_name: self.table_ctx.name.clone(), + alter_kinds: AlterTableOperation::UnsetTableOptions { keys: options }, + }) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -220,7 +325,7 @@ mod tests { .generate(&mut rng) .unwrap(); let serialized = serde_json::to_string(&expr).unwrap(); - let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_options":{"AddColumn":{"column":{"name":{"value":"velit","quote_style":null},"column_type":{"Int32":{}},"options":[{"DefaultValue":{"Int32":1606462472}}]},"location":null}}}"#; + let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"AddColumn":{"column":{"name":{"value":"velit","quote_style":null},"column_type":{"Int32":{}},"options":[{"DefaultValue":{"Int32":1606462472}}]},"location":null}}}"#; assert_eq!(expected, serialized); let expr = AlterExprRenameGeneratorBuilder::default() @@ -230,7 +335,7 @@ mod tests { .generate(&mut rng) .unwrap(); let serialized = serde_json::to_string(&expr).unwrap(); - let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_options":{"RenameTable":{"new_table_name":{"value":"nihil","quote_style":null}}}}"#; + let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"RenameTable":{"new_table_name":{"value":"nihil","quote_style":null}}}}"#; assert_eq!(expected, serialized); let expr = AlterExprDropColumnGeneratorBuilder::default() @@ -240,17 +345,37 @@ mod tests { .generate(&mut rng) .unwrap(); let serialized = serde_json::to_string(&expr).unwrap(); - let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_options":{"DropColumn":{"name":{"value":"cUmquE","quote_style":null}}}}"#; + let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"DropColumn":{"name":{"value":"cUmquE","quote_style":null}}}}"#; assert_eq!(expected, serialized); let expr = AlterExprModifyDataTypeGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + let serialized = serde_json::to_string(&expr).unwrap(); + let expected = 
r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"ModifyDataType":{"column":{"name":{"value":"toTAm","quote_style":null},"column_type":{"Int64":{}},"options":[]}}}}"#; + assert_eq!(expected, serialized); + + let expr = AlterExprSetTableOptionsGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + let serialized = serde_json::to_string(&expr).unwrap(); + let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"SetTableOptions":{"options":[{"TwcsMaxActiveWindowRuns":14908016120444947142},{"TwcsMaxActiveWindowFiles":5840340123887173415},{"TwcsMaxOutputFileSize":17740311466571102265}]}}}"#; + assert_eq!(expected, serialized); + + let expr = AlterExprUnsetTableOptionsGeneratorBuilder::default() .table_ctx(table_ctx) .build() .unwrap() .generate(&mut rng) .unwrap(); let serialized = serde_json::to_string(&expr).unwrap(); - let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_options":{"ModifyDataType":{"column":{"name":{"value":"toTAm","quote_style":null},"column_type":{"Int64":{}},"options":[]}}}}"#; + let expected = r#"{"table_name":{"value":"animI","quote_style":null},"alter_kinds":{"UnsetTableOptions":{"keys":["compaction.twcs.max_active_window_runs"]}}}"#; assert_eq!(expected, serialized); } } diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs index b9d13ca9fba3..ae6edd595c85 100644 --- a/tests-fuzz/src/ir.rs +++ b/tests-fuzz/src/ir.rs @@ -24,7 +24,7 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::Duration; -pub use alter_expr::AlterTableExpr; +pub use alter_expr::{AlterTableExpr, AlterTableOption}; use common_time::timestamp::TimeUnit; use common_time::{Date, DateTime, Timestamp}; pub use create_expr::{CreateDatabaseExpr, CreateTableExpr}; diff --git a/tests-fuzz/src/ir/alter_expr.rs b/tests-fuzz/src/ir/alter_expr.rs index a9fdc18c2228..1d637ff6604c 100644 --- a/tests-fuzz/src/ir/alter_expr.rs +++ b/tests-fuzz/src/ir/alter_expr.rs @@ -12,16 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Display; +use std::str::FromStr; + +use common_base::readable_size::ReadableSize; use common_query::AddColumnLocation; +use common_time::{Duration, FOREVER, INSTANT}; use derive_builder::Builder; use serde::{Deserialize, Serialize}; +use store_api::mito_engine_options::{ + APPEND_MODE_KEY, COMPACTION_TYPE, TTL_KEY, TWCS_MAX_ACTIVE_WINDOW_FILES, + TWCS_MAX_ACTIVE_WINDOW_RUNS, TWCS_MAX_INACTIVE_WINDOW_FILES, TWCS_MAX_INACTIVE_WINDOW_RUNS, + TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, +}; +use strum::EnumIter; +use crate::error::{self, Result}; use crate::ir::{Column, Ident}; #[derive(Debug, Builder, Clone, Serialize, Deserialize)] pub struct AlterTableExpr { pub table_name: Ident, - pub alter_options: AlterTableOperation, + pub alter_kinds: AlterTableOperation, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -37,4 +49,196 @@ pub enum AlterTableOperation { RenameTable { new_table_name: Ident }, /// `MODIFY COLUMN ` ModifyDataType { column: Column }, + /// `SET
<table attr key> = <table attr value>` + SetTableOptions { options: Vec<AlterTableOption> }, + /// `UNSET <table attr key>
` + UnsetTableOptions { keys: Vec }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub enum Ttl { + Duration(Duration), + Instant, + #[default] + Forever, +} + +impl Display for Ttl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Ttl::Duration(d) => write!(f, "{}", d), + Ttl::Instant => write!(f, "{}", INSTANT), + Ttl::Forever => write!(f, "{}", FOREVER), + } + } +} + +#[derive(Debug, EnumIter, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum AlterTableOption { + Ttl(Ttl), + TwcsTimeWindow(Duration), + TwcsMaxOutputFileSize(ReadableSize), + TwcsMaxInactiveWindowFiles(u64), + TwcsMaxActiveWindowFiles(u64), + TwcsMaxInactiveWindowRuns(u64), + TwcsMaxActiveWindowRuns(u64), +} + +impl AlterTableOption { + pub fn key(&self) -> &str { + match self { + AlterTableOption::Ttl(_) => TTL_KEY, + AlterTableOption::TwcsTimeWindow(_) => TWCS_TIME_WINDOW, + AlterTableOption::TwcsMaxOutputFileSize(_) => TWCS_MAX_OUTPUT_FILE_SIZE, + AlterTableOption::TwcsMaxInactiveWindowFiles(_) => TWCS_MAX_INACTIVE_WINDOW_FILES, + AlterTableOption::TwcsMaxActiveWindowFiles(_) => TWCS_MAX_ACTIVE_WINDOW_FILES, + AlterTableOption::TwcsMaxInactiveWindowRuns(_) => TWCS_MAX_INACTIVE_WINDOW_RUNS, + AlterTableOption::TwcsMaxActiveWindowRuns(_) => TWCS_MAX_ACTIVE_WINDOW_RUNS, + } + } + + /// Parses the AlterTableOption from a key-value pair + fn parse_kv(key: &str, value: &str) -> Result { + match key { + TTL_KEY => { + let ttl = if value.to_lowercase() == INSTANT { + Ttl::Instant + } else if value.to_lowercase() == FOREVER { + Ttl::Forever + } else { + let duration = humantime::parse_duration(value).unwrap(); + Ttl::Duration(duration.into()) + }; + Ok(AlterTableOption::Ttl(ttl)) + } + TWCS_MAX_ACTIVE_WINDOW_RUNS => { + let runs = value.parse().unwrap(); + Ok(AlterTableOption::TwcsMaxActiveWindowRuns(runs)) + } + TWCS_MAX_ACTIVE_WINDOW_FILES => { + let files = value.parse().unwrap(); + Ok(AlterTableOption::TwcsMaxActiveWindowFiles(files)) + } + TWCS_MAX_INACTIVE_WINDOW_RUNS => { + let runs = value.parse().unwrap(); + Ok(AlterTableOption::TwcsMaxInactiveWindowRuns(runs)) + } + TWCS_MAX_INACTIVE_WINDOW_FILES => { + let files = value.parse().unwrap(); + Ok(AlterTableOption::TwcsMaxInactiveWindowFiles(files)) + } + TWCS_MAX_OUTPUT_FILE_SIZE => { + // may be "1M" instead of "1 MiB" + let value = if value.ends_with("B") { + value.to_string() + } else { + format!("{}B", value) + }; + let size = ReadableSize::from_str(&value).unwrap(); + Ok(AlterTableOption::TwcsMaxOutputFileSize(size)) + } + TWCS_TIME_WINDOW => { + let time = humantime::parse_duration(value).unwrap(); + Ok(AlterTableOption::TwcsTimeWindow(time.into())) + } + _ => error::UnexpectedSnafu { + violated: format!("Unknown table option key: {}", key), + } + .fail(), + } + } + + /// Parses the AlterTableOption from comma-separated string + pub fn parse_kv_pairs(option_string: &str) -> Result> { + let mut options = vec![]; + for pair in option_string.split(',') { + let pair = pair.trim(); + let (key, value) = pair.split_once('=').unwrap(); + let key = key.trim().replace("\'", ""); + let value = value.trim().replace('\'', ""); + // Currently we have only one compaction type, so we ignore it + // Cautious: COMPACTION_TYPE may be kept even if there are no compaction options enabled + if key == COMPACTION_TYPE || key == APPEND_MODE_KEY { + continue; + } else { + let option = AlterTableOption::parse_kv(&key, &value)?; + options.push(option); + } + } + Ok(options) + } +} + +impl Display for 
AlterTableOption { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AlterTableOption::Ttl(d) => write!(f, "'{}' = '{}'", TTL_KEY, d), + AlterTableOption::TwcsTimeWindow(d) => write!(f, "'{}' = '{}'", TWCS_TIME_WINDOW, d), + AlterTableOption::TwcsMaxOutputFileSize(s) => { + // Caution: to_string loses precision for ReadableSize + write!(f, "'{}' = '{}'", TWCS_MAX_OUTPUT_FILE_SIZE, s) + } + AlterTableOption::TwcsMaxInactiveWindowFiles(u) => { + write!(f, "'{}' = '{}'", TWCS_MAX_INACTIVE_WINDOW_FILES, u) + } + AlterTableOption::TwcsMaxActiveWindowFiles(u) => { + write!(f, "'{}' = '{}'", TWCS_MAX_ACTIVE_WINDOW_FILES, u) + } + AlterTableOption::TwcsMaxInactiveWindowRuns(u) => { + write!(f, "'{}' = '{}'", TWCS_MAX_INACTIVE_WINDOW_RUNS, u) + } + AlterTableOption::TwcsMaxActiveWindowRuns(u) => { + write!(f, "'{}' = '{}'", TWCS_MAX_ACTIVE_WINDOW_RUNS, u) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_kv_pairs() { + let option_string = + "compaction.twcs.max_output_file_size = '1M', compaction.type = 'twcs', ttl = 'forever'"; + let options = AlterTableOption::parse_kv_pairs(option_string).unwrap(); + assert_eq!(options.len(), 2); + assert_eq!( + options, + vec![ + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize::from_str("1MB").unwrap()), + AlterTableOption::Ttl(Ttl::Forever), + ] + ); + + let option_string = "compaction.twcs.max_active_window_files = '5030469694939972912', + compaction.twcs.max_active_window_runs = '8361168990283879099', + compaction.twcs.max_inactive_window_files = '6028716566907830876', + compaction.twcs.max_inactive_window_runs = '10622283085591494074', + compaction.twcs.max_output_file_size = '15686.4PiB', + compaction.twcs.time_window = '2061999256ms', + compaction.type = 'twcs', + ttl = '1month 3days 15h 49m 8s 279ms'"; + let options = AlterTableOption::parse_kv_pairs(option_string).unwrap(); + assert_eq!(options.len(), 7); + let expected = vec![ + AlterTableOption::TwcsMaxActiveWindowFiles(5030469694939972912), + AlterTableOption::TwcsMaxActiveWindowRuns(8361168990283879099), + AlterTableOption::TwcsMaxInactiveWindowFiles(6028716566907830876), + AlterTableOption::TwcsMaxInactiveWindowRuns(10622283085591494074), + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize::from_str("15686.4PiB").unwrap()), + AlterTableOption::TwcsTimeWindow(Duration::new_nanosecond(2_061_999_256_000_000)), + AlterTableOption::Ttl(Ttl::Duration(Duration::new_millisecond( + // A month is 2_630_016 seconds + 2_630_016 * 1000 + + 3 * 24 * 60 * 60 * 1000 + + 15 * 60 * 60 * 1000 + + 49 * 60 * 1000 + + 8 * 1000 + + 279, + ))), + ]; + assert_eq!(options, expected); + } } diff --git a/tests-fuzz/src/test_utils.rs b/tests-fuzz/src/test_utils.rs index e65548969ac1..bef96a1fd7f9 100644 --- a/tests-fuzz/src/test_utils.rs +++ b/tests-fuzz/src/test_utils.rs @@ -55,5 +55,6 @@ pub fn new_test_ctx() -> TableContext { ], partition: None, primary_keys: vec![], + table_options: vec![], } } diff --git a/tests-fuzz/src/translator.rs b/tests-fuzz/src/translator.rs index 1745aa933601..673b543f2c0b 100644 --- a/tests-fuzz/src/translator.rs +++ b/tests-fuzz/src/translator.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+mod common; pub mod mysql; pub mod postgres; diff --git a/tests-fuzz/src/translator/common.rs b/tests-fuzz/src/translator/common.rs new file mode 100644 index 000000000000..2b968ed4391a --- /dev/null +++ b/tests-fuzz/src/translator/common.rs @@ -0,0 +1,67 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Display; + +use super::DslTranslator; +use crate::error::{Error, Result}; +use crate::ir::alter_expr::AlterTableOperation; +use crate::ir::{AlterTableExpr, AlterTableOption}; + +/// Shared translator for `ALTER TABLE` operations. +pub(crate) struct CommonAlterTableTranslator; + +impl DslTranslator for CommonAlterTableTranslator { + type Error = Error; + + fn translate(&self, input: &AlterTableExpr) -> Result { + Ok(match &input.alter_kinds { + AlterTableOperation::DropColumn { name } => Self::format_drop(&input.table_name, name), + AlterTableOperation::SetTableOptions { options } => { + Self::format_set_table_options(&input.table_name, options) + } + AlterTableOperation::UnsetTableOptions { keys } => { + Self::format_unset_table_options(&input.table_name, keys) + } + _ => unimplemented!(), + }) + } +} + +impl CommonAlterTableTranslator { + fn format_drop(name: impl Display, column: impl Display) -> String { + format!("ALTER TABLE {name} DROP COLUMN {column};") + } + + fn format_set_table_options(name: impl Display, options: &[AlterTableOption]) -> String { + format!( + "ALTER TABLE {name} SET {};", + options + .iter() + .map(|option| option.to_string()) + .collect::>() + .join(", ") + ) + } + + fn format_unset_table_options(name: impl Display, keys: &[String]) -> String { + format!( + "ALTER TABLE {name} UNSET {};", + keys.iter() + .map(|key| format!("'{}'", key)) + .collect::>() + .join(", ") + ) + } +} diff --git a/tests-fuzz/src/translator/mysql/alter_expr.rs b/tests-fuzz/src/translator/mysql/alter_expr.rs index c973d7cb4b2a..3bf30b09a3ba 100644 --- a/tests-fuzz/src/translator/mysql/alter_expr.rs +++ b/tests-fuzz/src/translator/mysql/alter_expr.rs @@ -22,6 +22,7 @@ use crate::error::{Error, Result}; use crate::ir::alter_expr::AlterTableOperation; use crate::ir::create_expr::ColumnOption; use crate::ir::{AlterTableExpr, Column}; +use crate::translator::common::CommonAlterTableTranslator; use crate::translator::DslTranslator; pub struct AlterTableExprTranslator; @@ -30,26 +31,22 @@ impl DslTranslator for AlterTableExprTranslator { type Error = Error; fn translate(&self, input: &AlterTableExpr) -> Result { - Ok(match &input.alter_options { + Ok(match &input.alter_kinds { AlterTableOperation::AddColumn { column, location } => { Self::format_add_column(&input.table_name, column, location) } - AlterTableOperation::DropColumn { name } => Self::format_drop(&input.table_name, name), AlterTableOperation::RenameTable { new_table_name } => { Self::format_rename(&input.table_name, new_table_name) } AlterTableOperation::ModifyDataType { column } => { Self::format_modify_data_type(&input.table_name, column) } + _ => 
CommonAlterTableTranslator.translate(input)?, }) } } impl AlterTableExprTranslator { - fn format_drop(name: impl Display, column: impl Display) -> String { - format!("ALTER TABLE {name} DROP COLUMN {column};") - } - fn format_rename(name: impl Display, new_name: impl Display) -> String { format!("ALTER TABLE {name} RENAME {new_name};") } @@ -119,11 +116,15 @@ impl AlterTableExprTranslator { #[cfg(test)] mod tests { + use std::str::FromStr; + + use common_base::readable_size::ReadableSize; use common_query::AddColumnLocation; + use common_time::Duration; use datatypes::data_type::ConcreteDataType; use super::AlterTableExprTranslator; - use crate::ir::alter_expr::AlterTableOperation; + use crate::ir::alter_expr::{AlterTableOperation, AlterTableOption, Ttl}; use crate::ir::create_expr::ColumnOption; use crate::ir::{AlterTableExpr, Column}; use crate::translator::DslTranslator; @@ -132,7 +133,7 @@ mod tests { fn test_alter_table_expr() { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::AddColumn { + alter_kinds: AlterTableOperation::AddColumn { column: Column { name: "host".into(), column_type: ConcreteDataType::string_datatype(), @@ -150,7 +151,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::RenameTable { + alter_kinds: AlterTableOperation::RenameTable { new_table_name: "foo".into(), }, }; @@ -160,7 +161,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::DropColumn { name: "foo".into() }, + alter_kinds: AlterTableOperation::DropColumn { name: "foo".into() }, }; let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); @@ -168,7 +169,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::ModifyDataType { + alter_kinds: AlterTableOperation::ModifyDataType { column: Column { name: "host".into(), column_type: ConcreteDataType::string_datatype(), @@ -180,4 +181,48 @@ mod tests { let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); assert_eq!("ALTER TABLE test MODIFY COLUMN host STRING;", output); } + + #[test] + fn test_alter_table_expr_set_table_options() { + let alter_expr = AlterTableExpr { + table_name: "test".into(), + alter_kinds: AlterTableOperation::SetTableOptions { + options: vec![ + AlterTableOption::Ttl(Ttl::Duration(Duration::new_second(60))), + AlterTableOption::TwcsTimeWindow(Duration::new_second(60)), + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize::from_str("1GB").unwrap()), + AlterTableOption::TwcsMaxActiveWindowFiles(10), + AlterTableOption::TwcsMaxActiveWindowRuns(10), + AlterTableOption::TwcsMaxInactiveWindowFiles(5), + AlterTableOption::TwcsMaxInactiveWindowRuns(5), + ], + }, + }; + + let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); + let expected = concat!( + "ALTER TABLE test SET 'ttl' = '60s', ", + "'compaction.twcs.time_window' = '60s', ", + "'compaction.twcs.max_output_file_size' = '1.0GiB', ", + "'compaction.twcs.max_active_window_files' = '10', ", + "'compaction.twcs.max_active_window_runs' = '10', ", + "'compaction.twcs.max_inactive_window_files' = '5', ", + "'compaction.twcs.max_inactive_window_runs' = '5';" + ); + assert_eq!(expected, output); + } + + #[test] + fn test_alter_table_expr_unset_table_options() { + let alter_expr = AlterTableExpr { + table_name: "test".into(), + alter_kinds: AlterTableOperation::UnsetTableOptions { + keys: vec!["ttl".into(), 
"compaction.twcs.time_window".into()], + }, + }; + + let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); + let expected = "ALTER TABLE test UNSET 'ttl', 'compaction.twcs.time_window';"; + assert_eq!(expected, output); + } } diff --git a/tests-fuzz/src/translator/postgres/alter_expr.rs b/tests-fuzz/src/translator/postgres/alter_expr.rs index 42db202efef0..f66ce0db923d 100644 --- a/tests-fuzz/src/translator/postgres/alter_expr.rs +++ b/tests-fuzz/src/translator/postgres/alter_expr.rs @@ -21,6 +21,7 @@ use crate::error::{Error, Result}; use crate::ir::alter_expr::AlterTableOperation; use crate::ir::create_expr::ColumnOption; use crate::ir::{AlterTableExpr, Column}; +use crate::translator::common::CommonAlterTableTranslator; use crate::translator::postgres::sql_data_type_to_postgres_data_type; use crate::translator::DslTranslator; @@ -30,26 +31,22 @@ impl DslTranslator for AlterTableExprTranslator { type Error = Error; fn translate(&self, input: &AlterTableExpr) -> Result { - Ok(match &input.alter_options { + Ok(match &input.alter_kinds { AlterTableOperation::AddColumn { column, .. } => { Self::format_add_column(&input.table_name, column) } - AlterTableOperation::DropColumn { name } => Self::format_drop(&input.table_name, name), AlterTableOperation::RenameTable { new_table_name } => { Self::format_rename(&input.table_name, new_table_name) } AlterTableOperation::ModifyDataType { column } => { Self::format_modify_data_type(&input.table_name, column) } + _ => CommonAlterTableTranslator.translate(input)?, }) } } impl AlterTableExprTranslator { - fn format_drop(name: impl Display, column: impl Display) -> String { - format!("ALTER TABLE {name} DROP COLUMN {column};") - } - fn format_rename(name: impl Display, new_name: impl Display) -> String { format!("ALTER TABLE {name} RENAME TO {new_name};") } @@ -116,11 +113,15 @@ impl AlterTableExprTranslator { #[cfg(test)] mod tests { + use std::str::FromStr; + + use common_base::readable_size::ReadableSize; use common_query::AddColumnLocation; + use common_time::Duration; use datatypes::data_type::ConcreteDataType; use super::AlterTableExprTranslator; - use crate::ir::alter_expr::AlterTableOperation; + use crate::ir::alter_expr::{AlterTableOperation, AlterTableOption, Ttl}; use crate::ir::create_expr::ColumnOption; use crate::ir::{AlterTableExpr, Column}; use crate::translator::DslTranslator; @@ -129,7 +130,7 @@ mod tests { fn test_alter_table_expr() { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::AddColumn { + alter_kinds: AlterTableOperation::AddColumn { column: Column { name: "host".into(), column_type: ConcreteDataType::string_datatype(), @@ -145,7 +146,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::RenameTable { + alter_kinds: AlterTableOperation::RenameTable { new_table_name: "foo".into(), }, }; @@ -155,7 +156,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::DropColumn { name: "foo".into() }, + alter_kinds: AlterTableOperation::DropColumn { name: "foo".into() }, }; let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); @@ -163,7 +164,7 @@ mod tests { let alter_expr = AlterTableExpr { table_name: "test".into(), - alter_options: AlterTableOperation::ModifyDataType { + alter_kinds: AlterTableOperation::ModifyDataType { column: Column { name: "host".into(), column_type: ConcreteDataType::string_datatype(), @@ -176,4 +177,48 @@ mod 
tests { // Ignores the location and primary key option. assert_eq!("ALTER TABLE test MODIFY COLUMN host STRING;", output); } + + #[test] + fn test_alter_table_expr_set_table_options() { + let alter_expr = AlterTableExpr { + table_name: "test".into(), + alter_kinds: AlterTableOperation::SetTableOptions { + options: vec![ + AlterTableOption::Ttl(Ttl::Duration(Duration::new_second(60))), + AlterTableOption::TwcsTimeWindow(Duration::new_second(60)), + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize::from_str("1GB").unwrap()), + AlterTableOption::TwcsMaxActiveWindowFiles(10), + AlterTableOption::TwcsMaxActiveWindowRuns(10), + AlterTableOption::TwcsMaxInactiveWindowFiles(5), + AlterTableOption::TwcsMaxInactiveWindowRuns(5), + ], + }, + }; + + let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); + let expected = concat!( + "ALTER TABLE test SET 'ttl' = '60s', ", + "'compaction.twcs.time_window' = '60s', ", + "'compaction.twcs.max_output_file_size' = '1.0GiB', ", + "'compaction.twcs.max_active_window_files' = '10', ", + "'compaction.twcs.max_active_window_runs' = '10', ", + "'compaction.twcs.max_inactive_window_files' = '5', ", + "'compaction.twcs.max_inactive_window_runs' = '5';" + ); + assert_eq!(expected, output); + } + + #[test] + fn test_alter_table_expr_unset_table_options() { + let alter_expr = AlterTableExpr { + table_name: "test".into(), + alter_kinds: AlterTableOperation::UnsetTableOptions { + keys: vec!["ttl".into(), "compaction.twcs.time_window".into()], + }, + }; + + let output = AlterTableExprTranslator.translate(&alter_expr).unwrap(); + let expected = "ALTER TABLE test UNSET 'ttl', 'compaction.twcs.time_window';"; + assert_eq!(expected, output); + } } diff --git a/tests-fuzz/src/validator.rs b/tests-fuzz/src/validator.rs index cf2df9af229c..406dd66041a2 100644 --- a/tests-fuzz/src/validator.rs +++ b/tests-fuzz/src/validator.rs @@ -14,3 +14,4 @@ pub mod column; pub mod row; +pub mod table; diff --git a/tests-fuzz/src/validator/table.rs b/tests-fuzz/src/validator/table.rs new file mode 100644 index 000000000000..406719b2d660 --- /dev/null +++ b/tests-fuzz/src/validator/table.rs @@ -0,0 +1,103 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use snafu::{ensure, ResultExt}; +use sqlx::database::HasArguments; +use sqlx::{ColumnIndex, Database, Decode, Encode, Executor, IntoArguments, Row, Type}; + +use crate::error::{self, Result, UnexpectedSnafu}; +use crate::ir::alter_expr::AlterTableOption; + +/// Parses table options from the result of `SHOW CREATE TABLE` +/// An example of the result of `SHOW CREATE TABLE`: +/// +-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +/// | Table | Create Table | +/// +-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +/// | json | CREATE TABLE IF NOT EXISTS `json` (`ts` TIMESTAMP(3) NOT NULL, `j` JSON NULL, TIME INDEX (`ts`)) ENGINE=mito WITH(compaction.twcs.max_output_file_size = '1M', compaction.type = 'twcs', ttl = '1day') | +/// +-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +fn parse_show_create(show_create: &str) -> Result> { + if let Some(option_start) = show_create.find("WITH(") { + let option_end = { + let remain_str = &show_create[option_start..]; + if let Some(end) = remain_str.find(')') { + end + option_start + } else { + return UnexpectedSnafu { + violated: format!("Cannot find the end of the options in: {}", show_create), + } + .fail(); + } + }; + let options = &show_create[option_start + 5..option_end]; + Ok(AlterTableOption::parse_kv_pairs(options)?) + } else { + Ok(vec![]) + } +} + +/// Fetches table options from the context +pub async fn fetch_table_options<'a, DB, E>(e: E, sql: &'a str) -> Result> +where + DB: Database, + >::Arguments: IntoArguments<'a, DB>, + for<'c> E: 'a + Executor<'c, Database = DB>, + for<'c> String: Decode<'c, DB> + Type, + for<'c> String: Encode<'c, DB> + Type, + usize: ColumnIndex<::Row>, +{ + let fetched_rows = sqlx::query(sql) + .fetch_all(e) + .await + .context(error::ExecuteQuerySnafu { sql })?; + ensure!( + fetched_rows.len() == 1, + error::AssertSnafu { + reason: format!( + "Expected fetched row length: 1, got: {}", + fetched_rows.len(), + ) + } + ); + + let row = fetched_rows.first().unwrap(); + let show_create = row.try_get::(1).unwrap(); + parse_show_create(&show_create) +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use common_base::readable_size::ReadableSize; + use common_time::Duration; + + use super::*; + use crate::ir::alter_expr::Ttl; + use crate::ir::AlterTableOption; + + #[test] + fn test_parse_show_create() { + let show_create = "CREATE TABLE IF NOT EXISTS `json` (`ts` TIMESTAMP(3) NOT NULL, `j` JSON NULL, TIME INDEX (`ts`)) ENGINE=mito WITH(compaction.twcs.max_output_file_size = '1M', compaction.type = 'twcs', ttl = '1day')"; + let options = parse_show_create(show_create).unwrap(); + assert_eq!(options.len(), 2); + assert_eq!( + options[0], + AlterTableOption::TwcsMaxOutputFileSize(ReadableSize::from_str("1MB").unwrap()) + ); + assert_eq!( + options[1], + AlterTableOption::Ttl(Ttl::Duration(Duration::new_second(24 * 60 * 60))) + ); + } +} diff --git a/tests-fuzz/targets/fuzz_alter_logical_table.rs b/tests-fuzz/targets/ddl/fuzz_alter_logical_table.rs similarity index 100% rename from tests-fuzz/targets/fuzz_alter_logical_table.rs rename to 
tests-fuzz/targets/ddl/fuzz_alter_logical_table.rs diff --git a/tests-fuzz/targets/fuzz_alter_table.rs b/tests-fuzz/targets/ddl/fuzz_alter_table.rs similarity index 72% rename from tests-fuzz/targets/fuzz_alter_table.rs rename to tests-fuzz/targets/ddl/fuzz_alter_table.rs index 7f2a809c9e14..247d7632eeb5 100644 --- a/tests-fuzz/targets/fuzz_alter_table.rs +++ b/tests-fuzz/targets/ddl/fuzz_alter_table.rs @@ -34,10 +34,13 @@ use tests_fuzz::fake::{ use tests_fuzz::generator::alter_expr::{ AlterExprAddColumnGeneratorBuilder, AlterExprDropColumnGeneratorBuilder, AlterExprModifyDataTypeGeneratorBuilder, AlterExprRenameGeneratorBuilder, + AlterExprSetTableOptionsGeneratorBuilder, AlterExprUnsetTableOptionsGeneratorBuilder, }; use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder; use tests_fuzz::generator::Generator; -use tests_fuzz::ir::{droppable_columns, modifiable_columns, AlterTableExpr, CreateTableExpr}; +use tests_fuzz::ir::{ + droppable_columns, modifiable_columns, AlterTableExpr, AlterTableOption, CreateTableExpr, +}; use tests_fuzz::translator::mysql::alter_expr::AlterTableExprTranslator; use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator; use tests_fuzz::translator::DslTranslator; @@ -62,11 +65,13 @@ struct FuzzInput { } #[derive(Debug, EnumIter)] -enum AlterTableOption { +enum AlterTableKind { AddColumn, DropColumn, RenameTable, ModifyDataType, + SetTableOptions, + UnsetTableOptions, } fn generate_create_table_expr(rng: &mut R) -> Result { @@ -93,23 +98,23 @@ fn generate_alter_table_expr( table_ctx: TableContextRef, rng: &mut R, ) -> Result { - let options = AlterTableOption::iter().collect::>(); - match options[rng.gen_range(0..options.len())] { - AlterTableOption::DropColumn if !droppable_columns(&table_ctx.columns).is_empty() => { + let kinds = AlterTableKind::iter().collect::>(); + match kinds[rng.gen_range(0..kinds.len())] { + AlterTableKind::DropColumn if !droppable_columns(&table_ctx.columns).is_empty() => { AlterExprDropColumnGeneratorBuilder::default() .table_ctx(table_ctx) .build() .unwrap() .generate(rng) } - AlterTableOption::ModifyDataType if !modifiable_columns(&table_ctx.columns).is_empty() => { + AlterTableKind::ModifyDataType if !modifiable_columns(&table_ctx.columns).is_empty() => { AlterExprModifyDataTypeGeneratorBuilder::default() .table_ctx(table_ctx) .build() .unwrap() .generate(rng) } - AlterTableOption::RenameTable => AlterExprRenameGeneratorBuilder::default() + AlterTableKind::RenameTable => AlterExprRenameGeneratorBuilder::default() .table_ctx(table_ctx) .name_generator(Box::new(MappedGenerator::new( WordGenerator, @@ -118,6 +123,20 @@ fn generate_alter_table_expr( .build() .unwrap() .generate(rng), + AlterTableKind::SetTableOptions => { + let expr_generator = AlterExprSetTableOptionsGeneratorBuilder::default() + .table_ctx(table_ctx) + .build() + .unwrap(); + expr_generator.generate(rng) + } + AlterTableKind::UnsetTableOptions => { + let expr_generator = AlterExprUnsetTableOptionsGeneratorBuilder::default() + .table_ctx(table_ctx) + .build() + .unwrap(); + expr_generator.generate(rng) + } _ => { let location = rng.gen_bool(0.5); let expr_generator = AlterExprAddColumnGeneratorBuilder::default() @@ -179,6 +198,31 @@ async fn execute_alter_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> { let mut columns = table_ctx.columns.clone(); columns.sort_by(|a, b| a.name.value.cmp(&b.name.value)); validator::column::assert_eq(&column_entries, &columns)?; + + // Validates table options + let sql = format!("SHOW CREATE 
TABLE {}", table_ctx.name); + let mut table_options = validator::table::fetch_table_options(&ctx.greptime, &sql).await?; + table_options.sort_by(|a, b| a.key().cmp(b.key())); + let mut expected_table_options = table_ctx.table_options.clone(); + expected_table_options.sort_by(|a, b| a.key().cmp(b.key())); + table_options + .iter() + .zip(expected_table_options.iter()) + .for_each(|(a, b)| { + if let ( + AlterTableOption::TwcsMaxOutputFileSize(a), + AlterTableOption::TwcsMaxOutputFileSize(b), + ) = (a, b) + { + // to_string loses precision for ReadableSize, so the size in generated SQL is not the same as the size in the table context, + // but the string representation should be the same. For example: + // to_string() from_str() + // ReadableSize(13001360408898724524) ------------> "11547.5PiB" -----------> ReadableSize(13001329174265200640) + assert_eq!(a.to_string(), b.to_string()); + } else { + assert_eq!(a, b); + } + }); } // Cleans up diff --git a/tests-fuzz/targets/fuzz_create_database.rs b/tests-fuzz/targets/ddl/fuzz_create_database.rs similarity index 100% rename from tests-fuzz/targets/fuzz_create_database.rs rename to tests-fuzz/targets/ddl/fuzz_create_database.rs diff --git a/tests-fuzz/targets/fuzz_create_logical_table.rs b/tests-fuzz/targets/ddl/fuzz_create_logical_table.rs similarity index 100% rename from tests-fuzz/targets/fuzz_create_logical_table.rs rename to tests-fuzz/targets/ddl/fuzz_create_logical_table.rs diff --git a/tests-fuzz/targets/fuzz_create_table.rs b/tests-fuzz/targets/ddl/fuzz_create_table.rs similarity index 100% rename from tests-fuzz/targets/fuzz_create_table.rs rename to tests-fuzz/targets/ddl/fuzz_create_table.rs From 2137c53274d162f4a4131ca0d9b1d5a7bb9f155b Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 12 Dec 2024 12:45:40 +0800 Subject: [PATCH 33/36] feat(index): add `file_size_hint` for remote blob reader (#5147) feat(index): add file_size_hint for remote blob reader --- src/common/base/src/range_read.rs | 17 +++++++++++++++ src/mito2/src/sst/file.rs | 20 ++++++++++++++++++ .../src/sst/index/inverted_index/applier.rs | 17 +++++++++------ .../src/sst/index/inverted_index/creator.rs | 2 +- src/mito2/src/sst/index/store.rs | 21 +++++++++++++++---- src/mito2/src/sst/parquet/reader.rs | 7 +++++-- src/puffin/src/partial_reader/async.rs | 4 ++++ src/puffin/src/puffin_manager.rs | 3 ++- .../fs_puffin_manager/reader.rs | 21 ++++++++++++++++++- 9 files changed, 97 insertions(+), 15 deletions(-) diff --git a/src/common/base/src/range_read.rs b/src/common/base/src/range_read.rs index 91f865d17ef6..61f28cb629fd 100644 --- a/src/common/base/src/range_read.rs +++ b/src/common/base/src/range_read.rs @@ -36,6 +36,11 @@ pub struct Metadata { /// `RangeReader` reads a range of bytes from a source. #[async_trait] pub trait RangeReader: Send + Unpin { + /// Sets the file size hint for the reader. + /// + /// It's used to optimize the reading process by reducing the number of remote requests. + fn with_file_size_hint(&mut self, file_size_hint: u64); + /// Returns the metadata of the source. 
async fn metadata(&mut self) -> io::Result; @@ -70,6 +75,10 @@ pub trait RangeReader: Send + Unpin { #[async_trait] impl RangeReader for &mut R { + fn with_file_size_hint(&mut self, file_size_hint: u64) { + (*self).with_file_size_hint(file_size_hint) + } + async fn metadata(&mut self) -> io::Result { (*self).metadata().await } @@ -186,6 +195,10 @@ impl AsyncRead for AsyncReadAdapter { #[async_trait] impl RangeReader for Vec { + fn with_file_size_hint(&mut self, _file_size_hint: u64) { + // do nothing + } + async fn metadata(&mut self) -> io::Result { Ok(Metadata { content_length: self.len() as u64, @@ -222,6 +235,10 @@ impl FileReader { #[async_trait] impl RangeReader for FileReader { + fn with_file_size_hint(&mut self, _file_size_hint: u64) { + // do nothing + } + async fn metadata(&mut self) -> io::Result { Ok(Metadata { content_length: self.content_length, diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index 4353ae55e3e9..5a9932ab433b 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -146,13 +146,33 @@ pub enum IndexType { } impl FileMeta { + /// Returns true if the file has an inverted index pub fn inverted_index_available(&self) -> bool { self.available_indexes.contains(&IndexType::InvertedIndex) } + /// Returns true if the file has a fulltext index pub fn fulltext_index_available(&self) -> bool { self.available_indexes.contains(&IndexType::FulltextIndex) } + + /// Returns the size of the inverted index file + pub fn inverted_index_size(&self) -> Option { + if self.available_indexes.len() == 1 && self.inverted_index_available() { + Some(self.index_file_size) + } else { + None + } + } + + /// Returns the size of the fulltext index file + pub fn fulltext_index_size(&self) -> Option { + if self.available_indexes.len() == 1 && self.fulltext_index_available() { + Some(self.index_file_size) + } else { + None + } + } } /// Handle to a SST file. diff --git a/src/mito2/src/sst/index/inverted_index/applier.rs b/src/mito2/src/sst/index/inverted_index/applier.rs index bf5206ef44be..d060d4bec17b 100644 --- a/src/mito2/src/sst/index/inverted_index/applier.rs +++ b/src/mito2/src/sst/index/inverted_index/applier.rs @@ -113,7 +113,7 @@ impl InvertedIndexApplier { } /// Applies predicates to the provided SST file id and returns the relevant row group ids - pub async fn apply(&self, file_id: FileId) -> Result { + pub async fn apply(&self, file_id: FileId, file_size_hint: Option) -> Result { let _timer = INDEX_APPLY_ELAPSED .with_label_values(&[TYPE_INVERTED_INDEX]) .start_timer(); @@ -129,8 +129,7 @@ impl InvertedIndexApplier { if let Err(err) = other { warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.") } - - self.remote_blob_reader(file_id).await? + self.remote_blob_reader(file_id, file_size_hint).await? } }; @@ -181,16 +180,22 @@ impl InvertedIndexApplier { } /// Creates a blob reader from the remote index file. - async fn remote_blob_reader(&self, file_id: FileId) -> Result { + async fn remote_blob_reader( + &self, + file_id: FileId, + file_size_hint: Option, + ) -> Result { let puffin_manager = self .puffin_manager_factory .build(self.store.clone()) .with_puffin_metadata_cache(self.puffin_metadata_cache.clone()); + let file_path = location::index_file_path(&self.region_dir, file_id); puffin_manager .reader(&file_path) .await .context(PuffinBuildReaderSnafu)? + .with_file_size_hint(file_size_hint) .blob(INDEX_BLOB_TYPE) .await .context(PuffinReadBlobSnafu)? 
@@ -250,7 +255,7 @@ mod tests { Box::new(mock_index_applier), puffin_manager_factory, ); - let output = sst_index_applier.apply(file_id).await.unwrap(); + let output = sst_index_applier.apply(file_id, None).await.unwrap(); assert_eq!( output, ApplyOutput { @@ -290,7 +295,7 @@ mod tests { Box::new(mock_index_applier), puffin_manager_factory, ); - let res = sst_index_applier.apply(file_id).await; + let res = sst_index_applier.apply(file_id, None).await; assert!(format!("{:?}", res.unwrap_err()).contains("Blob not found")); } } diff --git a/src/mito2/src/sst/index/inverted_index/creator.rs b/src/mito2/src/sst/index/inverted_index/creator.rs index 029a0da8484f..43cf54fa2811 100644 --- a/src/mito2/src/sst/index/inverted_index/creator.rs +++ b/src/mito2/src/sst/index/inverted_index/creator.rs @@ -464,7 +464,7 @@ mod tests { .unwrap(); Box::pin(async move { applier - .apply(sst_file_id) + .apply(sst_file_id, None) .await .unwrap() .matched_segment_ids diff --git a/src/mito2/src/sst/index/store.rs b/src/mito2/src/sst/index/store.rs index 2750c69fc249..7322bd4db496 100644 --- a/src/mito2/src/sst/index/store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -68,6 +68,7 @@ impl InstrumentedStore { path: path.to_string(), read_byte_count, read_count, + file_size_hint: None, }) } @@ -262,15 +263,27 @@ pub(crate) struct InstrumentedRangeReader<'a> { path: String, read_byte_count: &'a IntCounter, read_count: &'a IntCounter, + file_size_hint: Option, } #[async_trait] impl RangeReader for InstrumentedRangeReader<'_> { + fn with_file_size_hint(&mut self, file_size_hint: u64) { + self.file_size_hint = Some(file_size_hint); + } + async fn metadata(&mut self) -> io::Result { - let stat = self.store.stat(&self.path).await?; - Ok(Metadata { - content_length: stat.content_length(), - }) + match self.file_size_hint { + Some(file_size_hint) => Ok(Metadata { + content_length: file_size_hint, + }), + None => { + let stat = self.store.stat(&self.path).await?; + Ok(Metadata { + content_length: stat.content_length(), + }) + } + } } async fn read(&mut self, range: Range) -> io::Result { diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index b73026a7a6e3..02c5c2cf3cba 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -475,8 +475,11 @@ impl ParquetReaderBuilder { if !self.file_handle.meta_ref().inverted_index_available() { return false; } - - let apply_output = match index_applier.apply(self.file_handle.file_id()).await { + let file_size_hint = self.file_handle.meta_ref().inverted_index_size(); + let apply_output = match index_applier + .apply(self.file_handle.file_id(), file_size_hint) + .await + { Ok(output) => output, Err(err) => { if cfg!(any(test, feature = "test")) { diff --git a/src/puffin/src/partial_reader/async.rs b/src/puffin/src/partial_reader/async.rs index 3de40cb3a190..4eedd1ee31f5 100644 --- a/src/puffin/src/partial_reader/async.rs +++ b/src/puffin/src/partial_reader/async.rs @@ -23,6 +23,10 @@ use crate::partial_reader::PartialReader; #[async_trait] impl RangeReader for PartialReader { + fn with_file_size_hint(&mut self, _file_size_hint: u64) { + // do nothing + } + async fn metadata(&mut self) -> io::Result { Ok(Metadata { content_length: self.size, diff --git a/src/puffin/src/puffin_manager.rs b/src/puffin/src/puffin_manager.rs index 17101b1662e8..204bc2c66e2e 100644 --- a/src/puffin/src/puffin_manager.rs +++ b/src/puffin/src/puffin_manager.rs @@ -73,11 +73,12 @@ pub struct PutOptions { /// The `PuffinReader` trait provides 
methods for reading blobs and directories from a Puffin file. #[async_trait] -#[auto_impl::auto_impl(Arc)] pub trait PuffinReader { type Blob: BlobGuard; type Dir: DirGuard; + fn with_file_size_hint(self, file_size_hint: Option) -> Self; + /// Reads a blob from the Puffin file. /// /// The returned `BlobGuard` is used to access the blob data. diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs index 2e1ae594adc6..a5da2f75f858 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs @@ -43,6 +43,9 @@ pub struct FsPuffinReader { /// The name of the puffin file. puffin_file_name: String, + /// The file size hint. + file_size_hint: Option, + /// The stager. stager: S, @@ -62,6 +65,7 @@ impl FsPuffinReader { ) -> Self { Self { puffin_file_name, + file_size_hint: None, stager, puffin_file_accessor, puffin_file_metadata_cache, @@ -78,11 +82,19 @@ where type Blob = Either, S::Blob>; type Dir = S::Dir; + fn with_file_size_hint(mut self, file_size_hint: Option) -> Self { + self.file_size_hint = file_size_hint; + self + } + async fn blob(&self, key: &str) -> Result { - let reader = self + let mut reader = self .puffin_file_accessor .reader(&self.puffin_file_name) .await?; + if let Some(file_size_hint) = self.file_size_hint { + reader.with_file_size_hint(file_size_hint); + } let mut file = PuffinFileReader::new(reader); let metadata = self.get_puffin_file_metadata(&mut file).await?; @@ -303,6 +315,13 @@ where A: RangeReader, B: RangeReader, { + fn with_file_size_hint(&mut self, file_size_hint: u64) { + match self { + Either::L(a) => a.with_file_size_hint(file_size_hint), + Either::R(b) => b.with_file_size_hint(file_size_hint), + } + } + async fn metadata(&mut self) -> io::Result { match self { Either::L(a) => a.metadata().await, From b8a78b78389ae9edd6b3e4a05ee8697ad0c578a3 Mon Sep 17 00:00:00 2001 From: localhost Date: Thu, 12 Dec 2024 17:01:21 +0800 Subject: [PATCH 34/36] chore: decide tag column in log api follow table schema if table exists (#5138) * chore: decide tag column in log api follow table schema if table exists * chore: add more test for greptime_identity pipeline * chore: change pipeline get_table function signature * chore: change identity_pipeline_inner tag_column_names type --- src/frontend/src/instance/log_handler.rs | 15 ++- .../src/etl/transform/transformer/greptime.rs | 117 +++++++++++++++--- src/servers/src/http/event.rs | 13 +- src/servers/src/query_handler.rs | 8 +- 4 files changed, 130 insertions(+), 23 deletions(-) diff --git a/src/frontend/src/instance/log_handler.rs b/src/frontend/src/instance/log_handler.rs index c3422066a387..9ae782c7d4ab 100644 --- a/src/frontend/src/instance/log_handler.rs +++ b/src/frontend/src/instance/log_handler.rs @@ -25,8 +25,9 @@ use servers::error::{ }; use servers::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef}; use servers::query_handler::PipelineHandler; -use session::context::QueryContextRef; +use session::context::{QueryContext, QueryContextRef}; use snafu::ResultExt; +use table::Table; use crate::instance::Instance; @@ -84,6 +85,18 @@ impl PipelineHandler for Instance { .await .context(PipelineSnafu) } + + async fn get_table( + &self, + table: &str, + query_ctx: &QueryContext, + ) -> std::result::Result>, catalog::error::Error> { + let catalog = query_ctx.current_catalog(); + let schema = query_ctx.current_schema(); + self.catalog_manager + .table(catalog, &schema, 
table, None) + .await + } } impl Instance { diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 3b43696b5ab7..5d69a03ea23e 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -15,6 +15,7 @@ pub mod coerce; use std::collections::HashSet; +use std::sync::Arc; use ahash::HashMap; use api::helper::proto_value_type; @@ -367,20 +368,15 @@ fn json_value_to_row( Ok(Row { values: row }) } -/// Identity pipeline for Greptime -/// This pipeline will convert the input JSON array to Greptime Rows -/// 1. The pipeline will add a default timestamp column to the schema -/// 2. The pipeline not resolve NULL value -/// 3. The pipeline assumes that the json format is fixed -/// 4. The pipeline will return an error if the same column datatype is mismatched -/// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. -pub fn identity_pipeline(array: Vec) -> Result { +fn identity_pipeline_inner<'a>( + array: Vec, + tag_column_names: Option>, +) -> Result { let mut rows = Vec::with_capacity(array.len()); - - let mut schema = SchemaInfo::default(); + let mut schema_info = SchemaInfo::default(); for value in array { if let serde_json::Value::Object(map) = value { - let row = json_value_to_row(&mut schema, map)?; + let row = json_value_to_row(&mut schema_info, map)?; rows.push(row); } } @@ -395,7 +391,7 @@ pub fn identity_pipeline(array: Vec) -> Result { let ts = GreptimeValue { value_data: Some(ValueData::TimestampNanosecondValue(ns)), }; - let column_count = schema.schema.len(); + let column_count = schema_info.schema.len(); for row in rows.iter_mut() { let diff = column_count - row.values.len(); for _ in 0..diff { @@ -403,15 +399,49 @@ pub fn identity_pipeline(array: Vec) -> Result { } row.values.push(ts.clone()); } - schema.schema.push(greptime_timestamp_schema); + schema_info.schema.push(greptime_timestamp_schema); + + // set the semantic type of the row key column to Tag + if let Some(tag_column_names) = tag_column_names { + tag_column_names.for_each(|tag_column_name| { + if let Some(index) = schema_info.index.get(tag_column_name) { + schema_info.schema[*index].semantic_type = SemanticType::Tag as i32; + } + }); + } Ok(Rows { - schema: schema.schema, + schema: schema_info.schema, rows, }) } +/// Identity pipeline for Greptime +/// This pipeline will convert the input JSON array to Greptime Rows +/// params table is used to set the semantic type of the row key column to Tag +/// 1. The pipeline will add a default timestamp column to the schema +/// 2. The pipeline not resolve NULL value +/// 3. The pipeline assumes that the json format is fixed +/// 4. The pipeline will return an error if the same column datatype is mismatched +/// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. 
+pub fn identity_pipeline( + array: Vec, + table: Option>, +) -> Result { + match table { + Some(table) => { + let table_info = table.table_info(); + let tag_column_names = table_info.meta.row_key_column_names(); + identity_pipeline_inner(array, Some(tag_column_names)) + } + None => identity_pipeline_inner(array, None::>), + } +} + #[cfg(test)] mod tests { + use api::v1::SemanticType; + + use crate::etl::transform::transformer::greptime::identity_pipeline_inner; use crate::identity_pipeline; #[test] @@ -437,7 +467,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array); + let rows = identity_pipeline(array, None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -465,7 +495,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array); + let rows = identity_pipeline(array, None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -493,7 +523,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array); + let rows = identity_pipeline(array, None); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); @@ -501,5 +531,58 @@ mod tests { assert_eq!(8, rows.rows[0].values.len()); assert_eq!(8, rows.rows[1].values.len()); } + { + let array = vec![ + serde_json::json!({ + "woshinull": null, + "name": "Alice", + "age": 20, + "is_student": true, + "score": 99.5, + "hobbies": "reading", + "address": "Beijing", + }), + serde_json::json!({ + "name": "Bob", + "age": 21, + "is_student": false, + "score": 88.5, + "hobbies": "swimming", + "address": "Shanghai", + "gaga": "gaga" + }), + ]; + let tag_column_names = ["name".to_string(), "address".to_string()]; + let rows = identity_pipeline_inner(array, Some(tag_column_names.iter())); + assert!(rows.is_ok()); + let rows = rows.unwrap(); + assert_eq!(rows.schema.len(), 8); + assert_eq!(rows.rows.len(), 2); + assert_eq!(8, rows.rows[0].values.len()); + assert_eq!(8, rows.rows[1].values.len()); + assert_eq!( + rows.schema + .iter() + .find(|x| x.column_name == "name") + .unwrap() + .semantic_type, + SemanticType::Tag as i32 + ); + assert_eq!( + rows.schema + .iter() + .find(|x| x.column_name == "address") + .unwrap() + .semantic_type, + SemanticType::Tag as i32 + ); + assert_eq!( + rows.schema + .iter() + .filter(|x| x.semantic_type == SemanticType::Tag as i32) + .count(), + 2 + ); + } } } diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 69498c209ab4..5069db51975d 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -46,8 +46,8 @@ use session::context::{Channel, QueryContext, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; use crate::error::{ - DecodeOtlpRequestSnafu, Error, InvalidParameterSnafu, ParseJson5Snafu, ParseJsonSnafu, - PipelineSnafu, Result, UnsupportedContentTypeSnafu, + CatalogSnafu, DecodeOtlpRequestSnafu, Error, InvalidParameterSnafu, ParseJson5Snafu, + ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu, }; use crate::http::extractor::LogTableName; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; @@ -612,10 +612,15 @@ async fn ingest_logs_inner( let mut results = Vec::with_capacity(pipeline_data.len()); let transformed_data: Rows; if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - let rows = pipeline::identity_pipeline(pipeline_data) + let table = state + .get_table(&table_name, &query_ctx) + .await + .context(CatalogSnafu)?; + let rows = pipeline::identity_pipeline(pipeline_data, table) 
.context(PipelineTransformSnafu) .context(PipelineSnafu)?; - transformed_data = rows; + + transformed_data = rows } else { let pipeline = state .get_pipeline(&pipeline_name, version, query_ctx.clone()) diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 58812e9350bc..96a01593a8f1 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -39,7 +39,7 @@ use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequ use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, PipelineWay}; use serde_json::Value; -use session::context::QueryContextRef; +use session::context::{QueryContext, QueryContextRef}; use crate::error::Result; use crate::influxdb::InfluxdbRequest; @@ -164,4 +164,10 @@ pub trait PipelineHandler { version: PipelineVersion, query_ctx: QueryContextRef, ) -> Result>; + + async fn get_table( + &self, + table: &str, + query_ctx: &QueryContext, + ) -> std::result::Result>, catalog::error::Error>; } From fee75a1fadfda2f98a496090158e99e4b93915f4 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Thu, 12 Dec 2024 19:27:22 +0800 Subject: [PATCH 35/36] feat: collect reader metrics from prune reader (#5152) --- src/mito2/src/read/last_row.rs | 14 +++++++++++++- src/mito2/src/read/prune.rs | 16 +++++++++++++--- src/mito2/src/read/scan_util.rs | 5 +++-- src/mito2/src/sst/parquet/reader.rs | 4 ++-- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index 79d035e03271..1e2a6a5844c6 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -27,7 +27,7 @@ use crate::cache::{ use crate::error::Result; use crate::read::{Batch, BatchReader, BoxedBatchReader}; use crate::sst::file::FileId; -use crate::sst::parquet::reader::RowGroupReader; +use crate::sst::parquet::reader::{ReaderMetrics, RowGroupReader}; /// Reader to keep the last row for each time series. /// It assumes that batches from the input reader are @@ -115,6 +115,14 @@ impl RowGroupLastRowCachedReader { } } + /// Gets the underlying reader metrics if uncached. + pub(crate) fn metrics(&self) -> Option<&ReaderMetrics> { + match self { + RowGroupLastRowCachedReader::Hit(_) => None, + RowGroupLastRowCachedReader::Miss(reader) => Some(reader.metrics()), + } + } + /// Creates new Hit variant and updates metrics. fn new_hit(value: Arc) -> Self { selector_result_cache_hit(); @@ -234,6 +242,10 @@ impl RowGroupLastRowReader { }); cache.put_selector_result(self.key, value); } + + fn metrics(&self) -> &ReaderMetrics { + self.reader.metrics() + } } /// Push last row into `yielded_batches`. diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index cb0066e73472..500cd1430242 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -72,11 +72,21 @@ impl PruneReader { self.source = source; } - pub(crate) fn metrics(&mut self) -> &ReaderMetrics { + /// Merge metrics with the inner reader and return the merged metrics. 
+ pub(crate) fn metrics(&self) -> ReaderMetrics { + let mut metrics = self.metrics.clone(); match &self.source { - Source::RowGroup(r) => r.metrics(), - Source::LastRow(_) => &self.metrics, + Source::RowGroup(r) => { + metrics.merge_from(r.metrics()); + } + Source::LastRow(r) => { + if let Some(inner_metrics) = r.metrics() { + metrics.merge_from(inner_metrics); + } + } } + + metrics } pub(crate) async fn next_batch(&mut self) -> Result> { diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index df790d191a4e..0bdf62e77e03 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -181,8 +181,9 @@ pub(crate) fn scan_file_ranges( } yield batch; } - if let Source::PruneReader(mut reader) = source { - reader_metrics.merge_from(reader.metrics()); + if let Source::PruneReader(reader) = source { + let prune_metrics = reader.metrics(); + reader_metrics.merge_from(&prune_metrics); } } diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 02c5c2cf3cba..335b09426eca 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -918,10 +918,10 @@ enum ReaderState { impl ReaderState { /// Returns the metrics of the reader. - fn metrics(&mut self) -> &ReaderMetrics { + fn metrics(&self) -> ReaderMetrics { match self { ReaderState::Readable(reader) => reader.metrics(), - ReaderState::Exhausted(m) => m, + ReaderState::Exhausted(m) => m.clone(), } } } From e8e95267389148fefb8422a61e33bd593a0359c3 Mon Sep 17 00:00:00 2001 From: localhost Date: Thu, 12 Dec 2024 19:47:21 +0800 Subject: [PATCH 36/36] chore: pipeline dryrun api can currently receives pipeline raw content (#5142) * chore: pipeline dryrun api can currently receives pipeline raw content * chore: remove dryrun v1 and add test * chore: change dryrun pipeline api body schema * chore: remove useless struct PipelineInfo * chore: update PipelineDryrunParams doc * chore: increase code readability * chore: add some comment for pipeline dryrun test * Apply suggestions from code review Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com> * chore: format code --------- Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com> --- src/frontend/src/instance/log_handler.rs | 5 + src/pipeline/benches/processor.rs | 2 +- src/pipeline/src/etl.rs | 18 +- src/pipeline/src/manager/pipeline_operator.rs | 5 + src/pipeline/src/manager/table.rs | 2 +- src/pipeline/tests/common.rs | 2 +- src/pipeline/tests/dissect.rs | 2 +- src/pipeline/tests/pipeline.rs | 10 +- src/servers/src/http/event.rs | 142 +++++++--- src/servers/src/query_handler.rs | 3 + tests-integration/tests/http.rs | 253 ++++++++++++------ 11 files changed, 304 insertions(+), 140 deletions(-) diff --git a/src/frontend/src/instance/log_handler.rs b/src/frontend/src/instance/log_handler.rs index 9ae782c7d4ab..2da2d6717d3b 100644 --- a/src/frontend/src/instance/log_handler.rs +++ b/src/frontend/src/instance/log_handler.rs @@ -19,6 +19,7 @@ use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; use common_error::ext::BoxedError; +use pipeline::pipeline_operator::PipelineOperator; use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion}; use servers::error::{ AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, PipelineSnafu, Result as ServerResult, @@ -97,6 +98,10 @@ impl PipelineHandler for Instance { .table(catalog, &schema, table, None) .await } + + fn 
build_pipeline(&self, pipeline: &str) -> ServerResult> { + PipelineOperator::build_pipeline(pipeline).context(PipelineSnafu) + } } impl Instance { diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 09462753d892..8cf221af5b10 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -223,7 +223,7 @@ transform: type: uint32 "#; - parse(&Content::Yaml(pipeline_yaml.into())).unwrap() + parse(&Content::Yaml(pipeline_yaml)).unwrap() } fn criterion_benchmark(c: &mut Criterion) { diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 9bd47a899ec6..45feb4b02ff6 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -37,9 +37,9 @@ const PROCESSORS: &str = "processors"; const TRANSFORM: &str = "transform"; const TRANSFORMS: &str = "transforms"; -pub enum Content { - Json(String), - Yaml(String), +pub enum Content<'a> { + Json(&'a str), + Yaml(&'a str), } pub fn parse(input: &Content) -> Result> @@ -379,8 +379,7 @@ transform: - field: field2 type: uint32 "#; - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let mut payload = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut payload).unwrap(); assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); @@ -432,8 +431,7 @@ transform: - field: ts type: timestamp, ns index: time"#; - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_str.into())).unwrap(); + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); let mut payload = pipeline.init_intermediate_state(); pipeline .prepare(serde_json::Value::String(message), &mut payload) @@ -509,8 +507,7 @@ transform: type: uint32 "#; - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let mut payload = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut payload).unwrap(); assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); @@ -554,8 +551,7 @@ transform: index: time "#; - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let schema = pipeline.schemas().clone(); let mut result = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut result).unwrap(); diff --git a/src/pipeline/src/manager/pipeline_operator.rs b/src/pipeline/src/manager/pipeline_operator.rs index 2e838144a483..4f43b89e2e74 100644 --- a/src/pipeline/src/manager/pipeline_operator.rs +++ b/src/pipeline/src/manager/pipeline_operator.rs @@ -243,4 +243,9 @@ impl PipelineOperator { }) .await } + + /// Compile a pipeline. + pub fn build_pipeline(pipeline: &str) -> Result> { + PipelineTable::compile_pipeline(pipeline) + } } diff --git a/src/pipeline/src/manager/table.rs b/src/pipeline/src/manager/table.rs index 7b3719b66707..c2a36c63ec6d 100644 --- a/src/pipeline/src/manager/table.rs +++ b/src/pipeline/src/manager/table.rs @@ -203,7 +203,7 @@ impl PipelineTable { /// Compile a pipeline from a string. 
pub fn compile_pipeline(pipeline: &str) -> Result> { - let yaml_content = Content::Yaml(pipeline.into()); + let yaml_content = Content::Yaml(pipeline); parse::(&yaml_content).context(CompilePipelineSnafu) } diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index aa96d14d5591..d825c91e4cb3 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -19,7 +19,7 @@ use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { let input_value = serde_json::from_str::(input_str).unwrap(); - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); let mut result = pipeline.init_intermediate_state(); diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 7577d58080c7..56386d0e860a 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -270,7 +270,7 @@ transform: let input_value = serde_json::from_str::(input_str).unwrap(); - let yaml_content = pipeline::Content::Yaml(pipeline_yaml.into()); + let yaml_content = pipeline::Content::Yaml(pipeline_yaml); let pipeline: pipeline::Pipeline = pipeline::parse(&yaml_content).expect("failed to parse pipeline"); let mut result = pipeline.init_intermediate_state(); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index e68c7b9e6a6e..de724e1a27d2 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -417,7 +417,7 @@ transform: .map(|(_, d)| GreptimeValue { value_data: d }) .collect::>(); - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); let mut stats = pipeline.init_intermediate_state(); @@ -487,7 +487,7 @@ transform: type: json "#; - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); let mut status = pipeline.init_intermediate_state(); @@ -592,7 +592,7 @@ transform: type: json "#; - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); let mut status = pipeline.init_intermediate_state(); @@ -655,7 +655,7 @@ transform: index: timestamp "#; - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); let mut status = pipeline.init_intermediate_state(); @@ -691,7 +691,7 @@ transform: - message type: string "#; - let yaml_content = Content::Yaml(pipeline_yaml.into()); + let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); let mut status = pipeline.init_intermediate_state(); diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 5069db51975d..b6b520627d66 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -38,7 +38,7 @@ use lazy_static::lazy_static; use loki_api::prost_types::Timestamp; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::PipelineVersion; +use pipeline::{GreptimeTransformer, PipelineVersion}; use prost::Message; use serde::{Deserialize, Serialize}; use serde_json::{Deserializer, Map, 
Value}; @@ -276,39 +276,11 @@ fn transform_ndjson_array_factory( }) } -#[axum_macros::debug_handler] -pub async fn pipeline_dryrun( - State(log_state): State, - Query(query_params): Query, - Extension(mut query_ctx): Extension, - TypedHeader(content_type): TypedHeader, - payload: String, +/// Dryrun pipeline with given data +fn dryrun_pipeline_inner( + value: Vec, + pipeline: &pipeline::Pipeline, ) -> Result { - let handler = log_state.log_handler; - let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu { - reason: "pipeline_name is required", - })?; - - let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?; - - let ignore_errors = query_params.ignore_errors.unwrap_or(false); - - let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?; - - ensure!( - value.len() <= 10, - InvalidParameterSnafu { - reason: "too many rows for dryrun", - } - ); - - query_ctx.set_channel(Channel::Http); - let query_ctx = Arc::new(query_ctx); - - let pipeline = handler - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; - let mut intermediate_state = pipeline.init_intermediate_state(); let mut results = Vec::with_capacity(value.len()); @@ -387,6 +359,110 @@ pub async fn pipeline_dryrun( Ok(Json(result).into_response()) } +/// Dryrun pipeline with given data +/// pipeline_name and pipeline_version to specify pipeline stored in db +/// pipeline to specify pipeline raw content +/// data to specify data +/// data maght be list of string or list of object +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct PipelineDryrunParams { + pub pipeline_name: Option, + pub pipeline_version: Option, + pub pipeline: Option, + pub data: Vec, +} + +/// Check if the payload is valid json +/// Check if the payload contains pipeline or pipeline_name and data +/// Return Some if valid, None if invalid +fn check_pipeline_dryrun_params_valid(payload: &str) -> Option { + match serde_json::from_str::(payload) { + // payload with pipeline or pipeline_name and data is array + Ok(params) if params.pipeline.is_some() || params.pipeline_name.is_some() => Some(params), + // because of the pipeline_name or pipeline is required + Ok(_) => None, + // invalid json + Err(_) => None, + } +} + +/// Check if the pipeline_name exists +fn check_pipeline_name_exists(pipeline_name: Option) -> Result { + pipeline_name.context(InvalidParameterSnafu { + reason: "pipeline_name is required", + }) +} + +/// Check if the data length less than 10 +fn check_data_valid(data_len: usize) -> Result<()> { + ensure!( + data_len <= 10, + InvalidParameterSnafu { + reason: "data is required", + } + ); + Ok(()) +} + +#[axum_macros::debug_handler] +pub async fn pipeline_dryrun( + State(log_state): State, + Query(query_params): Query, + Extension(mut query_ctx): Extension, + TypedHeader(content_type): TypedHeader, + payload: String, +) -> Result { + let handler = log_state.log_handler; + + match check_pipeline_dryrun_params_valid(&payload) { + Some(params) => { + let data = params.data; + + check_data_valid(data.len())?; + + match params.pipeline { + None => { + let version = + to_pipeline_version(params.pipeline_version).context(PipelineSnafu)?; + let pipeline_name = check_pipeline_name_exists(params.pipeline_name)?; + let pipeline = handler + .get_pipeline(&pipeline_name, version, Arc::new(query_ctx)) + .await?; + dryrun_pipeline_inner(data, &pipeline) + } + Some(pipeline) => { + let pipeline = handler.build_pipeline(&pipeline)?; + dryrun_pipeline_inner(data, 
&pipeline) + } + } + } + None => { + // This path is for back compatibility with the previous dry run code + // where the payload is just data (JSON or plain text) and the pipeline name + // is specified using query param. + let pipeline_name = check_pipeline_name_exists(query_params.pipeline_name)?; + + let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?; + + let ignore_errors = query_params.ignore_errors.unwrap_or(false); + + let value = + extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?; + + check_data_valid(value.len())?; + + query_ctx.set_channel(Channel::Http); + let query_ctx = Arc::new(query_ctx); + + let pipeline = handler + .get_pipeline(&pipeline_name, version, query_ctx.clone()) + .await?; + + dryrun_pipeline_inner(value, &pipeline) + } + } +} + #[axum_macros::debug_handler] pub async fn loki_ingest( State(log_state): State, diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 96a01593a8f1..ff92d3c5d15b 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -170,4 +170,7 @@ pub trait PipelineHandler { table: &str, query_ctx: &QueryContext, ) -> std::result::Result>, catalog::error::Error>; + + //// Build a pipeline from a string. + fn build_pipeline(&self, pipeline: &str) -> Result>; } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 5a48fef39e43..ab2ec4ea6777 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1319,7 +1319,7 @@ pub async fn test_test_pipeline_api(store_type: StorageType) { // handshake let client = TestClient::new(app); - let body = r#" + let pipeline_content = r#" processors: - date: field: time @@ -1346,7 +1346,7 @@ transform: let res = client .post("/v1/events/pipelines/test") .header("Content-Type", "application/x-yaml") - .body(body) + .body(pipeline_content) .send() .await; @@ -1367,113 +1367,192 @@ transform: let pipeline = pipelines.first().unwrap(); assert_eq!(pipeline.get("name").unwrap(), "test"); - // 2. 
write data - let data_body = r#" + let dryrun_schema = json!([ + { + "colume_type": "FIELD", + "data_type": "INT32", + "fulltext": false, + "name": "id1" + }, + { + "colume_type": "FIELD", + "data_type": "INT32", + "fulltext": false, + "name": "id2" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "type" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "log" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "logger" + }, + { + "colume_type": "TIMESTAMP", + "data_type": "TIMESTAMP_NANOSECOND", + "fulltext": false, + "name": "time" + } + ]); + let dryrun_rows = json!([ [ - { - "id1": "2436", - "id2": "2528", - "logger": "INTERACT.MANAGER", - "type": "I", - "time": "2024-05-25 20:16:37.217", - "log": "ClusterAdapter:enter sendTextDataToCluster\\n" - } - ] - "#; - let res = client - .post("/v1/events/pipelines/dryrun?pipeline_name=test") - .header("Content-Type", "application/json") - .body(data_body) - .send() - .await; - assert_eq!(res.status(), StatusCode::OK); - let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; - assert_eq!( - schema, - &json!([ { - "colume_type": "FIELD", "data_type": "INT32", - "fulltext": false, - "name": "id1" + "key": "id1", + "semantic_type": "FIELD", + "value": 2436 }, { - "colume_type": "FIELD", "data_type": "INT32", - "fulltext": false, - "name": "id2" + "key": "id2", + "semantic_type": "FIELD", + "value": 2528 }, { - "colume_type": "FIELD", "data_type": "STRING", - "fulltext": false, - "name": "type" + "key": "type", + "semantic_type": "FIELD", + "value": "I" }, { - "colume_type": "FIELD", "data_type": "STRING", - "fulltext": false, - "name": "log" + "key": "log", + "semantic_type": "FIELD", + "value": "ClusterAdapter:enter sendTextDataToCluster\\n" }, { - "colume_type": "FIELD", "data_type": "STRING", - "fulltext": false, - "name": "logger" + "key": "logger", + "semantic_type": "FIELD", + "value": "INTERACT.MANAGER" }, { - "colume_type": "TIMESTAMP", "data_type": "TIMESTAMP_NANOSECOND", - "fulltext": false, - "name": "time" + "key": "time", + "semantic_type": "TIMESTAMP", + "value": "2024-05-25 20:16:37.217+0000" } - ]) - ); - assert_eq!( - rows, - &json!([ - [ - { - "data_type": "INT32", - "key": "id1", - "semantic_type": "FIELD", - "value": 2436 - }, - { - "data_type": "INT32", - "key": "id2", - "semantic_type": "FIELD", - "value": 2528 - }, - { - "data_type": "STRING", - "key": "type", - "semantic_type": "FIELD", - "value": "I" - }, - { - "data_type": "STRING", - "key": "log", - "semantic_type": "FIELD", - "value": "ClusterAdapter:enter sendTextDataToCluster\\n" - }, - { - "data_type": "STRING", - "key": "logger", - "semantic_type": "FIELD", - "value": "INTERACT.MANAGER" - }, + ] + ]); + { + // test original api + let data_body = r#" + [ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "I", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } + ] + "#; + let res = client + .post("/v1/events/pipelines/dryrun?pipeline_name=test") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let schema = &body["schema"]; + let rows = &body["rows"]; + assert_eq!(schema, &dryrun_schema); + assert_eq!(rows, &dryrun_rows); + } + { + // test new api specify pipeline via pipeline_name + let body = r#" + { + 
"pipeline_name": "test", + "data": [ { - "data_type": "TIMESTAMP_NANOSECOND", - "key": "time", - "semantic_type": "TIMESTAMP", - "value": "2024-05-25 20:16:37.217+0000" + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "I", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" } ] - ]) - ); + } + "#; + let res = client + .post("/v1/events/pipelines/dryrun") + .header("Content-Type", "application/json") + .body(body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let schema = &body["schema"]; + let rows = &body["rows"]; + assert_eq!(schema, &dryrun_schema); + assert_eq!(rows, &dryrun_rows); + } + { + // test new api specify pipeline via pipeline raw data + let mut body = json!({ + "data": [ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "I", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } + ] + }); + body["pipeline"] = json!(pipeline_content); + let res = client + .post("/v1/events/pipelines/dryrun") + .header("Content-Type", "application/json") + .body(body.to_string()) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let schema = &body["schema"]; + let rows = &body["rows"]; + assert_eq!(schema, &dryrun_schema); + assert_eq!(rows, &dryrun_rows); + } + { + // failback to old version api + // not pipeline and pipeline_name in the body + let body = json!({ + "data": [ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "I", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } + ] + }); + let res = client + .post("/v1/events/pipelines/dryrun") + .header("Content-Type", "application/json") + .body(body.to_string()) + .send() + .await; + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + } guard.remove_all().await; }