diff --git a/distribution/ecs/README.md b/distribution/ecs/README.md index e201ec04e14..bb6415f482c 100644 --- a/distribution/ecs/README.md +++ b/distribution/ecs/README.md @@ -64,7 +64,7 @@ IAM policies to indexers. We provide an example of self contained deployment with an ad-hoc VPC. > [!IMPORTANT] -> This stack costs ~$150/month to run (Fargate tasks, NAT Gateways +> This stack costs ~$200/month to run (Fargate tasks, NAT Gateways > and RDS) ### Deploy the Quickwit module and connect through a bastion diff --git a/distribution/ecs/quickwit/variables.tf b/distribution/ecs/quickwit/variables.tf index 0a343a50d01..6ff953c8a01 100644 --- a/distribution/ecs/quickwit/variables.tf +++ b/distribution/ecs/quickwit/variables.tf @@ -73,8 +73,8 @@ variable "quickwit_indexer" { description = "Indexer service sizing configurations" type = object({ desired_count = optional(number, 1) - memory = optional(number, 4096) - cpu = optional(number, 1024) + memory = optional(number, 8192) + cpu = optional(number, 2048) ephemeral_storage_gib = optional(number, 21) extra_task_policy_arns = optional(list(string), []) }) @@ -95,7 +95,7 @@ variable "quickwit_searcher" { description = "Searcher service sizing configurations" type = object({ desired_count = optional(number, 1) - memory = optional(number, 2048) + memory = optional(number, 4096) cpu = optional(number, 1024) ephemeral_storage_gib = optional(number, 21) }) diff --git a/docs/deployment/node-sizing.md b/docs/deployment/node-sizing.md new file mode 100644 index 00000000000..19e3f24237b --- /dev/null +++ b/docs/deployment/node-sizing.md @@ -0,0 +1,101 @@ +--- +title: Cluster sizing +sidebar_position: 2 +--- + +In this guide, we discuss how to size your Quickwit cluster and nodes. As shown +in the [architecture section](../overview/architecture.md), a Quickwit cluster +has 5 main components: the Indexers, the Searchers, the Control Plane, the +Metastore and the Janitor. 
Each component has different resource requirements +and can be scaled independently. We will also discuss how to size the metastore +PostgreSQL database. + +:::note + +This guide provides general guidelines. The actual resource requirements depend +strongly on the workload. We recommend monitoring the resource usage and +adjusting the cluster size accordingly. + +::: + +## Quickwit services + +### Indexers + +Here are some high-level guidelines to size your Indexer nodes: +- Quickwit can index at around **7.5MB per second per core** +- For the general use case, configure 4GB of RAM per core + - Workloads with a large number of indexes or data sources consume more RAM + + - Don't use instances with less than 8GB of RAM + +- Mount the data directory to a volume of at least 110GB to store the [split + cache](../configuration/node-config.md#indexer-configuration) and the [ingest + queue](../configuration/node-config.md#ingest-api-configuration). + + +:::note + +To utilize all CPUs on Indexer nodes that have more than 4 cores, your indexing +workload needs to be broken down into multiple indexing pipelines. This can be +achieved by creating multiple indexes or by using a [partitioned data +source](../configuration/source-config.md#number-of-pipelines) such as +[Kafka](../configuration/source-config.md#kafka-source). + + + +::: + + +### Searchers + +Search performance is highly dependent on the workload. For example, term queries +are usually cheaper than aggregations. A good starting point for dimensioning +Searcher nodes: +- Configure 8GB of RAM per core when using a high-latency / low-bandwidth object + store like AWS S3 +- Increase the CPU / RAM ratio (e.g., 4GB/core) when using a faster object store +- Provision more RAM if you expect many concurrent aggregation requests. By + default, each request can use up to 500MB of RAM on each node. 
+- Avoid instances with less than 4GB of RAM + +- Searcher nodes don't use disk unless the [split + cache](../configuration/node-config.md#searcher-split-cache-configuration) is + explicitly enabled + +One strength of Quickwit is that its Searchers are stateless, which makes it +easy to scale them up and down based on the workload. Scale the number of +Searcher nodes based on: +- the number of concurrent requests expected +- aggregations that run on large amounts of data (without + [time](../overview/concepts/querying.md#time-sharding) or + [tag](../overview/concepts/querying.md#tag-pruning) pruning) + +### Other services + +The Control Plane, the Metastore and the Janitor are lightweight components. +Each of these services requires 1 replica. + +The Control Plane only needs a single core and 2GB of RAM. It doesn't require any disk. + +The Metastore also requires a single core and 2GB of RAM. For clusters handling +hundreds of indexes, you might increase the size to 2 cores and 4GB of RAM. It +doesn't write to disk. + +In general, the Janitor requires 1 core and 2GB of RAM and doesn't use the disk. +If you use the [delete API](https://quickwit.io/docs/overview/concepts/deletes), +the Janitor should be dimensioned like an Indexer. + +### Single node deployments + +For experimentation and small-scale POCs, it is possible to deploy all the +services on a single node (see +[tutorial](../get-started/tutorials/tutorial-hdfs-logs.md)). We recommend at +least 2 cores and 8GB of RAM. + +## Postgres Metastore backend + +For most use cases, a Postgres instance with 4GB of RAM and 1 core is +sufficient: +- with the AWS RDS managed service, use the t4g.medium instance type. Enable + multi-AZ with one standby for high availability.