diff --git a/infra/prod-aws/.terraform.lock.hcl b/infra/prod-aws/.terraform.lock.hcl
new file mode 100644
index 00000000..c3578aee
--- /dev/null
+++ b/infra/prod-aws/.terraform.lock.hcl
@@ -0,0 +1,44 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/aws" {
+  version     = "5.43.0"
+  constraints = "~> 5.0"
+  hashes = [
+    "h1:g+aulJVHZfXjrC06odZcQPCkNZqD2jiJGsxGnh34Tmw=",
+    "zh:07fb2abb9cf4d2042b41b2b2c642d4c4bd2feccbd856cd7040a7d15158fed478",
+    "zh:1373339e796d8d8473c267c0ecddb701559fce454c2cdd192cf8b0eadf759b48",
+    "zh:1644b4e0fd2e0b28d465bb5cf08b1f594a623324d176e879e5052f78cd2ea8cb",
+    "zh:385943b8d4170c5269b8e13e876636b7edc0ad2576edc7eb5d81cd4286a461d8",
+    "zh:48cf103f4fa866b67b686e8c085ac15264d6f020b6ad4a90f496b7283d31faa6",
+    "zh:4a4c4b4236542089d1bdb688c248e0b7c941ce42887da87e487bfb15038dcaf9",
+    "zh:5d84f3e12100bdd62a8c295b56358b82afc130642dca80d104bd868fdc28ed7c",
+    "zh:68294a601ce588a8838bcf4e136bb5ed8d2b1ee410f8871d88e35ce4861cf33f",
+    "zh:7ae1af6e9b95bd6c33dd0922216ac2b59f2f5b22fedbeab1db7a80b2f4358919",
+    "zh:89c718d41b2eeeaefd1acdbd839f1326a8c866bd49752648b0b32d3dd4a38163",
+    "zh:96e54ccb0f5ddf60465edf5c9f46e64f7d2f392507b851f102723797b4a15d09",
+    "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
+    "zh:b102ce204ebbbf32d68ff47b5224eeb60873bef5b58a7fd7790f6b4020801578",
+    "zh:cae4cb16d15ac4b15c8de5bc9dddc2032583e12c4f31e23b3a7ef22da60657dc",
+    "zh:fecbcbd63111c9518de261bcb37482cb06ee149e7298f567d45b2a55674faa75",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/random" {
+  version = "3.6.0"
+  hashes = [
+    "h1:R5Ucn26riKIEijcsiOMBR3uOAjuOMfI1x7XvH4P6B1w=",
+    "zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d",
+    "zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211",
+    "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829",
+    "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d",
+    "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17",
+    "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21",
+    "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839",
+    "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0",
+    "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c",
+    "zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e",
+  ]
+}
diff --git a/infra/prod-aws/README.txt b/infra/prod-aws/README.txt
new file mode 100644
index 00000000..1c06c482
--- /dev/null
+++ b/infra/prod-aws/README.txt
@@ -0,0 +1,36 @@
+README
+======
+
+This directory contains Terraform files that stand up raw-data services
+on AWS. The following components are managed:
+
+1. module.vpc - Networking components, subnets, gateways, routes
+2. module.db - AWS RDS Aurora serverless database with bells and whistles.
+3. resource.aws_ecs_cluster - An ECS cluster to contain the services
+4. module.alb - Application load balancer for the API
+5. module.alb-flower - Application load balancer for the monitoring service.
+6. module.ecs-api - API containers
+7. module.ecs-worker-daemon - Containers for daemon workers
+8. module.ecs-worker-ondemand - Containers for on-demand workers
+9. module.ecs-flower - Containers for flower: the queue monitoring service.
+10. redis resources - Managed by AWS ElastiCache and associated services.
+11. An EC2 instance (VM) running the backend service.
+12. An EC2 instance for SSH tunnels (jump-host)
+13. Route53 resources for DNS mapping for the load balancers.
+
+RUNNING TERRAFORM
+=================
+
+TBD
+
+Variables
+---------
+
+- TBD
+
+TODO
+====
+
+- Remove OSM app credentials from worker containers
+- Event-driven scaling for workers
+- ...
diff --git a/infra/prod-aws/ami_lookup.tf b/infra/prod-aws/ami_lookup.tf
new file mode 100644
index 00000000..f28925f2
--- /dev/null
+++ b/infra/prod-aws/ami_lookup.tf
@@ -0,0 +1,87 @@
+# Most recent Debian 12 (bookworm) HVM AMI for x86_64, restricted to the Debian owner account.
+data "aws_ami" "debian_bookworm_x86" {
+  most_recent = true
+
+  filter {
+    name   = "name"
+    values = ["debian-12-amd64-*"]
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  filter {
+    name   = "architecture"
+    values = ["x86_64"]
+  }
+
+  owners = ["136693071363"] # Debian
+}
+
+# Most recent Debian 12 (bookworm) HVM AMI for arm64, restricted to the Debian owner account.
+data "aws_ami" "debian_bookworm_arm" {
+  most_recent = true
+
+  filter {
+    name   = "name"
+    values = ["debian-12-arm64-*"]
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  filter {
+    name   = "architecture"
+    values = ["arm64"]
+  }
+
+  owners = ["136693071363"] # Debian
+}
+
+# Most recent Debian 11 (bullseye) HVM AMI for x86_64.
+data "aws_ami" "debian_bullseye_x86" {
+  most_recent = true
+
+  filter {
+    name   = "name"
+    values = ["debian-11-amd64-*"]
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  filter {
+    name   = "architecture"
+    values = ["x86_64"]
+  }
+
+  owners = ["903794441882"] # Debian -- NOTE(review): not the owner ID used for bookworm; confirm it is correct
+}
+
+# Most recent Debian 11 (bullseye) HVM AMI for arm64.
+data "aws_ami" "debian_bullseye_arm" {
+  most_recent = true
+
+  filter {
+    name   = "name"
+    values = ["debian-11-arm64-*"]
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+
+  filter {
+    name   = "architecture"
+    values = ["arm64"]
+  }
+
+  owners = ["903794441882"] # Debian -- NOTE(review): not the owner ID used for bookworm; confirm it is correct
+}
diff --git a/infra/prod-aws/main.tf b/infra/prod-aws/main.tf
new file mode 100644
index 00000000..4041f720
--- /dev/null
+++ b/infra/prod-aws/main.tf
@@ -0,0 +1,540 @@
+# CloudWatch log group that every ECS service in this file writes to (7-day retention).
+resource "aws_cloudwatch_log_group" "main" {
+  name              = "raw-data-services"
+  retention_in_days = 7
+}
+
+# Networking for the stack. NOTE(review): the git module sources in this file carry no ?ref= pin.
+module "vpc" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-vpc.git"
+
+  project_meta = var.project_meta
+
+  deployment_environment = var.deployment_environment
+  default_tags           = var.default_tags
+}
+
+# Database for the services, placed in the VPC's private subnets.
+module "db" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-rds.git"
+
+  project_meta = var.project_meta
+  org_meta     = var.org_meta
+
+  vpc_id     = module.vpc.vpc_id
+  subnet_ids = module.vpc.private_subnets
+
+  deployment_environment = var.deployment_environment
+  deletion_protection    = false # NOTE(review): disabled in the prod stack; confirm this is intended
+  default_tags           = var.default_tags
+
+  database = {
+    name            = "rawdata"
+    admin_user      = "datadba"
+    password_length = 48
+    engine_version  = 15
+    port            = 5432
+  }
+
+  backup = {
+    retention_days            = 7
+    skip_final_snapshot       = true # NOTE(review): no final snapshot on destroy; confirm for prod
+    final_snapshot_identifier = "final"
+  }
+}
+
+# ECS cluster that hosts the API, worker and flower services; named after the project.
+resource "aws_ecs_cluster" "main" {
+  name = var.project_meta["name"]
+
+  setting {
+    name  = "containerInsights"
+    value = "enabled"
+  }
+
+  tags = {
+    Name = var.project_meta["name"]
+  }
+}
+
+# Redis URL built from the first ElastiCache node, plus the secrets/env-vars shared by the containers.
+locals {
+  redis_connection_string = join("", [
+    "redis://",
+    aws_elasticache_cluster.main.cache_nodes[0].address,
+    ":",
+    aws_elasticache_cluster.main.cache_nodes[0].port,
+    "/0"
+  ])
+
+  container_secrets = merge(var.container_secrets, { REMOTE_DB = var.remote_db_arn })
+
+  container_envvars = merge(
+    var.container_envvars,
+    {
+      CELERY_BROKER_URL        = local.redis_connection_string
+      CELERY_RESULT_BACKEND    = local.redis_connection_string
+      RATE_LIMITER_STORAGE_URI = local.redis_connection_string
+      BUCKET_NAME              = var.bucket_name
+    }
+  )
+}
+
+# Load balancer in the public subnets for the API containers (app port 8000).
+module "alb" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-alb.git"
+
+  vpc_id              = module.vpc.vpc_id
+  app_port            = 8000
+  acm_tls_cert_domain = "*.${var.DNS_domain}"
+  health_check_path   = "/docs"
+  alb_subnets         = module.vpc.public_subnets
+  alb_name            = "raw-data-services-prod"
+  target_group_name   = "raw-data-api-prod"
+}
+
+# Load balancer in the public subnets for the flower monitoring containers (app port 5555).
+module "alb-flower" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-alb.git"
+
+  vpc_id              = module.vpc.vpc_id
+  app_port            = 5555
+  acm_tls_cert_domain = "*.${var.DNS_domain}"
+  health_check_path   = "/healthcheck"
+  alb_subnets         = module.vpc.public_subnets
+  alb_name            = "raw-data-flower-prod"
+  target_group_name   = "raw-data-flower-prod"
+}
+
+# API service: a single container behind module.alb, with database and Redis security groups attached.
+module "ecs-api" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-ecs.git"
+
+  service_name = "api"
+  aws_vpc_id   = module.vpc.vpc_id
+
+  scaling_target_values = {
+    container_min_count = 1
+    container_max_count = 1
+  }
+
+  ecs_cluster_name = aws_ecs_cluster.main.name
+  ecs_cluster_arn  = aws_ecs_cluster.main.arn
+
+  load_balancer_settings = {
+    enabled                 = true
+    target_group_arn        = module.alb.target_group_arn
+    target_group_arn_suffix = module.alb.target_group_arn_suffix
+    arn_suffix              = module.alb.load_balancer_arn_suffix
+    scaling_request_count   = 1000
+  }
+  task_role_arn = var.task_role_arn
+
+  service_security_groups = [
+    module.alb.load_balancer_app_security_group,
+    module.db.database_security_group_id,
+    aws_security_group.redis.id
+  ]
+
+  container_secrets = merge(local.container_secrets,
+    {
+      SENTRY_DSN     = var.sentry_dsn
+      APP_SECRET_KEY = var.app_secret_key
+    }
+  )
+  container_envvars = local.container_envvars
+
+  service_subnets = module.vpc.private_subnets
+
+  container_capacity = {
+    cpu       = 1024
+    memory_mb = 2048
+  }
+
+  container_settings = {
+    app_port         = 8000
+    cpu_architecture = "X86_64"
+    image_url        = "ghcr.io/hotosm/raw-data-api"
+    image_tag        = var.project_meta["version"]
+    service_name     = "raw-data-api"
+  }
+
+  log_configuration = {
+    logdriver = "awslogs"
+    options = {
+      awslogs-group         = aws_cloudwatch_log_group.main.name
+      awslogs-region        = var.aws_region
+      awslogs-stream-prefix = "api"
+    }
+  }
+
+  default_tags = var.default_tags
+  efs_settings = var.efs_settings
+}
+
+# Celery workers for the raw_daemon queue: 2-4 containers, scaled on CPU and memory.
+module "ecs-worker-daemon" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-ecs.git"
+
+  service_name  = "worker-daemon"
+  task_role_arn = var.task_role_arn
+
+  aws_vpc_id = module.vpc.vpc_id
+  scaling_target_values = {
+    container_min_count = 2
+    container_max_count = 4
+  }
+  scale_by_cpu = {
+    enabled = true
+    cpu_pct = 50
+  }
+  scale_by_memory = {
+    enabled    = true
+    memory_pct = 50
+  }
+  ecs_cluster_name = aws_ecs_cluster.main.name
+  ecs_cluster_arn  = aws_ecs_cluster.main.arn
+
+  service_security_groups = [
+    module.db.database_security_group_id,
+    aws_security_group.redis.id
+  ]
+
+  container_commands = [
+    "celery",
+    "--app", "API.api_worker",
+    "worker",
+    "--loglevel=DEBUG",
+    "--queues=raw_daemon",
+    "--concurrency", "1",
+    "-n", "ondemand_daemon-%h"
+  ]
+  container_secrets = local.container_secrets
+  container_envvars = local.container_envvars
+
+  service_subnets = module.vpc.private_subnets
+
+  container_ephemeral_storage = 100
+  container_capacity = {
+    cpu       = 2048
+    memory_mb = 16384
+  }
+
+  container_settings = {
+    app_port         = 8000
+    cpu_architecture = "X86_64"
+    image_url        = "ghcr.io/hotosm/raw-data-api"
+    image_tag        = var.project_meta["version"]
+    service_name     = "raw-data-worker-daemon"
+  }
+
+  log_configuration = {
+    logdriver = "awslogs"
+    options = {
+      awslogs-group         = aws_cloudwatch_log_group.main.name
+      awslogs-region        = var.aws_region
+      awslogs-stream-prefix = "worker-daemon"
+    }
+  }
+
+  default_tags = var.default_tags
+  efs_settings = var.efs_settings
+}
+
+# Celery workers for the raw_ondemand queue: 1-2 containers, no metric-based scaling configured yet.
+module "ecs-worker-ondemand" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-ecs.git"
+
+  service_name  = "worker-ondemand"
+  task_role_arn = var.task_role_arn
+
+  aws_vpc_id = module.vpc.vpc_id
+  // TBD: SCALE BY QUEUE - EVENT DRIVEN SCALING
+  scaling_target_values = {
+    container_min_count = 1
+    container_max_count = 2
+  }
+
+  ecs_cluster_name = aws_ecs_cluster.main.name
+  ecs_cluster_arn  = aws_ecs_cluster.main.arn
+
+  service_security_groups = [
+    module.db.database_security_group_id,
+    aws_security_group.redis.id
+  ]
+
+  container_commands = [
+    "celery",
+    "--app", "API.api_worker",
+    "worker",
+    "--loglevel=DEBUG",
+    "--queues=raw_ondemand",
+    "--concurrency", "1",
+    "-n", "ondemand_worker-%h"
+  ]
+  container_secrets = local.container_secrets
+  container_envvars = merge(local.container_envvars, { MAX_WORKERS = "2" })
+
+  service_subnets = module.vpc.private_subnets
+
+  container_ephemeral_storage = 150
+  container_capacity = {
+    cpu       = 4096
+    memory_mb = 24576
+  }
+
+  container_settings = {
+    app_port         = 8000
+    cpu_architecture = "X86_64"
+    image_url        = "ghcr.io/hotosm/raw-data-api"
+    image_tag        = var.project_meta["version"]
+    service_name     = "raw-data-worker-ondemand"
+  }
+
+  log_configuration = {
+    logdriver = "awslogs"
+    options = {
+      awslogs-group         = aws_cloudwatch_log_group.main.name
+      awslogs-region        = var.aws_region
+      awslogs-stream-prefix = "worker-ondemand"
+    }
+  }
+
+  default_tags = var.default_tags
+  efs_settings = var.efs_settings
+}
+
+# Flower (Celery monitoring): a single container behind module.alb-flower, using the Redis broker URL.
+module "ecs-flower" {
+  source = "git::https://gitlab.com/eternaltyro/terraform-aws-ecs.git"
+
+  service_name = "worker-monitoring"
+  load_balancer_settings = {
+    enabled                 = true
+    target_group_arn        = module.alb-flower.target_group_arn
+    target_group_arn_suffix = module.alb-flower.target_group_arn_suffix
+    arn_suffix              = module.alb-flower.load_balancer_arn_suffix
+    scaling_request_count   = 100
+  }
+
+  aws_vpc_id = module.vpc.vpc_id
+  scaling_target_values = {
+    container_min_count = 1
+    container_max_count = 1
+  }
+  ecs_cluster_name = aws_ecs_cluster.main.name
+  ecs_cluster_arn  = aws_ecs_cluster.main.arn
+
+  service_security_groups = [
+    module.alb-flower.load_balancer_app_security_group,
+    aws_security_group.redis.id
+  ]
+
+  container_commands = [
+    "celery",
+    "flower"
+  ]
+
+  container_secrets = {
+    DUMMY = var.dummy_arn
+  }
+
+  container_envvars = {
+    FLOWER_PERSISTENT          = "True"
+    FLOWER_STATE_SAVE_INTERVAL = "10000"
+    FLOWER_DB                  = "flower_db"
+    FLOWER_BASIC_AUTH          = var.flower_creds
+    CELERY_BROKER_URL          = local.redis_connection_string
+  }
+
+  service_subnets = module.vpc.private_subnets
+
+  container_settings = {
+    app_port         = 5555
+    cpu_architecture = "X86_64"
+    image_url        = "ghcr.io/hotosm/raw-data-api"
+    image_tag        = var.project_meta["version"]
+    service_name     = "raw-data-monitoring"
+  }
+
+  log_configuration = {
+    logdriver = "awslogs"
+    options = {
+      awslogs-group         = aws_cloudwatch_log_group.main.name
+      awslogs-region        = var.aws_region
+      awslogs-stream-prefix = "flower"
+    }
+  }
+
+  default_tags = var.default_tags
+  efs_settings = var.efs_settings
+}
+
+# Self-referencing security group: members may reach each other on TCP 6379 (Redis).
+resource "aws_security_group" "redis" {
+  name        = "redis_private_access"
+  description = "Attach this to give access to Raw Data redis"
+  vpc_id      = module.vpc.vpc_id
+
+  ingress {
+    description = "Allow connections from self"
+    from_port   = 6379
+    to_port     = 6379
+    protocol    = "tcp"
+    self        = true
+  }
+
+  egress {
+    from_port        = 0
+    to_port          = 0
+    protocol         = "-1"
+    cidr_blocks      = ["0.0.0.0/0"]
+    ipv6_cidr_blocks = ["::/0"]
+  }
+
+  tags = {
+    Name = "Access to elasticache Redis"
+  }
+}
+
+# ElastiCache subnet group spanning the VPC's private subnets.
+resource "aws_elasticache_subnet_group" "private" {
+  name       = var.project_meta["short_name"]
+  subnet_ids = module.vpc.private_subnets
+}
+
+# Single-node Redis cluster used as the Celery broker, result backend and rate-limiter store.
+resource "aws_elasticache_cluster" "main" {
+  cluster_id           = var.project_meta["short_name"]
+  engine               = "redis"
+  node_type            = "cache.m7g.large" // TODO: PARAMETERIZE
+  num_cache_nodes      = 1
+  parameter_group_name = "default.redis7" // TODO: PARAMETERIZE
+  subnet_group_name    = aws_elasticache_subnet_group.private.name
+  security_group_ids   = [aws_security_group.redis.id]
+  network_type         = "dual_stack"
+}
+
+resource "aws_instance" "jump" {
+  # Publicly addressable EC2 jump host in a public subnet, with database and Redis security groups attached
+  ami                         = data.aws_ami.debian_bookworm_x86.id
+  associate_public_ip_address = true
+  instance_type               = "t3.small"
+  key_name                    = var.SSH_key_name
+  vpc_security_group_ids = [
+    module.vpc.default_security_group_id,
+    module.db.database_security_group_id,
+    aws_security_group.redis.id
+  ]
+  subnet_id = element(module.vpc.public_subnets, 1)
+
+  root_block_device {
+    volume_type = "gp3"
+    volume_size = 50
+  }
+
+  metadata_options {
+    http_tokens = "required"
+  }
+
+  tags = {
+    Name = "raw-data-jump"
+  }
+
+  lifecycle {
+    ignore_changes = [
+      ami,
+    ]
+  }
+}
+
+resource "aws_instance" "backend" {
+  # EC2 machine for backend - Storage 100G for stage; 500G for prod
+  ami                         = data.aws_ami.debian_bookworm_x86.id
+  associate_public_ip_address = false
+  instance_type               = "t3.large"
+  key_name                    = var.SSH_key_name
+  vpc_security_group_ids = [
+    module.vpc.default_security_group_id,
+    module.db.database_security_group_id,
+    aws_security_group.redis.id
+  ]
+  subnet_id = element(module.vpc.private_subnets, 1)
+
+  root_block_device {
+    volume_type = "gp3"
+    volume_size = 80
+  }
+
+  ebs_block_device {
+    device_name = "/dev/sdf"
+    volume_type = "gp3"
+    volume_size = 500
+  }
+
+  metadata_options {
+    http_tokens = "required"
+  }
+
+  tags = {
+    Name = "raw-data-backend"
+  }
+
+  lifecycle {
+    ignore_changes = [
+      ami,
+    ]
+  }
+}
+
+# A alias record api-prod.<DNS_domain> -> API load balancer (the resource label still says "stage").
+resource "aws_route53_record" "stage-v4" {
+  zone_id = var.DNS_zone
+  name    = "api-prod.${var.DNS_domain}"
+  type    = "A"
+
+  alias {
+    name                   = module.alb.load_balancer_dns
+    zone_id                = module.alb.load_balancer_dns_zone
+    evaluate_target_health = true
+  }
+}
+
+# AAAA alias record api-prod.<DNS_domain> -> API load balancer (the resource label still says "stage").
+resource "aws_route53_record" "stage-v6" {
+  zone_id = var.DNS_zone
+  name    = "api-prod.${var.DNS_domain}"
+  type    = "AAAA"
+
+  alias {
+    name                   = module.alb.load_balancer_dns
+    zone_id                = module.alb.load_balancer_dns_zone
+    evaluate_target_health = true
+  }
+}
+
+# A alias record flower-prod.<DNS_domain> -> flower load balancer (the resource label still says "stage").
+resource "aws_route53_record" "stage-flower-v4" {
+  zone_id = var.DNS_zone
+  name    = "flower-prod.${var.DNS_domain}"
+  type    = "A"
+
+  alias {
+    name                   = module.alb-flower.load_balancer_dns
+    zone_id                = module.alb-flower.load_balancer_dns_zone
+    evaluate_target_health = true
+  }
+}
+
+# AAAA alias record flower-prod.<DNS_domain> -> flower load balancer (the resource label still says "stage").
+resource "aws_route53_record" "stage-flower-v6" {
+  zone_id = var.DNS_zone
+  name    = "flower-prod.${var.DNS_domain}"
+  type    = "AAAA"
+
+  alias {
+    name                   = module.alb-flower.load_balancer_dns
+    zone_id                = module.alb-flower.load_balancer_dns_zone
+    evaluate_target_health = true
+  }
+}
diff --git a/infra/prod-aws/outputs.tf b/infra/prod-aws/outputs.tf
new file mode 100644
index 00000000..04dd8856
--- /dev/null
+++ b/infra/prod-aws/outputs.tf
@@ -0,0 +1,51 @@
+output "VPC" {
+  description = "VPC ID"
+  value       = module.vpc.vpc_id
+}
+
+output "public_subnets" {
+  description = "List of IDs of public subnets"
+  value       = module.vpc.public_subnets
+}
+
+output "private_subnets" {
+  description = "List of IDs of private subnets"
+  value       = module.vpc.private_subnets
+}
+
+output "IPv4_prefix_list" {
+  description = "ID of prefix list for IPv4 addresses"
+  value       = module.vpc.ipv4_prefix_list_id
+}
+
+output "IPv6_prefix_list" {
+  description = "ID of prefix list for IPv6 addresses"
+  value       = module.vpc.ipv6_prefix_list_id
+}
+
+output "secrets_manager_entries" {
+  description = "List of secrets manager ARNs containing sensitive strings"
+  # The values of var.container_secrets are appended once, flattened, by the second concat() argument.
+  value = concat(
+    [
+      module.db.database_credentials,
+      var.sentry_dsn,
+      var.remote_db_arn,
+      var.app_secret_key
+    ],
+    values(var.container_secrets)
+  )
+}
+
+output "default_security_group_id" {
+  description = "Default Security Group ID for the VPC"
+  value       = module.vpc.default_security_group_id
+}
+
+output "load_balancer_dns" {
+  description = "List of DNS of the Application Load balancers"
+  value = [
+    module.alb.load_balancer_dns,
+    module.alb-flower.load_balancer_dns
+  ]
+}
diff --git a/infra/prod-aws/provider.tf b/infra/prod-aws/provider.tf
new file mode 100644
index 00000000..7c0e8ce1
--- /dev/null
+++ b/infra/prod-aws/provider.tf
@@ -0,0 +1,28 @@
+# Terraform and provider version constraints; state lives in the "raw-data" workspace of the remote backend.
+terraform {
+  required_version = ">= 1.6"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5"
+    }
+  }
+
+  backend "remote" {
+    organization = "hotosm"
+
+    workspaces {
+      name = "raw-data"
+    }
+  }
+}
+
+# AWS provider for the configured region; var.default_tags is applied through provider-level default_tags.
+provider "aws" {
+  region = var.aws_region
+
+  default_tags {
+    tags = var.default_tags
+  }
+}
diff --git a/infra/prod-aws/variables.tf b/infra/prod-aws/variables.tf
new file mode 100644
index 00000000..8c9a6cf2
--- /dev/null
+++ b/infra/prod-aws/variables.tf
@@ -0,0 +1,94 @@
+variable "org_meta" {
+  description = "Org info for secrets manager prefix"
+
+  default = {
+    name       = "hotosm.org"
+    short_name = "hot"
+    url        = "hotosm.org"
+  }
+}
+
+variable "project_meta" {
+  description = "Metadata relating to the project for which the VPC is being created"
+  type        = map(string)
+
+  default = {
+    name       = "raw-data-services"
+    short_name = "raw-data"
+    version    = "1.1.0"
+    url        = "https://raw-data.hotosm.org"
+  }
+}
+
+variable "deployment_environment" {
+  description = "Deployment flavour or variant identified by this name"
+  type        = string
+
+  default = "dev" # NOTE(review): default is "dev" inside the prod-aws stack; confirm it is overridden
+}
+
+variable "default_tags" {
+  description = "Default resource tags to apply to AWS resources"
+  type        = map(string)
+
+  default = {
+    project        = "raw-data-services"
+    maintainer     = "kshitij.sharma@hotosm.org"
+    documentation  = "https://docs.hotosm.org"
+    cost_center    = "raw-data-services"
+    IaC_Management = "Terraform"
+  }
+}
+
+variable "aws_region" {
+  description = "AWS region in which to launch the application"
+  type        = string
+
+  default = "us-east-1"
+}
+
+variable "container_envvars" {
+  description = "Plain-text environment variables to pass to the container"
+  default = {
+    EXPORT_MAX_AREA_SQKM                = "80000"
+    RATE_LIMIT_PER_MIN                  = "50"
+    ENABLE_TILES                        = "true"
+    ENABLE_POLYGON_STATISTICS_ENDPOINTS = "true"
+    POLYGON_STATISTICS_API_URL          = "https://apps.kontur.io/insights-api/graphql"
+    ENABLE_HDX_EXPORTS                  = "true"
+    FILE_UPLOAD_METHOD                  = "s3"
+    USE_DUCK_DB_FOR_CUSTOM_EXPORTS      = "True"
+    ENABLE_CUSTOM_EXPORTS               = "True"
+    POLYGON_STATISTICS_API_RATE_LIMIT   = "60"
+    EXPORT_PATH                         = "/tmp/app-data"
+  }
+}
+
+variable "task_role_arn" {
+  description = "IAM role ARN passed to the ECS service modules as task_role_arn"
+  type        = string
+  default     = "arn:aws:iam::670261699094:role/raw-data-testing" # NOTE(review): "testing" role as the prod default; confirm
+}
+
+# Null defaults
+
+variable "efs_settings" {
+  description = "EFS settings passed through to the ECS service modules"
+  default = {
+    file_system_id     = ""
+    access_point_id    = ""
+    root_directory     = "/"
+    transit_encryption = "ENABLED"
+    iam_authz          = "DISABLED"
+  }
+}
+
+variable "alarm_settings" {
+  description = "Alarm names plus enable and rollback switches"
+  default = {
+    names    = []
+    enable   = false
+    rollback = false
+  }
+}
+