Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

require squad for cloud run services/jobs, and allow alerting based on squad #624

Merged
merged 2 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions modules/alerting/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,17 @@ locals {
]
}

locals {
  # Optional log-filter clause that narrows the alert filters to one squad.
  # It is the empty string when var.squad is unset, so interpolating it into
  # a filter contributes no additional clause.
  squad_log_filter = var.squad != "" ? "labels.squad=\"${var.squad}\"" : ""
}

locals {
# Log filter for failed rollouts: ERROR-severity entries on
# cloud_run_revision resources whose status message contains
# "Ready condition status changed to False" and whose response kind is
# "Revision". The trailing interpolation appends the squad clause from
# local.squad_log_filter, which is empty when no squad is configured.
bad_rollout_filter = <<EOT
resource.type="cloud_run_revision"
severity=ERROR
protoPayload.status.message:"Ready condition status changed to False"
protoPayload.response.kind="Revision"
${local.squad_log_filter}
EOT
}

Expand Down Expand Up @@ -67,6 +72,7 @@ logName: "/logs/run.googleapis.com%2Fvarlog%2Fsystem"
severity=ERROR
textPayload:"Consider increasing the memory limit"
${var.oom_filter}
${local.squad_log_filter}
EOF
}

Expand Down Expand Up @@ -118,6 +124,7 @@ severity=WARNING
textPayload=~"^Container terminated on signal [^01]+\.$"
${var.signal_filter}
-resource.labels.service_name:"-ing-vuln"
${local.squad_log_filter}
EOT
}

Expand Down Expand Up @@ -169,6 +176,7 @@ resource.type="cloud_run_revision" OR resource.type="cloud_run_job"
severity=ERROR
textPayload=~"panic: .*"
${var.panic_filter}
${local.squad_log_filter}
EOF
}

Expand Down Expand Up @@ -217,6 +225,7 @@ locals {
panic_stacktrace_filter = <<EOF
resource.type="cloud_run_revision" OR resource.type="cloud_run_job"
jsonPayload.stacktrace:"runtime.gopanic"
${local.squad_log_filter}
EOF
}

Expand Down Expand Up @@ -265,6 +274,7 @@ locals {
fatal_filter = <<EOF
resource.type="cloud_run_revision" OR resource.type="cloud_run_job"
textPayload:"fatal error: "
${local.squad_log_filter}
EOF
}

Expand Down Expand Up @@ -466,6 +476,7 @@ resource "google_monitoring_alert_policy" "cloud-run-scaling-failure" {
severity=ERROR
textPayload:"The request was aborted because there was no available instance."
${var.scaling_issue_filter}
${local.squad_log_filter}
EOT

label_extractors = {
Expand Down Expand Up @@ -533,6 +544,7 @@ resource "google_monitoring_alert_policy" "cloud-run-failed-req" {
severity=ERROR
textPayload:"The request failed because either the HTTP response was malformed or connection to the instance had an error."
${var.failed_req_filter}
${local.squad_log_filter}
EOT

label_extractors = {
Expand Down Expand Up @@ -611,6 +623,7 @@ resource "google_monitoring_alert_policy" "cloudrun_timeout" {
severity=ERROR
textPayload="The request has been terminated because it has reached the maximum request timeout. To change this limit, see https://cloud.google.com/run/docs/configuring/request-timeout"
${var.timeout_filter}
${local.squad_log_filter}
-resource.labels.service_name:"-ing-vuln"
EOT

Expand Down
6 changes: 6 additions & 0 deletions modules/alerting/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,9 @@ EOT
type = bool
default = false
}

# Squad whose logs the alert policies should be scoped to. When non-empty,
# a `labels.squad="<value>"` clause is appended to the log filters; when
# empty, the filters are left unscoped.
variable "squad" {
  description = "squad to filter on if non-empty"
  type        = string
  default     = ""

  # Treat an explicit null as "unset" (falls back to the default ""). Without
  # this, null fails the `== ""` check and is then interpolated into the
  # filter string, which Terraform rejects.
  nullable = false
}
2 changes: 2 additions & 0 deletions modules/cron/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,12 @@ No modules.
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project that will host the cron job. | `string` | n/a | yes |
| <a name="input_region"></a> [region](#input\_region) | The region to run the job. | `string` | `"us-east4"` | no |
| <a name="input_repository"></a> [repository](#input\_repository) | Container repository to publish images to. | `string` | `""` | no |
| <a name="input_require_squad"></a> [require\_squad](#input\_require\_squad) | Whether to require squad variable to be specified | `bool` | `true` | no |
| <a name="input_schedule"></a> [schedule](#input\_schedule) | The cron schedule on which to run the job. | `any` | n/a | yes |
| <a name="input_scheduled_env_overrides"></a> [scheduled\_env\_overrides](#input\_scheduled\_env\_overrides) | List of env object overrides. | <pre>list(object({<br/> name = string<br/> value = string<br/> }))</pre> | `[]` | no |
| <a name="input_secret_env"></a> [secret\_env](#input\_secret\_env) | A map of secrets to mount as environment variables from Google Secrets Manager (e.g. secret\_key=secret\_name) | `map` | `{}` | no |
| <a name="input_service_account"></a> [service\_account](#input\_service\_account) | The email address of the service account to run the service as, and to invoke the job as. | `string` | n/a | yes |
| <a name="input_squad"></a> [squad](#input\_squad) | squad label to apply to the service. | `string` | `""` | no |
| <a name="input_success_alert_alignment_period_seconds"></a> [success\_alert\_alignment\_period\_seconds](#input\_success\_alert\_alignment\_period\_seconds) | Alignment period for successful completion alert. 0 (default) to not create alert. | `number` | `0` | no |
| <a name="input_task_count"></a> [task\_count](#input\_task\_count) | The number of tasks to run. | `number` | `1` | no |
| <a name="input_timeout"></a> [timeout](#input\_timeout) | The maximum amount of time in seconds to allow the job to run. | `string` | `"600s"` | no |
Expand Down
6 changes: 5 additions & 1 deletion modules/cron/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ resource "google_project_service" "cloudscheduler" {

locals {
  # Image repository to publish to: the caller-supplied value when given,
  # otherwise a gcr.io path built from the project ID and the job name.
  repo = var.repository == "" ? "gcr.io/${var.project_id}/${var.name}" : var.repository

  # Single-entry label map carrying the owning squad; merged into the
  # job's labels alongside var.labels.
  squad_label = {
    squad = var.squad
  }
}

resource "ko_build" "image" {
Expand Down Expand Up @@ -54,13 +57,14 @@ resource "google_cloud_run_v2_job" "job" {

name = "${var.name}-cron"
location = var.region
labels = merge(var.labels, local.squad_label)

deletion_protection = var.deletion_protection

template {
parallelism = var.parallelism
task_count = var.task_count
labels = var.labels
labels = merge(var.labels, local.squad_label)

template {
execution_environment = var.execution_environment
Expand Down
17 changes: 17 additions & 0 deletions modules/cron/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,20 @@ variable "labels" {
type = map(string)
default = {}
}

# Toggle for the validation on var.squad: when true (the default), an empty
# squad is rejected; set to false to opt out of the check.
variable "require_squad" {
  type        = bool
  default     = true
  description = "Whether to require squad variable to be specified"
}

# Owning squad for the cron job. The value is applied as the "squad" label
# on the Cloud Run job (see local.squad_label).
variable "squad" {
  description = "squad label to apply to the service."
  type        = string
  default     = ""

  # An explicit null falls back to the default "" so it cannot slip past the
  # validation below (null != "" would otherwise evaluate to true).
  nullable = false

  validation {
    # Passes when the check is disabled or a non-empty squad was supplied.
    condition     = !var.require_squad || var.squad != ""
    error_message = "squad needs to be specified, or disable the check by setting require_squad = false."
  }
}
2 changes: 2 additions & 0 deletions modules/github-bots/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ No requirements.
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes |
| <a name="input_raw_filter"></a> [raw\_filter](#input\_raw\_filter) | Raw PubSub filter to apply, ignores other variables. https://cloud.google.com/pubsub/docs/subscription-message-filter#filtering_syntax | `string` | `""` | no |
| <a name="input_regions"></a> [regions](#input\_regions) | A map from region names to a network and subnetwork. | <pre>map(object({<br/> network = string<br/> subnet = string<br/> }))</pre> | n/a | yes |
| <a name="input_require_squad"></a> [require\_squad](#input\_require\_squad) | Whether to require squad variable to be specified | `bool` | `true` | no |
| <a name="input_service_account_email"></a> [service\_account\_email](#input\_service\_account\_email) | The email of the service account being authorized to invoke the private Cloud Run service. If empty, a service account will be created and used. | `string` | `""` | no |
| <a name="input_squad"></a> [squad](#input\_squad) | squad label to apply to the service. | `string` | `""` | no |

## Outputs

Expand Down
3 changes: 3 additions & 0 deletions modules/github-bots/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ module "service" {

labels = var.labels

squad = var.squad
require_squad = var.require_squad

service_account = var.service_account_email == "" ? google_service_account.sa[0].email : var.service_account_email

egress = "PRIVATE_RANGES_ONLY" // Makes GitHub API calls
Expand Down
17 changes: 17 additions & 0 deletions modules/github-bots/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,20 @@ variable "labels" {
type = map(string)
default = {}
}

# Toggle for the validation on var.squad: when true (the default), an empty
# squad is rejected; set to false to opt out of the check. The value is also
# forwarded to the underlying service module.
variable "require_squad" {
  type        = bool
  default     = true
  description = "Whether to require squad variable to be specified"
}

# Owning squad for the bot. The value is forwarded to the underlying service
# module as its `squad` input.
variable "squad" {
  description = "squad label to apply to the service."
  type        = string
  default     = ""

  # An explicit null falls back to the default "" so it cannot slip past the
  # validation below (null != "" would otherwise evaluate to true).
  nullable = false

  validation {
    # Passes when the check is disabled or a non-empty squad was supplied.
    condition     = !var.require_squad || var.squad != ""
    error_message = "squad needs to be specified, or disable the check by setting require_squad = false."
  }
}
2 changes: 2 additions & 0 deletions modules/regional-go-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,10 @@ No requirements.
| <a name="input_regional-volumes"></a> [regional-volumes](#input\_regional-volumes) | The volumes to make available to the containers in the service for mounting. | <pre>list(object({<br/> name = string<br/> gcs = optional(map(object({<br/> bucket = string<br/> read_only = optional(bool, true)<br/> })), {})<br/> nfs = optional(map(object({<br/> server = string<br/> path = string<br/> read_only = optional(bool, true)<br/> })), {})<br/> }))</pre> | `[]` | no |
| <a name="input_regions"></a> [regions](#input\_regions) | A map from region names to a network and subnetwork. A service will be created in each region configured to egress the specified traffic via the specified subnetwork. | <pre>map(object({<br/> network = string<br/> subnet = string<br/> }))</pre> | n/a | yes |
| <a name="input_request_timeout_seconds"></a> [request\_timeout\_seconds](#input\_request\_timeout\_seconds) | The timeout for requests to the service, in seconds. | `number` | `300` | no |
| <a name="input_require_squad"></a> [require\_squad](#input\_require\_squad) | Whether to require squad variable to be specified | `bool` | `true` | no |
| <a name="input_scaling"></a> [scaling](#input\_scaling) | The scaling configuration for the service. | <pre>object({<br/> min_instances = optional(number, 0)<br/> max_instances = optional(number, 100)<br/> max_instance_request_concurrency = optional(number)<br/> })</pre> | `{}` | no |
| <a name="input_service_account"></a> [service\_account](#input\_service\_account) | The service account as which to run the service. | `string` | n/a | yes |
| <a name="input_squad"></a> [squad](#input\_squad) | squad label to apply to the service. | `string` | `""` | no |
| <a name="input_volumes"></a> [volumes](#input\_volumes) | The volumes to make available to the containers in the service for mounting. | <pre>list(object({<br/> name = string<br/> empty_dir = optional(object({<br/> medium = optional(string, "MEMORY")<br/> size_limit = optional(string, "2G")<br/> }))<br/> secret = optional(object({<br/> secret = string<br/> items = list(object({<br/> version = string<br/> path = string<br/> }))<br/> }))<br/> }))</pre> | `[]` | no |

## Outputs
Expand Down
2 changes: 2 additions & 0 deletions modules/regional-go-service/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ module "this" {
}

labels = var.labels
squad = var.squad
require_squad = var.require_squad
scaling = var.scaling
volumes = var.volumes
regional-volumes = var.regional-volumes
Expand Down
17 changes: 17 additions & 0 deletions modules/regional-go-service/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,23 @@ variable "labels" {
default = {}
}

# Toggle for the validation on var.squad: when true (the default), an empty
# squad is rejected; set to false to opt out of the check. The value is also
# forwarded to the wrapped service module.
variable "require_squad" {
  type        = bool
  default     = true
  description = "Whether to require squad variable to be specified"
}

# Owning squad for the service. The value is forwarded to the wrapped service
# module as its `squad` input.
variable "squad" {
  description = "squad label to apply to the service."
  type        = string
  default     = ""

  # An explicit null falls back to the default "" so it cannot slip past the
  # validation below (null != "" would otherwise evaluate to true).
  nullable = false

  validation {
    # Passes when the check is disabled or a non-empty squad was supplied.
    condition     = !var.require_squad || var.squad != ""
    error_message = "squad needs to be specified, or disable the check by setting require_squad = false."
  }
}

variable "otel_collector_image" {
type = string
default = "chainguard/opentelemetry-collector-contrib:latest"
Expand Down
2 changes: 2 additions & 0 deletions modules/regional-service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ No modules.
| <a name="input_regional-volumes"></a> [regional-volumes](#input\_regional-volumes) | The volumes to make available to the containers in the service for mounting. | <pre>list(object({<br/> name = string<br/> gcs = optional(map(object({<br/> bucket = string<br/> read_only = optional(bool, true)<br/> })), {})<br/> nfs = optional(map(object({<br/> server = string<br/> path = string<br/> read_only = optional(bool, true)<br/> })), {})<br/> }))</pre> | `[]` | no |
| <a name="input_regions"></a> [regions](#input\_regions) | A map from region names to a network and subnetwork. A service will be created in each region configured to egress the specified traffic via the specified subnetwork. | <pre>map(object({<br/> network = string<br/> subnet = string<br/> }))</pre> | n/a | yes |
| <a name="input_request_timeout_seconds"></a> [request\_timeout\_seconds](#input\_request\_timeout\_seconds) | The timeout for requests to the service, in seconds. | `number` | `300` | no |
| <a name="input_require_squad"></a> [require\_squad](#input\_require\_squad) | Whether to require squad variable to be specified | `bool` | `true` | no |
| <a name="input_scaling"></a> [scaling](#input\_scaling) | The scaling configuration for the service. | <pre>object({<br/> min_instances = optional(number, 0)<br/> max_instances = optional(number, 100)<br/> max_instance_request_concurrency = optional(number)<br/> })</pre> | `{}` | no |
| <a name="input_service_account"></a> [service\_account](#input\_service\_account) | The service account as which to run the service. | `string` | n/a | yes |
| <a name="input_squad"></a> [squad](#input\_squad) | squad label to apply to the service. | `string` | `""` | no |
| <a name="input_volumes"></a> [volumes](#input\_volumes) | The volumes to make available to the containers in the service for mounting. | <pre>list(object({<br/> name = string<br/> empty_dir = optional(object({<br/> medium = optional(string, "MEMORY")<br/> size_limit = optional(string, "2G")<br/> }))<br/> secret = optional(object({<br/> secret = string<br/> items = list(object({<br/> version = string<br/> path = string<br/> }))<br/> }))<br/> }))</pre> | `[]` | no |

## Outputs
Expand Down
8 changes: 6 additions & 2 deletions modules/regional-service/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ locals {
"regional-service" : var.name
}

squad_label = {
"squad" : var.squad
}

main_container_idx = keys(local.has_port)[0]
main_container = local.has_port[local.main_container_idx]
}
Expand All @@ -51,7 +55,7 @@ resource "google_cloud_run_v2_service" "this" {
project = var.project_id
name = var.name
location = each.key
labels = merge(var.labels, local.default_labels)
labels = merge(var.labels, local.default_labels, local.squad_label)
ingress = var.ingress

deletion_protection = var.deletion_protection
Expand All @@ -63,7 +67,7 @@ resource "google_cloud_run_v2_service" "this" {
}
max_instance_request_concurrency = var.scaling.max_instance_request_concurrency
execution_environment = var.execution_environment
labels = merge(var.labels, local.default_labels)
labels = merge(var.labels, local.default_labels, local.squad_label)

vpc_access {
network_interfaces {
Expand Down
17 changes: 17 additions & 0 deletions modules/regional-service/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,23 @@ variable "labels" {
default = {}
}

# Toggle for the validation on var.squad: when true (the default), an empty
# squad is rejected; set to false to opt out of the check.
variable "require_squad" {
  type        = bool
  default     = true
  description = "Whether to require squad variable to be specified"
}

# Owning squad for the service. The value is applied as the "squad" label on
# the Cloud Run service (see local.squad_label).
variable "squad" {
  description = "squad label to apply to the service."
  type        = string
  default     = ""

  # An explicit null falls back to the default "" so it cannot slip past the
  # validation below (null != "" would otherwise evaluate to true).
  nullable = false

  validation {
    # Passes when the check is disabled or a non-empty squad was supplied.
    condition     = !var.require_squad || var.squad != ""
    error_message = "squad needs to be specified, or disable the check by setting require_squad = false."
  }
}

variable "otel_collector_image" {
type = string
default = "chainguard/opentelemetry-collector-contrib:latest"
Expand Down
Loading